* [PATCH] bcache: set max writeback rate when I/O request is idle
@ 2018-07-22 16:13 Coly Li
  2018-07-23  6:38 ` Stefan Priebe - Profihost AG
  2018-07-23 13:14 ` Kai Krakow
  0 siblings, 2 replies; 7+ messages in thread
From: Coly Li @ 2018-07-22 16:13 UTC (permalink / raw)
  To: colyli, linux-bcache; +Cc: linux-block, stable, Michael Lyle

Commit b1092c9af9ed ("bcache: allow quick writeback when backing idle")
allows the writeback rate to be faster if there is no I/O request on a
bcache device. It works well if there is only one bcache device attached
to the cache set. If there are many bcache devices attached to a cache
set, it may introduce a performance regression because multiple faster
writeback threads of the idle bcache devices will compete for the btree
level locks with the bcache device which has I/O requests coming.

This patch fixes the above issue by only permitting fast writeback when
all bcache devices attached to the cache set are idle. If one of the
bcache devices has a new I/O request coming, the writeback throughput of
all devices is minimized immediately, and the PI controller
__update_writeback_rate() decides the upcoming writeback rate for each
bcache device.

Also, when all bcache devices are idle, limiting the writeback rate to a
small number wastes throughput, especially when the backing devices are
slower non-rotational devices (e.g. SATA SSDs). This patch sets a max
writeback rate for each backing device if the whole cache set is idle. A
faster writeback rate in idle time means new I/Os may have more available
space for dirty data, and people may observe better write performance then.

Please note bcache may change its cache mode at run time, and this patch
still works if the cache mode is switched away from writeback mode while
there is still dirty data on the cache.

Fixes: b1092c9af9ed ("bcache: allow quick writeback when backing idle")
Cc: stable@vger.kernel.org #4.16+
Signed-off-by: Coly Li <colyli@suse.de>
Cc: Michael Lyle <mlyle@lyle.org>
---
 drivers/md/bcache/bcache.h    |  9 +---
 drivers/md/bcache/request.c   | 42 ++++++++++++++-
 drivers/md/bcache/sysfs.c     | 14 +++--
 drivers/md/bcache/util.c      |  2 +-
 drivers/md/bcache/util.h      |  2 +-
 drivers/md/bcache/writeback.c | 98 +++++++++++++++++++++++++----------
 6 files changed, 126 insertions(+), 41 deletions(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index d6bf294f3907..f7451e8be03c 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -328,13 +328,6 @@ struct cached_dev {
 	 */
 	atomic_t		has_dirty;
 
-	/*
-	 * Set to zero by things that touch the backing volume-- except
-	 * writeback.  Incremented by writeback.  Used to determine when to
-	 * accelerate idle writeback.
-	 */
-	atomic_t		backing_idle;
-
 	struct bch_ratelimit	writeback_rate;
 	struct delayed_work	writeback_rate_update;
 
@@ -514,6 +507,8 @@ struct cache_set {
 	struct cache_accounting accounting;
 
 	unsigned long		flags;
+	atomic_t		idle_counter;
+	atomic_t		at_max_writeback_rate;
 
 	struct cache_sb		sb;
 
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index ae67f5fa8047..fe45f561a054 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -1104,6 +1104,34 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio)
 
 /* Cached devices - read & write stuff */
 
+static void quit_max_writeback_rate(struct cache_set *c)
+{
+	int i;
+	struct bcache_device *d;
+	struct cached_dev *dc;
+
+	mutex_lock(&bch_register_lock);
+
+	for (i = 0; i < c->devices_max_used; i++) {
+		if (!c->devices[i])
+			continue;
+
+		if (UUID_FLASH_ONLY(&c->uuids[i]))
+			continue;
+
+		d = c->devices[i];
+		dc = container_of(d, struct cached_dev, disk);
+		/*
+		 * set writeback rate to default minimum value,
+		 * then let update_writeback_rate() decide the
+		 * upcoming rate.
+		 */
+		atomic64_set(&dc->writeback_rate.rate, 1);
+	}
+
+	mutex_unlock(&bch_register_lock);
+}
+
 static blk_qc_t cached_dev_make_request(struct request_queue *q,
 					struct bio *bio)
 {
@@ -1119,7 +1147,19 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
 		return BLK_QC_T_NONE;
 	}
 
-	atomic_set(&dc->backing_idle, 0);
+	if (d->c) {
+		atomic_set(&d->c->idle_counter, 0);
+		/*
+		 * If at_max_writeback_rate of cache set is true and new I/O
+		 * comes, quit max writeback rate of all cached devices
+		 * attached to this cache set, and set at_max_writeback_rate
+		 * to false.
+		 */
+		if (unlikely(atomic_read(&d->c->at_max_writeback_rate) == 1)) {
+			atomic_set(&d->c->at_max_writeback_rate, 0);
+			quit_max_writeback_rate(d->c);
+		}
+	}
 	generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0);
 
 	bio_set_dev(bio, dc->bdev);
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 225b15aa0340..d719021bff81 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -170,7 +170,8 @@ SHOW(__bch_cached_dev)
 	var_printf(writeback_running,	"%i");
 	var_print(writeback_delay);
 	var_print(writeback_percent);
-	sysfs_hprint(writeback_rate,	dc->writeback_rate.rate << 9);
+	sysfs_hprint(writeback_rate,
+		     atomic64_read(&dc->writeback_rate.rate) << 9);
 	sysfs_hprint(io_errors,		atomic_read(&dc->io_errors));
 	sysfs_printf(io_error_limit,	"%i", dc->error_limit);
 	sysfs_printf(io_disable,	"%i", dc->io_disable);
@@ -188,7 +189,8 @@ SHOW(__bch_cached_dev)
 		char change[20];
 		s64 next_io;
 
-		bch_hprint(rate,	dc->writeback_rate.rate << 9);
+		bch_hprint(rate,
+			   atomic64_read(&dc->writeback_rate.rate) << 9);
 		bch_hprint(dirty,	bcache_dev_sectors_dirty(&dc->disk) << 9);
 		bch_hprint(target,	dc->writeback_rate_target << 9);
 		bch_hprint(proportional,dc->writeback_rate_proportional << 9);
@@ -255,8 +257,12 @@ STORE(__cached_dev)
 
 	sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40);
 
-	sysfs_strtoul_clamp(writeback_rate,
-			    dc->writeback_rate.rate, 1, INT_MAX);
+	if (attr == &sysfs_writeback_rate) {
+		int v;
+
+		sysfs_strtoul_clamp(writeback_rate, v, 1, INT_MAX);
+		atomic64_set(&dc->writeback_rate.rate, v);
+	}
 
 	sysfs_strtoul_clamp(writeback_rate_update_seconds,
 			    dc->writeback_rate_update_seconds,
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index fc479b026d6d..84f90c3d996d 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -200,7 +200,7 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
 {
 	uint64_t now = local_clock();
 
-	d->next += div_u64(done * NSEC_PER_SEC, d->rate);
+	d->next += div_u64(done * NSEC_PER_SEC, atomic64_read(&d->rate));
 
 	/* Bound the time.  Don't let us fall further than 2 seconds behind
 	 * (this prevents unnecessary backlog that would make it impossible
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index cced87f8eb27..7e17f32ab563 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -442,7 +442,7 @@ struct bch_ratelimit {
 	 * Rate at which we want to do work, in units per second
 	 * The units here correspond to the units passed to bch_next_delay()
 	 */
-	uint32_t		rate;
+	atomic64_t		rate;
 };
 
 static inline void bch_ratelimit_reset(struct bch_ratelimit *d)
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index ad45ebe1a74b..72059f910230 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -49,6 +49,63 @@ static uint64_t __calc_target_rate(struct cached_dev *dc)
 	return (cache_dirty_target * bdev_share) >> WRITEBACK_SHARE_SHIFT;
 }
 
+static bool set_at_max_writeback_rate(struct cache_set *c,
+				      struct cached_dev *dc)
+{
+	int i, dirty_dc_nr = 0;
+	struct bcache_device *d;
+
+	mutex_lock(&bch_register_lock);
+	for (i = 0; i < c->devices_max_used; i++) {
+		if (!c->devices[i])
+			continue;
+		if (UUID_FLASH_ONLY(&c->uuids[i]))
+			continue;
+		d = c->devices[i];
+		dc = container_of(d, struct cached_dev, disk);
+		if (atomic_read(&dc->has_dirty))
+			dirty_dc_nr++;
+	}
+	mutex_unlock(&bch_register_lock);
+
+	/*
+	 * idle_counter is increased every time update_writeback_rate() is
+	 * rescheduled. If all backing devices attached to the same cache set
+	 * have the same dc->writeback_rate_update_seconds value, then after
+	 * about 10 rounds of update_writeback_rate() on each backing device
+	 * the code below sets c->at_max_writeback_rate to 1 and sets a max
+	 * writeback rate in each dc->writeback_rate.rate. This is not very
+	 * accurate, but it works well enough to make sure the whole cache set
+	 * has had no new I/O coming before the writeback rate is set to a max
+	 * number.
+	 */
+	if (atomic_inc_return(&c->idle_counter) < dirty_dc_nr * 10)
+		return false;
+
+	if (atomic_read(&c->at_max_writeback_rate) != 1)
+		atomic_set(&c->at_max_writeback_rate, 1);
+
+
+	atomic64_set(&dc->writeback_rate.rate, INT_MAX);
+
+	/* keep writeback_rate_target as existing value */
+	dc->writeback_rate_proportional = 0;
+	dc->writeback_rate_integral_scaled = 0;
+	dc->writeback_rate_change = 0;
+
+	/*
+	 * Check c->idle_counter and c->at_max_writeback_rate again in case
+	 * new I/O arrives before set_at_max_writeback_rate() returns. In that
+	 * case the writeback rate has been set to 1, and its new value should
+	 * be decided via __update_writeback_rate().
+	 */
+	if (atomic_read(&c->idle_counter) < dirty_dc_nr * 10 ||
+	    !atomic_read(&c->at_max_writeback_rate))
+		return false;
+
+	return true;
+}
+
 static void __update_writeback_rate(struct cached_dev *dc)
 {
 	/*
@@ -104,8 +161,9 @@ static void __update_writeback_rate(struct cached_dev *dc)
 
 	dc->writeback_rate_proportional = proportional_scaled;
 	dc->writeback_rate_integral_scaled = integral_scaled;
-	dc->writeback_rate_change = new_rate - dc->writeback_rate.rate;
-	dc->writeback_rate.rate = new_rate;
+	dc->writeback_rate_change = new_rate -
+			atomic64_read(&dc->writeback_rate.rate);
+	atomic64_set(&dc->writeback_rate.rate, new_rate);
 	dc->writeback_rate_target = target;
 }
 
@@ -138,9 +196,16 @@ static void update_writeback_rate(struct work_struct *work)
 
 	down_read(&dc->writeback_lock);
 
-	if (atomic_read(&dc->has_dirty) &&
-	    dc->writeback_percent)
-		__update_writeback_rate(dc);
+	if (atomic_read(&dc->has_dirty) && dc->writeback_percent) {
+		/*
+		 * If the whole cache set is idle, set_at_max_writeback_rate()
+		 * will set the writeback rate to a max number. Then it is
+		 * unnecessary to update the writeback rate for an idle cache
+		 * set which is already at the maximum writeback rate.
+		 */
+		if (!set_at_max_writeback_rate(c, dc))
+			__update_writeback_rate(dc);
+	}
 
 	up_read(&dc->writeback_lock);
 
@@ -422,27 +487,6 @@ static void read_dirty(struct cached_dev *dc)
 
 		delay = writeback_delay(dc, size);
 
-		/* If the control system would wait for at least half a
-		 * second, and there's been no reqs hitting the backing disk
-		 * for awhile: use an alternate mode where we have at most
-		 * one contiguous set of writebacks in flight at a time.  If
-		 * someone wants to do IO it will be quick, as it will only
-		 * have to contend with one operation in flight, and we'll
-		 * be round-tripping data to the backing disk as quickly as
-		 * it can accept it.
-		 */
-		if (delay >= HZ / 2) {
-			/* 3 means at least 1.5 seconds, up to 7.5 if we
-			 * have slowed way down.
-			 */
-			if (atomic_inc_return(&dc->backing_idle) >= 3) {
-				/* Wait for current I/Os to finish */
-				closure_sync(&cl);
-				/* And immediately launch a new set. */
-				delay = 0;
-			}
-		}
-
 		while (!kthread_should_stop() &&
 		       !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) &&
 		       delay) {
@@ -715,7 +759,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
 	dc->writeback_running		= true;
 	dc->writeback_percent		= 10;
 	dc->writeback_delay		= 30;
-	dc->writeback_rate.rate		= 1024;
+	atomic64_set(&dc->writeback_rate.rate, 1024);
 	dc->writeback_rate_minimum	= 8;
 
 	dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT;
-- 
2.17.1
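
For readers following the discussion below, here is a minimal, self-contained
user-space sketch of the idle-detection heuristic the patch introduces. It is
not part of the patch: the two-device count, the ten-rounds-per-device
constant and the MIN_RATE/MAX_RATE values are illustrative assumptions only,
mirroring the dirty_dc_nr * 10 threshold, the rate of 1 set by
quit_max_writeback_rate() and the INT_MAX rate set by
set_at_max_writeback_rate() above.

/*
 * Minimal user-space sketch of the idle-detection heuristic above.
 * Not kernel code: the constants and device count are assumptions.
 */
#include <stdio.h>

#define ROUNDS_PER_DEVICE	10		/* mirrors the dirty_dc_nr * 10 threshold */
#define MAX_RATE		2147483647L	/* stands in for INT_MAX */
#define MIN_RATE		1L		/* rate handed back to the PI controller */

struct cache_set_sim {
	int idle_counter;		/* bumped by every rate-update round */
	int at_max_writeback_rate;	/* 1 once the whole set is considered idle */
};

struct cached_dev_sim {
	long rate;			/* stand-in for dc->writeback_rate.rate */
};

/* One update_writeback_rate() round on one dirty backing device. */
static int update_round(struct cache_set_sim *c, struct cached_dev_sim *dc,
			int dirty_dc_nr)
{
	if (++c->idle_counter < dirty_dc_nr * ROUNDS_PER_DEVICE)
		return 0;			/* not idle for long enough yet */
	c->at_max_writeback_rate = 1;
	dc->rate = MAX_RATE;			/* write back as fast as possible */
	/* the kernel code re-checks idle_counter here to catch racing I/O */
	return 1;
}

/* A front-end I/O request arriving on any bcache device of the set. */
static void front_end_io(struct cache_set_sim *c, struct cached_dev_sim *dc)
{
	c->idle_counter = 0;
	if (c->at_max_writeback_rate) {
		c->at_max_writeback_rate = 0;
		dc->rate = MIN_RATE;		/* let the PI controller take over */
	}
}

int main(void)
{
	struct cache_set_sim c = { 0, 0 };
	struct cached_dev_sim dc = { 1024 };	/* default rate from writeback_init */
	int dirty_dc_nr = 2;			/* assume two dirty backing devices */
	int round;

	/*
	 * With two devices each updating every 5 seconds, 20 rounds of
	 * uninterrupted updates correspond to roughly 50 seconds of idleness.
	 */
	for (round = 1; round <= 21; round++)
		if (update_round(&c, &dc, dirty_dc_nr))
			printf("round %d: switched to max rate %ld\n",
			       round, dc.rate);

	front_end_io(&c, &dc);
	printf("after new I/O: rate %ld, at_max %d\n",
	       dc.rate, c.at_max_writeback_rate);
	return 0;
}

As a rough sanity check on the numbers: bch_next_delay() above advances
d->next by done * NSEC_PER_SEC / rate, so at the initial rate of 1024 (set in
bch_cached_dev_writeback_init()) a chunk of done = 512 units is spread over
about half a second, while at the idle-time rate of INT_MAX the added delay is
effectively zero.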


* Re: [PATCH] bcache: set max writeback rate when I/O request is idle
  2018-07-22 16:13 [PATCH] bcache: set max writeback rate when I/O request is idle Coly Li
@ 2018-07-23  6:38 ` Stefan Priebe - Profihost AG
  2018-07-23  9:05   ` Coly Li
  2018-07-23 13:14 ` Kai Krakow
  1 sibling, 1 reply; 7+ messages in thread
From: Stefan Priebe - Profihost AG @ 2018-07-23  6:38 UTC (permalink / raw)
  To: Coly Li, linux-bcache; +Cc: linux-block, stable, Michael Lyle

Hi Coly,

thanks for this patch. I was right in the middle of debugging why I have
lots of stalled I/O on the latest SLES12-SP3 kernel. I'll add this patch
and try whether it solves my issues.

Each cache set has two backing devices.

Greets,
Stefan



* Re: [PATCH] bcache: set max writeback rate when I/O request is idle
  2018-07-23  6:38 ` Stefan Priebe - Profihost AG
@ 2018-07-23  9:05   ` Coly Li
  2018-07-23  9:11     ` Stefan Priebe - Profihost AG
  0 siblings, 1 reply; 7+ messages in thread
From: Coly Li @ 2018-07-23  9:05 UTC (permalink / raw)
  To: Stefan Priebe - Profihost AG, linux-bcache
  Cc: linux-block, stable, Michael Lyle

On 2018/7/23 2:38 PM, Stefan Priebe - Profihost AG wrote:
> Hi Coly,
> 
> thanks for this patch. I was right in the middle of debugging why I have
> lots of stalled I/O on the latest SLES12-SP3 kernel. I'll add this patch
> and try whether it solves my issues.
> 
> Each cache set has two backing devices.

Hi Stefan,

Thanks for the testing.

Coly Li



* Re: [PATCH] bcache: set max writeback rate when I/O request is idle
  2018-07-23  9:05   ` Coly Li
@ 2018-07-23  9:11     ` Stefan Priebe - Profihost AG
  0 siblings, 0 replies; 7+ messages in thread
From: Stefan Priebe - Profihost AG @ 2018-07-23  9:11 UTC (permalink / raw)
  To: Coly Li, linux-bcache; +Cc: linux-block, stable, Michael Lyle


Am 23.07.2018 um 11:05 schrieb Coly Li:
> On 2018/7/23 2:38 PM, Stefan Priebe - Profihost AG wrote:
>> Hi Coly,
>>
>> thanks for this patch. I was right in the middle of debugging why I have
>> lots of stalled I/O on the latest SLES12-SP3 kernel. I'll add this patch
>> and try whether it solves my issues.
>>
>> Each cache set has two backing devices.
> 
> Hi Stefan,
> 
> Thanks for the testing.

OK, it does not apply to SLES12-SP3 at all. I'll use the SLES15 kernel to test.

Stefan

> Coly Li
> 
> 


* Re: [PATCH] bcache: set max writeback rate when I/O request is idle
  2018-07-22 16:13 [PATCH] bcache: set max writeback rate when I/O request is idle Coly Li
  2018-07-23  6:38 ` Stefan Priebe - Profihost AG
@ 2018-07-23 13:14 ` Kai Krakow
  2018-07-23 13:39   ` Coly Li
  1 sibling, 1 reply; 7+ messages in thread
From: Kai Krakow @ 2018-07-23 13:14 UTC (permalink / raw)
  To: Coly Li; +Cc: linux-bcache, linux-block, stable, Michael Lyle

2018-07-22 18:13 GMT+02:00 Coly Li <colyli@suse.de>:
> Commit b1092c9af9ed ("bcache: allow quick writeback when backing idle")
> allows the writeback rate to be faster if there is no I/O request on a
> bcache device. It works well if there is only one bcache device attached
> to the cache set. If there are many bcache devices attached to a cache
> set, it may introduce a performance regression because multiple faster
> writeback threads of the idle bcache devices will compete for the btree
> level locks with the bcache device which has I/O requests coming.
>
> This patch fixes the above issue by only permitting fast writeback when
> all bcache devices attached to the cache set are idle. If one of the
> bcache devices has a new I/O request coming, the writeback throughput of
> all devices is minimized immediately, and the PI controller
> __update_writeback_rate() decides the upcoming writeback rate for each
> bcache device.
>
> Also, when all bcache devices are idle, limiting the writeback rate to a
> small number wastes throughput, especially when the backing devices are
> slower non-rotational devices (e.g. SATA SSDs). This patch sets a max
> writeback rate for each backing device if the whole cache set is idle. A
> faster writeback rate in idle time means new I/Os may have more available
> space for dirty data, and people may observe better write performance then.
>
> Please note bcache may change its cache mode at run time, and this patch
> still works if the cache mode is switched away from writeback mode while
> there is still dirty data on the cache.

Running with this patch on 4.17, I see reduced IO lags in desktop
usage during heavy writes. The reduction is minor but noticeable. The
small, short IO lags seem to be fixed by this, but I still see the
stalls with long lags in desktop usage. I don't attribute those to
bcache, though. This seems more like an issue resulting from using
bees on btrfs.


> Fixes: b1092c9af9ed ("bcache: allow quick writeback when backing idle")
> Cc: stable@vger.kernel.org #4.16+
> Signed-off-by: Coly Li <colyli@suse.de>
> Cc: Michael Lyle <mlyle@lyle.org>

Tested-by: Kai Krakow <kai@kaishome.de>


> ---
>  drivers/md/bcache/bcache.h    |  9 +---
>  drivers/md/bcache/request.c   | 42 ++++++++++++++-
>  drivers/md/bcache/sysfs.c     | 14 +++--
>  drivers/md/bcache/util.c      |  2 +-
>  drivers/md/bcache/util.h      |  2 +-
>  drivers/md/bcache/writeback.c | 98 +++++++++++++++++++++++++----------
>  6 files changed, 126 insertions(+), 41 deletions(-)
>
> diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
> index d6bf294f3907..f7451e8be03c 100644
> --- a/drivers/md/bcache/bcache.h
> +++ b/drivers/md/bcache/bcache.h
> @@ -328,13 +328,6 @@ struct cached_dev {
>          */
>         atomic_t                has_dirty;
>
> -       /*
> -        * Set to zero by things that touch the backing volume-- except
> -        * writeback.  Incremented by writeback.  Used to determine when to
> -        * accelerate idle writeback.
> -        */
> -       atomic_t                backing_idle;
> -
>         struct bch_ratelimit    writeback_rate;
>         struct delayed_work     writeback_rate_update;
>
> @@ -514,6 +507,8 @@ struct cache_set {
>         struct cache_accounting accounting;
>
>         unsigned long           flags;
> +       atomic_t                idle_counter;
> +       atomic_t                at_max_writeback_rate;
>
>         struct cache_sb         sb;
>
> diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
> index ae67f5fa8047..fe45f561a054 100644
> --- a/drivers/md/bcache/request.c
> +++ b/drivers/md/bcache/request.c
> @@ -1104,6 +1104,34 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio)
>
>  /* Cached devices - read & write stuff */
>
> +static void quit_max_writeback_rate(struct cache_set *c)
> +{
> +       int i;
> +       struct bcache_device *d;
> +       struct cached_dev *dc;
> +
> +       mutex_lock(&bch_register_lock);
> +
> +       for (i = 0; i < c->devices_max_used; i++) {
> +               if (!c->devices[i])
> +                       continue;
> +
> +               if (UUID_FLASH_ONLY(&c->uuids[i]))
> +                       continue;
> +
> +               d = c->devices[i];
> +               dc = container_of(d, struct cached_dev, disk);
> +               /*
> +                * set writeback rate to default minimum value,
> +                * then let update_writeback_rate() decide the
> +                * upcoming rate.
> +                */
> +               atomic64_set(&dc->writeback_rate.rate, 1);
> +       }
> +
> +       mutex_unlock(&bch_register_lock);
> +}
> +
>  static blk_qc_t cached_dev_make_request(struct request_queue *q,
>                                         struct bio *bio)
>  {
> @@ -1119,7 +1147,19 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
>                 return BLK_QC_T_NONE;
>         }
>
> -       atomic_set(&dc->backing_idle, 0);
> +       if (d->c) {
> +               atomic_set(&d->c->idle_counter, 0);
> +               /*
> +                * If at_max_writeback_rate of cache set is true and new I/O
> +                * comes, quit max writeback rate of all cached devices
> +                * attached to this cache set, and set at_max_writeback_rate
> +                * to false.
> +                */
> +               if (unlikely(atomic_read(&d->c->at_max_writeback_rate) == 1)) {
> +                       atomic_set(&d->c->at_max_writeback_rate, 0);
> +                       quit_max_writeback_rate(d->c);
> +               }
> +       }
>         generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0);
>
>         bio_set_dev(bio, dc->bdev);
> diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
> index 225b15aa0340..d719021bff81 100644
> --- a/drivers/md/bcache/sysfs.c
> +++ b/drivers/md/bcache/sysfs.c
> @@ -170,7 +170,8 @@ SHOW(__bch_cached_dev)
>         var_printf(writeback_running,   "%i");
>         var_print(writeback_delay);
>         var_print(writeback_percent);
> -       sysfs_hprint(writeback_rate,    dc->writeback_rate.rate << 9);
> +       sysfs_hprint(writeback_rate,
> +                    atomic64_read(&dc->writeback_rate.rate) << 9);
>         sysfs_hprint(io_errors,         atomic_read(&dc->io_errors));
>         sysfs_printf(io_error_limit,    "%i", dc->error_limit);
>         sysfs_printf(io_disable,        "%i", dc->io_disable);
> @@ -188,7 +189,8 @@ SHOW(__bch_cached_dev)
>                 char change[20];
>                 s64 next_io;
>
> -               bch_hprint(rate,        dc->writeback_rate.rate << 9);
> +               bch_hprint(rate,
> +                          atomic64_read(&dc->writeback_rate.rate) << 9);
>                 bch_hprint(dirty,       bcache_dev_sectors_dirty(&dc->disk) << 9);
>                 bch_hprint(target,      dc->writeback_rate_target << 9);
>                 bch_hprint(proportional,dc->writeback_rate_proportional << 9);
> @@ -255,8 +257,12 @@ STORE(__cached_dev)
>
>         sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40);
>
> -       sysfs_strtoul_clamp(writeback_rate,
> -                           dc->writeback_rate.rate, 1, INT_MAX);
> +       if (attr == &sysfs_writeback_rate) {
> +               int v;
> +
> +               sysfs_strtoul_clamp(writeback_rate, v, 1, INT_MAX);
> +               atomic64_set(&dc->writeback_rate.rate, v);
> +       }
>
>         sysfs_strtoul_clamp(writeback_rate_update_seconds,
>                             dc->writeback_rate_update_seconds,
> diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
> index fc479b026d6d..84f90c3d996d 100644
> --- a/drivers/md/bcache/util.c
> +++ b/drivers/md/bcache/util.c
> @@ -200,7 +200,7 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
>  {
>         uint64_t now = local_clock();
>
> -       d->next += div_u64(done * NSEC_PER_SEC, d->rate);
> +       d->next += div_u64(done * NSEC_PER_SEC, atomic64_read(&d->rate));
>
>         /* Bound the time.  Don't let us fall further than 2 seconds behind
>          * (this prevents unnecessary backlog that would make it impossible
> diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
> index cced87f8eb27..7e17f32ab563 100644
> --- a/drivers/md/bcache/util.h
> +++ b/drivers/md/bcache/util.h
> @@ -442,7 +442,7 @@ struct bch_ratelimit {
>          * Rate at which we want to do work, in units per second
>          * The units here correspond to the units passed to bch_next_delay()
>          */
> -       uint32_t                rate;
> +       atomic64_t              rate;
>  };
>
>  static inline void bch_ratelimit_reset(struct bch_ratelimit *d)
> diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
> index ad45ebe1a74b..72059f910230 100644
> --- a/drivers/md/bcache/writeback.c
> +++ b/drivers/md/bcache/writeback.c
> @@ -49,6 +49,63 @@ static uint64_t __calc_target_rate(struct cached_dev *dc)
>         return (cache_dirty_target * bdev_share) >> WRITEBACK_SHARE_SHIFT;
>  }
>
> +static bool set_at_max_writeback_rate(struct cache_set *c,
> +                                     struct cached_dev *dc)
> +{
> +       int i, dirty_dc_nr = 0;
> +       struct bcache_device *d;
> +
> +       mutex_lock(&bch_register_lock);
> +       for (i = 0; i < c->devices_max_used; i++) {
> +               if (!c->devices[i])
> +                       continue;
> +               if (UUID_FLASH_ONLY(&c->uuids[i]))
> +                       continue;
> +               d = c->devices[i];
> +               dc = container_of(d, struct cached_dev, disk);
> +               if (atomic_read(&dc->has_dirty))
> +                       dirty_dc_nr++;
> +       }
> +       mutex_unlock(&bch_register_lock);
> +
> +       /*
> +        * idle_counter is increased every time update_writeback_rate()
> +        * is rescheduled in. If all backing devices attached to the same
> +        * cache set have the same dc->writeback_rate_update_seconds value,
> +        * then after about 10 rounds of update_writeback_rate() on each
> +        * backing device the code falls through here, sets 1 to
> +        * c->at_max_writeback_rate, and sets a max writeback rate to each
> +        * dc->writeback_rate.rate. This is not very accurate but works well
> +        * to make sure the whole cache set has no new I/O coming before
> +        * the writeback rate is set to a max number.
> +        */
> +       if (atomic_inc_return(&c->idle_counter) < dirty_dc_nr * 10)
> +               return false;
> +
> +       if (atomic_read(&c->at_max_writeback_rate) != 1)
> +               atomic_set(&c->at_max_writeback_rate, 1);
> +
> +
> +       atomic64_set(&dc->writeback_rate.rate, INT_MAX);
> +
> +       /* keep writeback_rate_target as existing value */
> +       dc->writeback_rate_proportional = 0;
> +       dc->writeback_rate_integral_scaled = 0;
> +       dc->writeback_rate_change = 0;
> +
> +       /*
> +        * Check c->idle_counter and c->at_max_writeback_rate again in case
> +        * new I/O arrives before set_at_max_writeback_rate() returns.
> +        * Then the writeback rate is set to 1, and its new value should be
> +        * decided via __update_writeback_rate().
> +        */
> +       if (atomic_read(&c->idle_counter) < dirty_dc_nr * 10 ||
> +           !atomic_read(&c->at_max_writeback_rate))
> +               return false;
> +
> +       return true;
> +}
> +
>  static void __update_writeback_rate(struct cached_dev *dc)
>  {
>         /*
> @@ -104,8 +161,9 @@ static void __update_writeback_rate(struct cached_dev *dc)
>
>         dc->writeback_rate_proportional = proportional_scaled;
>         dc->writeback_rate_integral_scaled = integral_scaled;
> -       dc->writeback_rate_change = new_rate - dc->writeback_rate.rate;
> -       dc->writeback_rate.rate = new_rate;
> +       dc->writeback_rate_change = new_rate -
> +                       atomic64_read(&dc->writeback_rate.rate);
> +       atomic64_set(&dc->writeback_rate.rate, new_rate);
>         dc->writeback_rate_target = target;
>  }
>
> @@ -138,9 +196,16 @@ static void update_writeback_rate(struct work_struct *work)
>
>         down_read(&dc->writeback_lock);
>
> -       if (atomic_read(&dc->has_dirty) &&
> -           dc->writeback_percent)
> -               __update_writeback_rate(dc);
> +       if (atomic_read(&dc->has_dirty) && dc->writeback_percent) {
> +               /*
> +                * If the whole cache set is idle, set_at_max_writeback_rate()
> +                * will set the writeback rate to a max number. Then it is
> +                * unnecessary to update the writeback rate for an idle cache
> +                * set which is already at the maximum writeback rate.
> +                */
> +               if (!set_at_max_writeback_rate(c, dc))
> +                       __update_writeback_rate(dc);
> +       }
>
>         up_read(&dc->writeback_lock);
>
> @@ -422,27 +487,6 @@ static void read_dirty(struct cached_dev *dc)
>
>                 delay = writeback_delay(dc, size);
>
> -               /* If the control system would wait for at least half a
> -                * second, and there's been no reqs hitting the backing disk
> -                * for awhile: use an alternate mode where we have at most
> -                * one contiguous set of writebacks in flight at a time.  If
> -                * someone wants to do IO it will be quick, as it will only
> -                * have to contend with one operation in flight, and we'll
> -                * be round-tripping data to the backing disk as quickly as
> -                * it can accept it.
> -                */
> -               if (delay >= HZ / 2) {
> -                       /* 3 means at least 1.5 seconds, up to 7.5 if we
> -                        * have slowed way down.
> -                        */
> -                       if (atomic_inc_return(&dc->backing_idle) >= 3) {
> -                               /* Wait for current I/Os to finish */
> -                               closure_sync(&cl);
> -                               /* And immediately launch a new set. */
> -                               delay = 0;
> -                       }
> -               }
> -
>                 while (!kthread_should_stop() &&
>                        !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) &&
>                        delay) {
> @@ -715,7 +759,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
>         dc->writeback_running           = true;
>         dc->writeback_percent           = 10;
>         dc->writeback_delay             = 30;
> -       dc->writeback_rate.rate         = 1024;
> +       atomic64_set(&dc->writeback_rate.rate, 1024);
>         dc->writeback_rate_minimum      = 8;
>
>         dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT;
> --
> 2.17.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-bcache" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] bcache: set max writeback rate when I/O request is idle
  2018-07-23 13:14 ` Kai Krakow
@ 2018-07-23 13:39   ` Coly Li
  0 siblings, 0 replies; 7+ messages in thread
From: Coly Li @ 2018-07-23 13:39 UTC (permalink / raw)
  To: Kai Krakow; +Cc: linux-bcache, linux-block, stable, Michael Lyle

On 2018/7/23 9:14 PM, Kai Krakow wrote:
> 2018-07-22 18:13 GMT+02:00 Coly Li <colyli@suse.de>:
>> Commit b1092c9af9ed ("bcache: allow quick writeback when backing idle")
>> allows the writeback rate to be faster if there is no I/O request on a
>> bcache device. It works well if there is only one bcache device attached
>> to the cache set. If there are many bcache devices attached to a cache
>> set, it may introduce a performance regression because multiple faster
>> writeback threads of the idle bcache devices will compete for the btree
>> level locks with the bcache device which has I/O requests coming.
>>
>> This patch fixes the above issue by only permitting fast writeback when
>> all bcache devices attached to the cache set are idle. And if one of the
>> bcache devices has a new I/O request coming, all writeback throughput is
>> minimized immediately and the PI controller __update_writeback_rate()
>> decides the upcoming writeback rate for each bcache device.
>>
>> Also, when all bcache devices are idle, limiting the writeback rate to a
>> small number wastes throughput, especially when the backing devices are
>> slower non-rotational devices (e.g. SATA SSD). This patch sets a max
>> writeback rate for each backing device if the whole cache set is idle. A
>> faster writeback rate in idle time means new I/Os may have more available
>> space for dirty data, and people may then observe better write performance.
>>
>> Please note that bcache may change its cache mode at run time, and this
>> patch still works if the cache mode is switched away from writeback mode
>> while there is still dirty data on the cache.
> 
> Running with this patch on 4.17, I see reduced I/O lags in desktop
> usage during heavy writes. The reduction is minor but noticeable. The
> small, short I/O lags seem to be fixed by this, but I still see the
> stalls with long lags in desktop usage. I don't attribute those to
> bcache, though; that seems more like an issue resulting from using bees
> on btrfs.
> 
> 
>> Fixes: b1092c9af9ed ("bcache: allow quick writeback when backing idle")
>> Cc: stable@vger.kernel.org #4.16+
>> Signed-off-by: Coly Li <colyli@suse.de>
>> Cc: Michael Lyle <mlyle@lyle.org>
> 

Hi Kai,

> Tested-by: Kai Krakow <kai@kaishome.de>
> 

Thank you for the testing and information sharing, very helpful :-)

Coly Li

> 
>> ---
>>  drivers/md/bcache/bcache.h    |  9 +---
>>  drivers/md/bcache/request.c   | 42 ++++++++++++++-
>>  drivers/md/bcache/sysfs.c     | 14 +++--
>>  drivers/md/bcache/util.c      |  2 +-
>>  drivers/md/bcache/util.h      |  2 +-
>>  drivers/md/bcache/writeback.c | 98 +++++++++++++++++++++++++----------
>>  6 files changed, 126 insertions(+), 41 deletions(-)
[snip]

^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH] bcache: set max writeback rate when I/O request is idle
@ 2018-07-22 16:10 Coly Li
  0 siblings, 0 replies; 7+ messages in thread
From: Coly Li @ 2018-07-22 16:10 UTC (permalink / raw)
  To: colyli; +Cc: stable, Michael Lyle

Commit b1092c9af9ed ("bcache: allow quick writeback when backing idle")
allows the writeback rate to be faster if there is no I/O request on a
bcache device. It works well if there is only one bcache device attached
to the cache set. If there are many bcache devices attached to a cache
set, it may introduce a performance regression because multiple faster
writeback threads of the idle bcache devices will compete for the btree
level locks with the bcache device which has I/O requests coming.

This patch fixes the above issue by only permitting fast writeback when
all bcache devices attached to the cache set are idle. And if one of the
bcache devices has a new I/O request coming, all writeback throughput is
minimized immediately and the PI controller __update_writeback_rate()
decides the upcoming writeback rate for each bcache device.

Also, when all bcache devices are idle, limiting the writeback rate to a
small number wastes throughput, especially when the backing devices are
slower non-rotational devices (e.g. SATA SSD). This patch sets a max
writeback rate for each backing device if the whole cache set is idle. A
faster writeback rate in idle time means new I/Os may have more available
space for dirty data, and people may then observe better write performance.

Please note that bcache may change its cache mode at run time, and this
patch still works if the cache mode is switched away from writeback mode
while there is still dirty data on the cache.

Fixes: b1092c9af9ed ("bcache: allow quick writeback when backing idle")
Cc: stable@vger.kernel.org #4.16+
Signed-off-by: Coly Li <colyli@suse.de>
Cc: Michael Lyle <mlyle@lyle.org>
---
 drivers/md/bcache/bcache.h    |  9 +---
 drivers/md/bcache/request.c   | 42 ++++++++++++++-
 drivers/md/bcache/sysfs.c     | 14 +++--
 drivers/md/bcache/util.c      |  2 +-
 drivers/md/bcache/util.h      |  2 +-
 drivers/md/bcache/writeback.c | 98 +++++++++++++++++++++++++----------
 6 files changed, 126 insertions(+), 41 deletions(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index d6bf294f3907..f7451e8be03c 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -328,13 +328,6 @@ struct cached_dev {
 	 */
 	atomic_t		has_dirty;
 
-	/*
-	 * Set to zero by things that touch the backing volume-- except
-	 * writeback.  Incremented by writeback.  Used to determine when to
-	 * accelerate idle writeback.
-	 */
-	atomic_t		backing_idle;
-
 	struct bch_ratelimit	writeback_rate;
 	struct delayed_work	writeback_rate_update;
 
@@ -514,6 +507,8 @@ struct cache_set {
 	struct cache_accounting accounting;
 
 	unsigned long		flags;
+	atomic_t		idle_counter;
+	atomic_t		at_max_writeback_rate;
 
 	struct cache_sb		sb;
 
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index ae67f5fa8047..fe45f561a054 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -1104,6 +1104,34 @@ static void detached_dev_do_request(struct bcache_device *d, struct bio *bio)
 
 /* Cached devices - read & write stuff */
 
+static void quit_max_writeback_rate(struct cache_set *c)
+{
+	int i;
+	struct bcache_device *d;
+	struct cached_dev *dc;
+
+	mutex_lock(&bch_register_lock);
+
+	for (i = 0; i < c->devices_max_used; i++) {
+		if (!c->devices[i])
+			continue;
+
+		if (UUID_FLASH_ONLY(&c->uuids[i]))
+			continue;
+
+		d = c->devices[i];
+		dc = container_of(d, struct cached_dev, disk);
+		/*
+		 * set writeback rate to default minimum value,
+		 * then let update_writeback_rate() decide the
+		 * upcoming rate.
+		 */
+		atomic64_set(&dc->writeback_rate.rate, 1);
+	}
+
+	mutex_unlock(&bch_register_lock);
+}
+
 static blk_qc_t cached_dev_make_request(struct request_queue *q,
 					struct bio *bio)
 {
@@ -1119,7 +1147,19 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
 		return BLK_QC_T_NONE;
 	}
 
-	atomic_set(&dc->backing_idle, 0);
+	if (d->c) {
+		atomic_set(&d->c->idle_counter, 0);
+		/*
+		 * If at_max_writeback_rate of cache set is true and new I/O
+		 * comes, quit max writeback rate of all cached devices
+		 * attached to this cache set, and set at_max_writeback_rate
+		 * to false.
+		 */
+		if (unlikely(atomic_read(&d->c->at_max_writeback_rate) == 1)) {
+			atomic_set(&d->c->at_max_writeback_rate, 0);
+			quit_max_writeback_rate(d->c);
+		}
+	}
 	generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0);
 
 	bio_set_dev(bio, dc->bdev);
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 225b15aa0340..d719021bff81 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -170,7 +170,8 @@ SHOW(__bch_cached_dev)
 	var_printf(writeback_running,	"%i");
 	var_print(writeback_delay);
 	var_print(writeback_percent);
-	sysfs_hprint(writeback_rate,	dc->writeback_rate.rate << 9);
+	sysfs_hprint(writeback_rate,
+		     atomic64_read(&dc->writeback_rate.rate) << 9);
 	sysfs_hprint(io_errors,		atomic_read(&dc->io_errors));
 	sysfs_printf(io_error_limit,	"%i", dc->error_limit);
 	sysfs_printf(io_disable,	"%i", dc->io_disable);
@@ -188,7 +189,8 @@ SHOW(__bch_cached_dev)
 		char change[20];
 		s64 next_io;
 
-		bch_hprint(rate,	dc->writeback_rate.rate << 9);
+		bch_hprint(rate,
+			   atomic64_read(&dc->writeback_rate.rate) << 9);
 		bch_hprint(dirty,	bcache_dev_sectors_dirty(&dc->disk) << 9);
 		bch_hprint(target,	dc->writeback_rate_target << 9);
 		bch_hprint(proportional,dc->writeback_rate_proportional << 9);
@@ -255,8 +257,12 @@ STORE(__cached_dev)
 
 	sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40);
 
-	sysfs_strtoul_clamp(writeback_rate,
-			    dc->writeback_rate.rate, 1, INT_MAX);
+	if (attr == &sysfs_writeback_rate) {
+		int v;
+
+		sysfs_strtoul_clamp(writeback_rate, v, 1, INT_MAX);
+		atomic64_set(&dc->writeback_rate.rate, v);
+	}
 
 	sysfs_strtoul_clamp(writeback_rate_update_seconds,
 			    dc->writeback_rate_update_seconds,
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index fc479b026d6d..84f90c3d996d 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -200,7 +200,7 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
 {
 	uint64_t now = local_clock();
 
-	d->next += div_u64(done * NSEC_PER_SEC, d->rate);
+	d->next += div_u64(done * NSEC_PER_SEC, atomic64_read(&d->rate));
 
 	/* Bound the time.  Don't let us fall further than 2 seconds behind
 	 * (this prevents unnecessary backlog that would make it impossible
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index cced87f8eb27..7e17f32ab563 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -442,7 +442,7 @@ struct bch_ratelimit {
 	 * Rate at which we want to do work, in units per second
 	 * The units here correspond to the units passed to bch_next_delay()
 	 */
-	uint32_t		rate;
+	atomic64_t		rate;
 };
 
 static inline void bch_ratelimit_reset(struct bch_ratelimit *d)
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index ad45ebe1a74b..72059f910230 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -49,6 +49,63 @@ static uint64_t __calc_target_rate(struct cached_dev *dc)
 	return (cache_dirty_target * bdev_share) >> WRITEBACK_SHARE_SHIFT;
 }
 
+static bool set_at_max_writeback_rate(struct cache_set *c,
+				      struct cached_dev *dc)
+{
+	int i, dirty_dc_nr = 0;
+	struct bcache_device *d;
+
+	mutex_lock(&bch_register_lock);
+	for (i = 0; i < c->devices_max_used; i++) {
+		if (!c->devices[i])
+			continue;
+		if (UUID_FLASH_ONLY(&c->uuids[i]))
+			continue;
+		d = c->devices[i];
+		dc = container_of(d, struct cached_dev, disk);
+		if (atomic_read(&dc->has_dirty))
+			dirty_dc_nr++;
+	}
+	mutex_unlock(&bch_register_lock);
+
+	/*
+	 * idle_counter is increased every time update_writeback_rate()
+	 * is rescheduled in. If all backing devices attached to the same
+	 * cache set have the same dc->writeback_rate_update_seconds value,
+	 * then after about 10 rounds of update_writeback_rate() on each
+	 * backing device the code falls through here, sets 1 to
+	 * c->at_max_writeback_rate, and sets a max writeback rate to each
+	 * dc->writeback_rate.rate. This is not very accurate but works well
+	 * to make sure the whole cache set has no new I/O coming before
+	 * the writeback rate is set to a max number.
+	 */
+	if (atomic_inc_return(&c->idle_counter) < dirty_dc_nr * 10)
+		return false;
+
+	if (atomic_read(&c->at_max_writeback_rate) != 1)
+		atomic_set(&c->at_max_writeback_rate, 1);
+
+
+	atomic64_set(&dc->writeback_rate.rate, INT_MAX);
+
+	/* keep writeback_rate_target as existing value */
+	dc->writeback_rate_proportional = 0;
+	dc->writeback_rate_integral_scaled = 0;
+	dc->writeback_rate_change = 0;
+
+	/*
+	 * Check c->idle_counter and c->at_max_writeback_rate again in case
+	 * new I/O arrives before set_at_max_writeback_rate() returns.
+	 * Then the writeback rate is set to 1, and its new value should be
+	 * decided via __update_writeback_rate().
+	 */
+	if (atomic_read(&c->idle_counter) < dirty_dc_nr * 10 ||
+	    !atomic_read(&c->at_max_writeback_rate))
+		return false;
+
+	return true;
+}
+
 static void __update_writeback_rate(struct cached_dev *dc)
 {
 	/*
@@ -104,8 +161,9 @@ static void __update_writeback_rate(struct cached_dev *dc)
 
 	dc->writeback_rate_proportional = proportional_scaled;
 	dc->writeback_rate_integral_scaled = integral_scaled;
-	dc->writeback_rate_change = new_rate - dc->writeback_rate.rate;
-	dc->writeback_rate.rate = new_rate;
+	dc->writeback_rate_change = new_rate -
+			atomic64_read(&dc->writeback_rate.rate);
+	atomic64_set(&dc->writeback_rate.rate, new_rate);
 	dc->writeback_rate_target = target;
 }
 
@@ -138,9 +196,16 @@ static void update_writeback_rate(struct work_struct *work)
 
 	down_read(&dc->writeback_lock);
 
-	if (atomic_read(&dc->has_dirty) &&
-	    dc->writeback_percent)
-		__update_writeback_rate(dc);
+	if (atomic_read(&dc->has_dirty) && dc->writeback_percent) {
+		/*
+		 * If the whole cache set is idle, set_at_max_writeback_rate()
+		 * will set the writeback rate to a max number. Then it is
+		 * unnecessary to update the writeback rate for an idle cache
+		 * set which is already at the maximum writeback rate.
+		 */
+		if (!set_at_max_writeback_rate(c, dc))
+			__update_writeback_rate(dc);
+	}
 
 	up_read(&dc->writeback_lock);
 
@@ -422,27 +487,6 @@ static void read_dirty(struct cached_dev *dc)
 
 		delay = writeback_delay(dc, size);
 
-		/* If the control system would wait for at least half a
-		 * second, and there's been no reqs hitting the backing disk
-		 * for awhile: use an alternate mode where we have at most
-		 * one contiguous set of writebacks in flight at a time.  If
-		 * someone wants to do IO it will be quick, as it will only
-		 * have to contend with one operation in flight, and we'll
-		 * be round-tripping data to the backing disk as quickly as
-		 * it can accept it.
-		 */
-		if (delay >= HZ / 2) {
-			/* 3 means at least 1.5 seconds, up to 7.5 if we
-			 * have slowed way down.
-			 */
-			if (atomic_inc_return(&dc->backing_idle) >= 3) {
-				/* Wait for current I/Os to finish */
-				closure_sync(&cl);
-				/* And immediately launch a new set. */
-				delay = 0;
-			}
-		}
-
 		while (!kthread_should_stop() &&
 		       !test_bit(CACHE_SET_IO_DISABLE, &dc->disk.c->flags) &&
 		       delay) {
@@ -715,7 +759,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
 	dc->writeback_running		= true;
 	dc->writeback_percent		= 10;
 	dc->writeback_delay		= 30;
-	dc->writeback_rate.rate		= 1024;
+	atomic64_set(&dc->writeback_rate.rate, 1024);
 	dc->writeback_rate_minimum	= 8;
 
 	dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT;
-- 
2.17.1
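
__update_writeback_rate(), the PI controller the commit message refers to,
is only partially visible in this diff. As a rough mental model, a
proportional-integral rate controller of that general shape looks like the
stand-alone sketch below; the gains, names and clamping here are
hypothetical illustrations, not the actual bcache implementation.

#include <stdint.h>

/* Illustrative PI-style writeback rate controller (not the bcache code). */
struct wb_pi_sketch {
	int64_t integral;   /* accumulated, scaled error term      */
	int64_t rate;       /* current writeback rate, sectors/sec */
};

static int64_t pi_update_rate(struct wb_pi_sketch *s,
			      int64_t dirty, int64_t target)
{
	/* Hypothetical inverse gains, in the spirit of the *_term_inverse knobs. */
	const int64_t p_inverse = 40;     /* proportional gain = 1/40  */
	const int64_t i_inverse = 10000;  /* integral gain = 1/10000   */
	const int64_t rate_min  = 8;      /* never let writeback stall */

	int64_t error = dirty - target;

	/* Integrate the error so a persistent offset is eventually corrected. */
	s->integral += error / i_inverse;

	s->rate = error / p_inverse + s->integral;
	if (s->rate < rate_min)
		s->rate = rate_min;

	return s->rate;
}

With this patch applied, that controller is simply bypassed while the whole
cache set is idle: the rate is pinned at INT_MAX, and the first new I/O
drops it back to 1 so the controller can take over again on the next
update round.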

^ permalink raw reply related	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2018-07-23 14:40 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-07-22 16:13 [PATCH] bcache: set max writeback rate when I/O request is idle Coly Li
2018-07-23  6:38 ` Stefan Priebe - Profihost AG
2018-07-23  9:05   ` Coly Li
2018-07-23  9:11     ` Stefan Priebe - Profihost AG
2018-07-23 13:14 ` Kai Krakow
2018-07-23 13:39   ` Coly Li
  -- strict thread matches above, loose matches on Subject: below --
2018-07-22 16:10 Coly Li

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.