* [PATCH] bcache: dynamic incremental gc
From: mingzhe.zou @ 2022-04-21 12:17 UTC (permalink / raw)
  To: colyli, linux-bcache; +Cc: zoumingzhe

From: ZouMingzhe <mingzhe.zou@easystack.cn>

During GC, IO performance can be reduced by half or more.
According to our test data, when NVMe is used as the cache,
it takes about 1ms for GC to handle each node (4k block and
512k bucket).

Since GC processes at least 100 nodes each time, IOPS drops
by half and latency increases.

This patch adds some cost statistics and holds the inflight
peak. When the IO depth reaches its maximum, GC sleeps so
these requests can be handled. The GC sleep time is
calculated dynamically based on gc_cost.

Signed-off-by: Mingzhe Zou <mingzhe.zou@easystack.cn>
---
 drivers/md/bcache/bcache.h |  8 ++++
 drivers/md/bcache/btree.c  | 83 ++++++++++++++++++++++++++++++++------
 2 files changed, 78 insertions(+), 13 deletions(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 9ed9c955add7..065a1137db68 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -471,6 +471,14 @@ struct cache {
 };
 
 struct gc_stat {
+	uint64_t		gc_cost;
+	uint64_t		sleep_cost;
+	uint64_t		average_cost;
+	uint64_t		start_time;
+	uint64_t		finish_time;
+	size_t			max_inflight;
+
+	size_t			times;
 	size_t			nodes;
 	size_t			nodes_pre;
 	size_t			key_bytes;
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index f5f2718e03e5..fc721f216eb7 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -88,11 +88,12 @@
  * Test module load/unload
  */
 
-#define MAX_NEED_GC		64
-#define MAX_SAVE_PRIO		72
 #define MAX_GC_TIMES		100
 #define MIN_GC_NODES		100
-#define GC_SLEEP_MS		100
+#define MAX_GC_NODES		1000
+#define MAX_GC_PERCENT		10
+#define MIN_GC_SLEEP_MS		10
+#define MAX_GC_SLEEP_MS		1000
 
 #define PTR_DIRTY_BIT		(((uint64_t) 1 << 36))
 
@@ -1542,12 +1543,56 @@ static unsigned int btree_gc_count_keys(struct btree *b)
 	return ret;
 }
 
-static size_t btree_gc_min_nodes(struct cache_set *c)
+static uint64_t btree_gc_sleep_ms(struct cache_set *c, struct gc_stat *gc)
+{
+	uint64_t now = local_clock();
+	uint64_t expect_time, sleep_time = 0;
+
+	/*
+	 * GC may process very few nodes when IO requests are very
+	 * frequent. If the sleep time were a constant (100ms) each
+	 * time, the whole GC would last a long time. IO performance
+	 * also declines if a single GC increment takes long (e.g.
+	 * 100ms of GC plus 100ms of sleep halves IOPS). So the GC
+	 * sleep time should be calculated dynamically from gc_cost.
+	 */
+	gc->finish_time = time_after64(now, gc->start_time)
+				? now - gc->start_time : 0;
+	gc->gc_cost = gc->finish_time > gc->sleep_cost
+			? gc->finish_time - gc->sleep_cost : 0;
+	expect_time = div_u64(gc->gc_cost * (100 - MAX_GC_PERCENT), MAX_GC_PERCENT);
+	if (expect_time > gc->sleep_cost)
+		sleep_time = div_u64(expect_time - gc->sleep_cost, NSEC_PER_MSEC);
+
+	if (sleep_time < MIN_GC_SLEEP_MS)
+		sleep_time = MIN_GC_SLEEP_MS;
+	if (sleep_time > MAX_GC_SLEEP_MS)
+		sleep_time = MAX_GC_SLEEP_MS;
+
+	return sleep_time;
+}
+
+static size_t btree_gc_min_nodes(struct cache_set *c, struct gc_stat *gc)
 {
 	size_t min_nodes;
+	size_t inflight;
+
+	/*
+	 * If there are no requests, GC is not throttled. Otherwise we
+	 * want new IO requests to be served immediately, so we track
+	 * the inflight peak: when the IO depth reaches its maximum,
+	 * GC sleeps so these requests can be handled.
+	 */
+	inflight = atomic_read(&c->search_inflight);
+	if (inflight <= 0)
+		return max(c->gc_stats.nodes, gc->nodes) + 1;
+	if (inflight > gc->max_inflight)
+		gc->max_inflight = inflight;
+	if (inflight >= gc->max_inflight)
+		return 1;
 
 	/*
-	 * Since incremental GC would stop 100ms when front
+	 * Since incremental GC would sleep dynamically when front
 	 * side I/O comes, so when there are many btree nodes,
 	 * if GC only processes constant (100) nodes each time,
 	 * GC would last a long time, and the front side I/Os
@@ -1558,11 +1603,14 @@ static size_t btree_gc_min_nodes(struct cache_set *c)
 	 * realized by dividing GC into constant(100) times,
 	 * so when there are many btree nodes, GC can process
 	 * more nodes each time, otherwise, GC will process less
-	 * nodes each time (but no less than MIN_GC_NODES)
+	 * nodes each time (but no less than MIN_GC_NODES and
+	 * no more than MAX_GC_NODES)
 	 */
 	min_nodes = c->gc_stats.nodes / MAX_GC_TIMES;
 	if (min_nodes < MIN_GC_NODES)
 		min_nodes = MIN_GC_NODES;
+	if (min_nodes > MAX_GC_NODES)
+		min_nodes = MAX_GC_NODES;
 
 	return min_nodes;
 }
@@ -1633,8 +1681,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
 		memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1));
 		r->b = NULL;
 
-		if (atomic_read(&b->c->search_inflight) &&
-		    gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c)) {
+		if (gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c, gc)) {
 			gc->nodes_pre =  gc->nodes;
 			ret = -EAGAIN;
 			break;
@@ -1789,7 +1836,7 @@ static void bch_btree_gc(struct cache_set *c)
 	struct gc_stat stats;
 	struct closure writes;
 	struct btree_op op;
-	uint64_t start_time = local_clock();
+	uint64_t sleep_time;
 
 	trace_bcache_gc_start(c);
 
@@ -1798,24 +1845,34 @@ static void bch_btree_gc(struct cache_set *c)
 	bch_btree_op_init(&op, SHRT_MAX);
 
 	btree_gc_start(c);
+	stats.start_time = local_clock();
 
 	/* if CACHE_SET_IO_DISABLE set, gc thread should stop too */
 	do {
+		stats.times++;
 		ret = bcache_btree_root(gc_root, c, &op, &writes, &stats);
 		closure_sync(&writes);
 		cond_resched();
 
-		if (ret == -EAGAIN)
+		sleep_time = btree_gc_sleep_ms(c, &stats);
+		if (ret == -EAGAIN) {
+			stats.sleep_cost += sleep_time * NSEC_PER_MSEC;
 			schedule_timeout_interruptible(msecs_to_jiffies
-						       (GC_SLEEP_MS));
-		else if (ret)
+						       (sleep_time));
+		} else if (ret)
 			pr_warn("gc failed!\n");
 	} while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags));
 
 	bch_btree_gc_finish(c);
 	wake_up_allocators(c);
 
-	bch_time_stats_update(&c->btree_gc_time, start_time);
+	bch_time_stats_update(&c->btree_gc_time, stats.start_time);
+	stats.average_cost = stats.gc_cost / stats.nodes;
+	pr_info("gc %llu times with %llu nodes, sleep %llums, "
+		"average gc cost %lluus per node",
+		(uint64_t)stats.times, (uint64_t)stats.nodes,
+		div_u64(stats.sleep_cost, NSEC_PER_MSEC),
+		div_u64(stats.average_cost, NSEC_PER_USEC));
 
 	stats.key_bytes *= sizeof(uint64_t);
 	stats.data	<<= 9;
-- 
2.17.1
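
For reference, the sleep length above works toward a fixed duty cycle:
expect_time = gc_cost * (100 - MAX_GC_PERCENT) / MAX_GC_PERCENT is the
cumulative sleep that would hold GC at MAX_GC_PERCENT (10%) of
wall-clock time, so each sleep only makes up the shortfall against the
sleep already taken, clamped to [MIN_GC_SLEEP_MS, MAX_GC_SLEEP_MS].
A minimal user-space sketch of the same arithmetic, with plain C
stand-ins for the kernel helpers (an illustration, not part of the
patch):

#include <stdint.h>
#include <stdio.h>

#define MAX_GC_PERCENT   10
#define MIN_GC_SLEEP_MS  10
#define MAX_GC_SLEEP_MS  1000
#define NSEC_PER_MSEC    1000000ULL

/*
 * Mirror of btree_gc_sleep_ms(): gc_cost and sleep_cost are the
 * nanoseconds spent walking the btree and sleeping, respectively,
 * accumulated since this GC pass started.
 */
static uint64_t gc_sleep_ms(uint64_t gc_cost, uint64_t sleep_cost)
{
	/* cumulative sleep that keeps GC at MAX_GC_PERCENT duty */
	uint64_t expect_time = gc_cost * (100 - MAX_GC_PERCENT)
				/ MAX_GC_PERCENT;
	uint64_t sleep_time = 0;

	if (expect_time > sleep_cost)
		sleep_time = (expect_time - sleep_cost) / NSEC_PER_MSEC;

	if (sleep_time < MIN_GC_SLEEP_MS)
		sleep_time = MIN_GC_SLEEP_MS;
	if (sleep_time > MAX_GC_SLEEP_MS)
		sleep_time = MAX_GC_SLEEP_MS;
	return sleep_time;
}

int main(void)
{
	/* 2ms of GC, no sleep yet: 9 * 2ms = 18ms of sleep, so GC
	 * occupies 2 / (2 + 18) = 10% of the elapsed time. */
	printf("%llu ms\n",
	       (unsigned long long)gc_sleep_ms(2 * NSEC_PER_MSEC, 0));

	/* already slept enough: clamped to the 10ms floor */
	printf("%llu ms\n",
	       (unsigned long long)gc_sleep_ms(2 * NSEC_PER_MSEC,
					       18 * NSEC_PER_MSEC));
	return 0;
}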



* Re: [PATCH] bcache: dynamic incremental gc
From: Coly Li @ 2022-04-21 13:28 UTC (permalink / raw)
  To: mingzhe.zou; +Cc: zoumingzhe, linux-bcache

On 4/21/22 8:17 PM, mingzhe.zou@easystack.cn wrote:
> From: ZouMingzhe <mingzhe.zou@easystack.cn>
>
> During GC, IO performance can be reduced by half or more.
> According to our test data, when NVMe is used as the cache,
> it takes about 1ms for GC to handle each node (4k block and
> 512k bucket).
>
> Since GC processes at least 100 nodes each time, IOPS drops
> by half and latency increases.
>
> This patch adds some cost statistics and holds the inflight
> peak. When the IO depth reaches its maximum, GC sleeps so
> these requests can be handled. The GC sleep time is
> calculated dynamically based on gc_cost.

Hi Mingzhe,

What is the problem this patch intends to solve, and what is the
result of the change?


Thanks.

Coly Li



* Re: [PATCH] bcache: dynamic incremental gc
From: kernel test robot @ 2022-04-21 18:12 UTC (permalink / raw)
  To: mingzhe.zou, colyli, linux-bcache; +Cc: kbuild-all, zoumingzhe

Hi,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v5.18-rc3 next-20220421]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting a patch, we suggest using '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/intel-lab-lkp/linux/commits/mingzhe-zou-easystack-cn/bcache-dynamic-incremental-gc/20220421-201917
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git b253435746d9a4a701b5f09211b9c14d3370d0da
config: m68k-randconfig-r025-20220421 (https://download.01.org/0day-ci/archive/20220422/202204220253.PJykKQfz-lkp@intel.com/config)
compiler: m68k-linux-gcc (GCC) 11.2.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/9934df989e22e2a0da9c61c9c47da9839220570e
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review mingzhe-zou-easystack-cn/bcache-dynamic-incremental-gc/20220421-201917
        git checkout 9934df989e22e2a0da9c61c9c47da9839220570e
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.2.0 make.cross W=1 O=build_dir ARCH=m68k SHELL=/bin/bash

If you fix the issue, kindly add the following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   m68k-linux-ld: section .rodata VMA [0000000000002000,00000000001d1067] overlaps section .text VMA [0000000000000400,000000000057bf1f]
   m68k-linux-ld: drivers/md/bcache/btree.o: in function `bch_btree_gc':
>> btree.c:(.text+0x54cc): undefined reference to `__udivdi3'

-- 
0-DAY CI Kernel Test Service
https://01.org/lkp


* Re: [PATCH] bcache: dynamic incremental gc
From: kernel test robot @ 2022-04-21 19:34 UTC (permalink / raw)
  To: mingzhe.zou, colyli, linux-bcache; +Cc: kbuild-all, zoumingzhe

Hi,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v5.18-rc3 next-20220421]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting a patch, we suggest using '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/intel-lab-lkp/linux/commits/mingzhe-zou-easystack-cn/bcache-dynamic-incremental-gc/20220421-201917
base:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git b253435746d9a4a701b5f09211b9c14d3370d0da
config: powerpc-randconfig-c024-20220421 (https://download.01.org/0day-ci/archive/20220422/202204220354.7eXmJIoI-lkp@intel.com/config)
compiler: powerpc-linux-gcc (GCC) 11.2.0
reproduce (this is a W=1 build):
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/9934df989e22e2a0da9c61c9c47da9839220570e
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review mingzhe-zou-easystack-cn/bcache-dynamic-incremental-gc/20220421-201917
        git checkout 9934df989e22e2a0da9c61c9c47da9839220570e
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.2.0 make.cross W=1 O=build_dir ARCH=powerpc SHELL=/bin/bash

If you fix the issue, kindly add the following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   powerpc-linux-ld: drivers/md/bcache/btree.o: in function `bch_btree_gc':
>> drivers/md/bcache/btree.c:1870: undefined reference to `__udivdi3'


vim +1870 drivers/md/bcache/btree.c

  1832	
  1833	static void bch_btree_gc(struct cache_set *c)
  1834	{
  1835		int ret;
  1836		struct gc_stat stats;
  1837		struct closure writes;
  1838		struct btree_op op;
  1839		uint64_t sleep_time;
  1840	
  1841		trace_bcache_gc_start(c);
  1842	
  1843		memset(&stats, 0, sizeof(struct gc_stat));
  1844		closure_init_stack(&writes);
  1845		bch_btree_op_init(&op, SHRT_MAX);
  1846	
  1847		btree_gc_start(c);
  1848		stats.start_time = local_clock();
  1849	
  1850		/* if CACHE_SET_IO_DISABLE set, gc thread should stop too */
  1851		do {
  1852			stats.times++;
  1853			ret = bcache_btree_root(gc_root, c, &op, &writes, &stats);
  1854			closure_sync(&writes);
  1855			cond_resched();
  1856	
  1857			sleep_time = btree_gc_sleep_ms(c, &stats);
  1858			if (ret == -EAGAIN) {
  1859				stats.sleep_cost += sleep_time * NSEC_PER_MSEC;
  1860				schedule_timeout_interruptible(msecs_to_jiffies
  1861							       (sleep_time));
  1862			} else if (ret)
  1863				pr_warn("gc failed!\n");
  1864		} while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags));
  1865	
  1866		bch_btree_gc_finish(c);
  1867		wake_up_allocators(c);
  1868	
  1869		bch_time_stats_update(&c->btree_gc_time, stats.start_time);
> 1870		stats.average_cost = stats.gc_cost / stats.nodes;
  1871		pr_info("gc %llu times with %llu nodes, sleep %llums, "
  1872			"average gc cost %lluus per node",
  1873			(uint64_t)stats.times, (uint64_t)stats.nodes,
  1874			div_u64(stats.sleep_cost, NSEC_PER_MSEC),
  1875			div_u64(stats.average_cost, NSEC_PER_USEC));
  1876	
  1877		stats.key_bytes *= sizeof(uint64_t);
  1878		stats.data	<<= 9;
  1879		bch_update_bucket_in_use(c, &stats);
  1880		memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
  1881	
  1882		trace_bcache_gc_end(c);
  1883	
  1884		bch_moving_gc(c);
  1885	}
  1886	

-- 
0-DAY CI Kernel Test Service
https://01.org/lkp
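
Both reports point at the same new line: stats.average_cost =
stats.gc_cost / stats.nodes is a plain 64-bit division, and on 32-bit
targets (m68k, 32-bit powerpc) gcc lowers u64 division to the libgcc
helper __udivdi3, which the kernel does not link -- hence the link
error. The division needs the kernel's div_u64() helper, as the patch
already uses for its other divisions. A likely fix (an untested
sketch, not posted in this thread) that also avoids dividing by zero
when GC processed no nodes:

-	stats.average_cost = stats.gc_cost / stats.nodes;
+	if (stats.nodes)
+		stats.average_cost = div_u64(stats.gc_cost, stats.nodes);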


* Re: [PATCH] bcache: dynamic incremental gc
From: Zou Mingzhe @ 2022-04-22  5:58 UTC (permalink / raw)
  To: Coly Li; +Cc: zoumingzhe, linux-bcache


On 2022/4/21 21:28, Coly Li wrote:
> On 4/21/22 8:17 PM, mingzhe.zou@easystack.cn wrote:
>> From: ZouMingzhe <mingzhe.zou@easystack.cn>
>>
>> During GC, IO performance can be reduced by half or more.
>> According to our test data, when NVMe is used as the cache,
>> it takes about 1ms for GC to handle each node (4k block and
>> 512k bucket).
>>
>> Since GC processes at least 100 nodes each time, IOPS drops
>> by half and latency increases.
>>
>> This patch adds some cost statistics and holds the inflight
>> peak. When the IO depth reaches its maximum, GC sleeps so
>> these requests can be handled. The GC sleep time is
>> calculated dynamically based on gc_cost.
>
> Hi Mingzhe,
>
> What the problem this patch intends to solve, and what is the result 
> of the change?
>
>
> Thanks.
>
> Coly Li
>
Hi Coly,

We regularly run brief FIO tests on our storage at night. Recently we
frequently saw FIO results where IOPS decreased and latency increased.
We checked dmesg and found that bcache was always doing GC during
these abnormal results, so we triggered GC manually and confirmed it.

Currently, GC runs in no more than 100 increments, with at least 100
nodes per increment, and the maximum number of nodes per increment is
not limited. According to our test data, when NVMe is used as the
cache it takes about 1ms for GC to handle each node. This means the
latency during GC is at least 100ms.

We should balance GC against requests better, but I don't have a good
plan for all application scenarios at present, so I want to optimize
performance under high load first. At the same time, I added some
statistics, hoping they provide useful information for future work.

This patch makes GC yield immediately when inflight reaches its peak.
Because each GC increment may then process only a few nodes, the whole
GC would last a long time if it slept a constant 100ms each time. So
the sleep time should be calculated dynamically.
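
In user-space form, that gating looks roughly like the sketch below;
it mirrors btree_gc_min_nodes() from the patch (same names and
constants) but is an illustration only, not the kernel code:

#include <stddef.h>

#define MAX_GC_TIMES	100
#define MIN_GC_NODES	100
#define MAX_GC_NODES	1000

/*
 * Sketch of the gating in btree_gc_min_nodes(): how many nodes the
 * current increment may process before GC yields with -EAGAIN.
 *   inflight     - front-side requests currently in flight
 *   max_inflight - highest inflight seen during this GC pass
 *   total_nodes  - node count from the previous full pass
 *   done_nodes   - nodes already processed in this pass
 */
size_t gc_batch_nodes(size_t inflight, size_t *max_inflight,
		      size_t total_nodes, size_t done_nodes)
{
	size_t batch;

	if (inflight == 0)	/* idle: let GC run to completion */
		return (total_nodes > done_nodes ?
			total_nodes : done_nodes) + 1;
	if (inflight > *max_inflight)	/* record a new peak */
		*max_inflight = inflight;
	if (inflight >= *max_inflight)	/* at the peak: yield at once */
		return 1;

	/* below the peak: batch scales with tree size, clamped */
	batch = total_nodes / MAX_GC_TIMES;
	if (batch < MIN_GC_NODES)
		batch = MIN_GC_NODES;
	if (batch > MAX_GC_NODES)
		batch = MAX_GC_NODES;
	return batch;
}

The net effect: GC runs unthrottled while the device is idle, but
yields after a single node as soon as the request depth touches its
recorded peak.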

mingzhe
