* [PATCH] bcache: dynamic incremental gc
@ 2022-04-21 12:17 mingzhe.zou
2022-04-21 13:28 ` Coly Li
` (2 more replies)
0 siblings, 3 replies; 5+ messages in thread
From: mingzhe.zou @ 2022-04-21 12:17 UTC (permalink / raw)
To: colyli, linux-bcache; +Cc: zoumingzhe
From: ZouMingzhe <mingzhe.zou@easystack.cn>
During GC, IO performance would be reduced by half or more.
According to our test data, when nvme is used as the cache,
it takes about 1ms for GC to handle each node (block 4k and
bucket 512k).
So, GC processes at least 100 nodes each time, resulting in
IOPS decreasing by half and latency increasing.
This patch adds some cost statistics and holds the inflight peak.
When IO depth reaches its maximum, GC sleeps and handles these requests.
The GC sleep time is dynamically calculated based on gc_cost.
Signed-off-by: Mingzhe Zou <mingzhe.zou@easystack.cn>
---
drivers/md/bcache/bcache.h | 8 ++++
drivers/md/bcache/btree.c | 83 ++++++++++++++++++++++++++++++++------
2 files changed, 78 insertions(+), 13 deletions(-)
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 9ed9c955add7..065a1137db68 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -471,6 +471,14 @@ struct cache {
};
struct gc_stat {
+ uint64_t gc_cost;
+ uint64_t sleep_cost;
+ uint64_t average_cost;
+ uint64_t start_time;
+ uint64_t finish_time;
+ size_t max_inflight;
+
+ size_t times;
size_t nodes;
size_t nodes_pre;
size_t key_bytes;
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index f5f2718e03e5..fc721f216eb7 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -88,11 +88,12 @@
* Test module load/unload
*/
-#define MAX_NEED_GC 64
-#define MAX_SAVE_PRIO 72
#define MAX_GC_TIMES 100
#define MIN_GC_NODES 100
-#define GC_SLEEP_MS 100
+#define MAX_GC_NODES 1000
+#define MAX_GC_PERCENT 10
+#define MIN_GC_SLEEP_MS 10
+#define MAX_GC_SLEEP_MS 1000
#define PTR_DIRTY_BIT (((uint64_t) 1 << 36))
@@ -1542,12 +1543,56 @@ static unsigned int btree_gc_count_keys(struct btree *b)
return ret;
}
-static size_t btree_gc_min_nodes(struct cache_set *c)
+static uint64_t btree_gc_sleep_ms(struct cache_set *c, struct gc_stat *gc)
+{
+ uint64_t now = local_clock();
+ uint64_t expect_time, sleep_time = 0;
+
+ /*
+ * GC maybe process very few nodes when IO requests are very frequent.
+ * If the sleep time is constant (100ms) each time, whole GC would last
+ * a long time.
+ * The IO performance also decline if a single GC takes a long time
+ * (such as single GC 100ms and sleep 100ms, IOPS is only half).
+ * So GC sleep time should be calculated dynamically based on gc_cost.
+ */
+ gc->finish_time = time_after64(now, gc->start_time)
+ ? now - gc->start_time : 0;
+ gc->gc_cost = gc->finish_time > gc->sleep_cost
+ ? gc->finish_time - gc->sleep_cost : 0;
+ expect_time = div_u64(gc->gc_cost * (100 - MAX_GC_PERCENT), MAX_GC_PERCENT);
+ if (expect_time > gc->sleep_cost)
+ sleep_time = div_u64(expect_time - gc->sleep_cost, NSEC_PER_MSEC);
+
+ if (sleep_time < MIN_GC_SLEEP_MS)
+ sleep_time = MIN_GC_SLEEP_MS;
+ if (sleep_time > MAX_GC_SLEEP_MS)
+ sleep_time = MAX_GC_SLEEP_MS;
+
+ return sleep_time;
+}
+
+static size_t btree_gc_min_nodes(struct cache_set *c, struct gc_stat *gc)
{
size_t min_nodes;
+ size_t inflight;
+
+ /*
+ * If there are no requests, the GC is not stopped. Also, we hope to
+ * process the increasing number of IO requests immediately and hold
+ * the inflight peak. When IO depth up to maximum, gc sleep and handle
+ * these requests.
+ */
+ inflight = atomic_read(&c->search_inflight);
+ if (inflight <= 0)
+ return max(c->gc_stats.nodes, gc->nodes) + 1;
+ if (inflight > gc->max_inflight)
+ gc->max_inflight = inflight;
+ if (inflight >= gc->max_inflight)
+ return 1;
/*
- * Since incremental GC would stop 100ms when front
+ * Since incremental GC would dynamic sleep when front
* side I/O comes, so when there are many btree nodes,
* if GC only processes constant (100) nodes each time,
* GC would last a long time, and the front side I/Os
@@ -1558,11 +1603,14 @@ static size_t btree_gc_min_nodes(struct cache_set *c)
* realized by dividing GC into constant(100) times,
* so when there are many btree nodes, GC can process
* more nodes each time, otherwise, GC will process less
- * nodes each time (but no less than MIN_GC_NODES)
+ * nodes each time (but no less than MIN_GC_NODES and
+ * no more than MAX_GC_NODES)
*/
min_nodes = c->gc_stats.nodes / MAX_GC_TIMES;
if (min_nodes < MIN_GC_NODES)
min_nodes = MIN_GC_NODES;
+ if (min_nodes > MAX_GC_NODES)
+ min_nodes = MAX_GC_NODES;
return min_nodes;
}
@@ -1633,8 +1681,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1));
r->b = NULL;
- if (atomic_read(&b->c->search_inflight) &&
- gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c)) {
+ if (gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c, gc)) {
gc->nodes_pre = gc->nodes;
ret = -EAGAIN;
break;
@@ -1789,7 +1836,7 @@ static void bch_btree_gc(struct cache_set *c)
struct gc_stat stats;
struct closure writes;
struct btree_op op;
- uint64_t start_time = local_clock();
+ uint64_t sleep_time;
trace_bcache_gc_start(c);
@@ -1798,24 +1845,34 @@ static void bch_btree_gc(struct cache_set *c)
bch_btree_op_init(&op, SHRT_MAX);
btree_gc_start(c);
+ stats.start_time = local_clock();
/* if CACHE_SET_IO_DISABLE set, gc thread should stop too */
do {
+ stats.times++;
ret = bcache_btree_root(gc_root, c, &op, &writes, &stats);
closure_sync(&writes);
cond_resched();
- if (ret == -EAGAIN)
+ sleep_time = btree_gc_sleep_ms(c, &stats);
+ if (ret == -EAGAIN) {
+ stats.sleep_cost += sleep_time * NSEC_PER_MSEC;
schedule_timeout_interruptible(msecs_to_jiffies
- (GC_SLEEP_MS));
- else if (ret)
+ (sleep_time));
+ } else if (ret)
pr_warn("gc failed!\n");
} while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags));
bch_btree_gc_finish(c);
wake_up_allocators(c);
- bch_time_stats_update(&c->btree_gc_time, start_time);
+ bch_time_stats_update(&c->btree_gc_time, stats.start_time);
+ stats.average_cost = stats.gc_cost / stats.nodes;
+ pr_info("gc %llu times with %llu nodes, sleep %llums, "
+ "average gc cost %lluus per node",
+ (uint64_t)stats.times, (uint64_t)stats.nodes,
+ div_u64(stats.sleep_cost, NSEC_PER_MSEC),
+ div_u64(stats.average_cost, NSEC_PER_USEC));
stats.key_bytes *= sizeof(uint64_t);
stats.data <<= 9;
--
2.17.1
^ permalink raw reply related [flat|nested] 5+ messages in thread
* Re: [PATCH] bcache: dynamic incremental gc
2022-04-21 12:17 [PATCH] bcache: dynamic incremental gc mingzhe.zou
@ 2022-04-21 13:28 ` Coly Li
2022-04-22 5:58 ` Zou Mingzhe
2022-04-21 18:12 ` kernel test robot
2022-04-21 19:34 ` kernel test robot
2 siblings, 1 reply; 5+ messages in thread
From: Coly Li @ 2022-04-21 13:28 UTC (permalink / raw)
To: mingzhe.zou; +Cc: zoumingzhe, linux-bcache
On 4/21/22 8:17 PM, mingzhe.zou@easystack.cn wrote:
> From: ZouMingzhe <mingzhe.zou@easystack.cn>
>
> During GC, IO performance would be reduced by half or more.
> According to our test data, when nvme is used as the cache,
> it takes about 1ms for GC to handle each node (block 4k and
> bucket 512k).
>
> So, GC process at least 100 nodes each time, resulting in
> IOPS decreasing by half and latency increasing.
>
> This patch add some cost statistics and hold the inflight peak.
> When IO depth up to maximum, gc sleep and handle these requests.
> GC sleep time dynamically calculate based on gc_cost.
Hi Mingzhe,
What is the problem this patch intends to solve, and what is the result of
the change?
Thanks.
Coly Li
> Signed-off-by: Mingzhe Zou <mingzhe.zou@easystack.cn>
> ---
> drivers/md/bcache/bcache.h | 8 ++++
> drivers/md/bcache/btree.c | 83 ++++++++++++++++++++++++++++++++------
> 2 files changed, 78 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
> index 9ed9c955add7..065a1137db68 100644
> --- a/drivers/md/bcache/bcache.h
> +++ b/drivers/md/bcache/bcache.h
> @@ -471,6 +471,14 @@ struct cache {
> };
>
> struct gc_stat {
> + uint64_t gc_cost;
> + uint64_t sleep_cost;
> + uint64_t average_cost;
> + uint64_t start_time;
> + uint64_t finish_time;
> + size_t max_inflight;
> +
> + size_t times;
> size_t nodes;
> size_t nodes_pre;
> size_t key_bytes;
> diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
> index f5f2718e03e5..fc721f216eb7 100644
> --- a/drivers/md/bcache/btree.c
> +++ b/drivers/md/bcache/btree.c
> @@ -88,11 +88,12 @@
> * Test module load/unload
> */
>
> -#define MAX_NEED_GC 64
> -#define MAX_SAVE_PRIO 72
> #define MAX_GC_TIMES 100
> #define MIN_GC_NODES 100
> -#define GC_SLEEP_MS 100
> +#define MAX_GC_NODES 1000
> +#define MAX_GC_PERCENT 10
> +#define MIN_GC_SLEEP_MS 10
> +#define MAX_GC_SLEEP_MS 1000
>
> #define PTR_DIRTY_BIT (((uint64_t) 1 << 36))
>
> @@ -1542,12 +1543,56 @@ static unsigned int btree_gc_count_keys(struct btree *b)
> return ret;
> }
>
> -static size_t btree_gc_min_nodes(struct cache_set *c)
> +static uint64_t btree_gc_sleep_ms(struct cache_set *c, struct gc_stat *gc)
> +{
> + uint64_t now = local_clock();
> + uint64_t expect_time, sleep_time = 0;
> +
> + /*
> + * GC maybe process very few nodes when IO requests are very frequent.
> + * If the sleep time is constant (100ms) each time, whole GC would last
> + * a long time.
> + * The IO performance also decline if a single GC takes a long time
> + * (such as single GC 100ms and sleep 100ms, IOPS is only half).
> + * So GC sleep time should be calculated dynamically based on gc_cost.
> + */
> + gc->finish_time = time_after64(now, gc->start_time)
> + ? now - gc->start_time : 0;
> + gc->gc_cost = gc->finish_time > gc->sleep_cost
> + ? gc->finish_time - gc->sleep_cost : 0;
> + expect_time = div_u64(gc->gc_cost * (100 - MAX_GC_PERCENT), MAX_GC_PERCENT);
> + if (expect_time > gc->sleep_cost)
> + sleep_time = div_u64(expect_time - gc->sleep_cost, NSEC_PER_MSEC);
> +
> + if (sleep_time < MIN_GC_SLEEP_MS)
> + sleep_time = MIN_GC_SLEEP_MS;
> + if (sleep_time > MAX_GC_SLEEP_MS)
> + sleep_time = MAX_GC_SLEEP_MS;
> +
> + return sleep_time;
> +}
> +
> +static size_t btree_gc_min_nodes(struct cache_set *c, struct gc_stat *gc)
> {
> size_t min_nodes;
> + size_t inflight;
> +
> + /*
> + * If there are no requests, the GC is not stopped. Also, we hope to
> + * process the increasing number of IO requests immediately and hold
> + * the inflight peak. When IO depth up to maximum, gc sleep and handle
> + * these requests.
> + */
> + inflight = atomic_read(&c->search_inflight);
> + if (inflight <= 0)
> + return max(c->gc_stats.nodes, gc->nodes) + 1;
> + if (inflight > gc->max_inflight)
> + gc->max_inflight = inflight;
> + if (inflight >= gc->max_inflight)
> + return 1;
>
> /*
> - * Since incremental GC would stop 100ms when front
> + * Since incremental GC would dynamic sleep when front
> * side I/O comes, so when there are many btree nodes,
> * if GC only processes constant (100) nodes each time,
> * GC would last a long time, and the front side I/Os
> @@ -1558,11 +1603,14 @@ static size_t btree_gc_min_nodes(struct cache_set *c)
> * realized by dividing GC into constant(100) times,
> * so when there are many btree nodes, GC can process
> * more nodes each time, otherwise, GC will process less
> - * nodes each time (but no less than MIN_GC_NODES)
> + * nodes each time (but no less than MIN_GC_NODES and
> + * no more than MAX_GC_NODES)
> */
> min_nodes = c->gc_stats.nodes / MAX_GC_TIMES;
> if (min_nodes < MIN_GC_NODES)
> min_nodes = MIN_GC_NODES;
> + if (min_nodes > MAX_GC_NODES)
> + min_nodes = MAX_GC_NODES;
>
> return min_nodes;
> }
> @@ -1633,8 +1681,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
> memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1));
> r->b = NULL;
>
> - if (atomic_read(&b->c->search_inflight) &&
> - gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c)) {
> + if (gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c, gc)) {
> gc->nodes_pre = gc->nodes;
> ret = -EAGAIN;
> break;
> @@ -1789,7 +1836,7 @@ static void bch_btree_gc(struct cache_set *c)
> struct gc_stat stats;
> struct closure writes;
> struct btree_op op;
> - uint64_t start_time = local_clock();
> + uint64_t sleep_time;
>
> trace_bcache_gc_start(c);
>
> @@ -1798,24 +1845,34 @@ static void bch_btree_gc(struct cache_set *c)
> bch_btree_op_init(&op, SHRT_MAX);
>
> btree_gc_start(c);
> + stats.start_time = local_clock();
>
> /* if CACHE_SET_IO_DISABLE set, gc thread should stop too */
> do {
> + stats.times++;
> ret = bcache_btree_root(gc_root, c, &op, &writes, &stats);
> closure_sync(&writes);
> cond_resched();
>
> - if (ret == -EAGAIN)
> + sleep_time = btree_gc_sleep_ms(c, &stats);
> + if (ret == -EAGAIN) {
> + stats.sleep_cost += sleep_time * NSEC_PER_MSEC;
> schedule_timeout_interruptible(msecs_to_jiffies
> - (GC_SLEEP_MS));
> - else if (ret)
> + (sleep_time));
> + } else if (ret)
> pr_warn("gc failed!\n");
> } while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags));
>
> bch_btree_gc_finish(c);
> wake_up_allocators(c);
>
> - bch_time_stats_update(&c->btree_gc_time, start_time);
> + bch_time_stats_update(&c->btree_gc_time, stats.start_time);
> + stats.average_cost = stats.gc_cost / stats.nodes;
> + pr_info("gc %llu times with %llu nodes, sleep %llums, "
> + "average gc cost %lluus per node",
> + (uint64_t)stats.times, (uint64_t)stats.nodes,
> + div_u64(stats.sleep_cost, NSEC_PER_MSEC),
> + div_u64(stats.average_cost, NSEC_PER_USEC));
>
> stats.key_bytes *= sizeof(uint64_t);
> stats.data <<= 9;
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH] bcache: dynamic incremental gc
2022-04-21 12:17 [PATCH] bcache: dynamic incremental gc mingzhe.zou
2022-04-21 13:28 ` Coly Li
@ 2022-04-21 18:12 ` kernel test robot
2022-04-21 19:34 ` kernel test robot
2 siblings, 0 replies; 5+ messages in thread
From: kernel test robot @ 2022-04-21 18:12 UTC (permalink / raw)
To: mingzhe.zou, colyli, linux-bcache; +Cc: kbuild-all, zoumingzhe
Hi,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on linus/master]
[also build test ERROR on v5.18-rc3 next-20220421]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]
url: https://github.com/intel-lab-lkp/linux/commits/mingzhe-zou-easystack-cn/bcache-dynamic-incremental-gc/20220421-201917
base: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git b253435746d9a4a701b5f09211b9c14d3370d0da
config: m68k-randconfig-r025-20220421 (https://download.01.org/0day-ci/archive/20220422/202204220253.PJykKQfz-lkp@intel.com/config)
compiler: m68k-linux-gcc (GCC) 11.2.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/intel-lab-lkp/linux/commit/9934df989e22e2a0da9c61c9c47da9839220570e
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review mingzhe-zou-easystack-cn/bcache-dynamic-incremental-gc/20220421-201917
git checkout 9934df989e22e2a0da9c61c9c47da9839220570e
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.2.0 make.cross W=1 O=build_dir ARCH=m68k SHELL=/bin/bash
If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>
All errors (new ones prefixed by >>):
m68k-linux-ld: section .rodata VMA [0000000000002000,00000000001d1067] overlaps section .text VMA [0000000000000400,000000000057bf1f]
m68k-linux-ld: drivers/md/bcache/btree.o: in function `bch_btree_gc':
>> btree.c:(.text+0x54cc): undefined reference to `__udivdi3'
--
0-DAY CI Kernel Test Service
https://01.org/lkp
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH] bcache: dynamic incremental gc
2022-04-21 12:17 [PATCH] bcache: dynamic incremental gc mingzhe.zou
2022-04-21 13:28 ` Coly Li
2022-04-21 18:12 ` kernel test robot
@ 2022-04-21 19:34 ` kernel test robot
2 siblings, 0 replies; 5+ messages in thread
From: kernel test robot @ 2022-04-21 19:34 UTC (permalink / raw)
To: mingzhe.zou, colyli, linux-bcache; +Cc: kbuild-all, zoumingzhe
Hi,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on linus/master]
[also build test ERROR on v5.18-rc3 next-20220421]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]
url: https://github.com/intel-lab-lkp/linux/commits/mingzhe-zou-easystack-cn/bcache-dynamic-incremental-gc/20220421-201917
base: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git b253435746d9a4a701b5f09211b9c14d3370d0da
config: powerpc-randconfig-c024-20220421 (https://download.01.org/0day-ci/archive/20220422/202204220354.7eXmJIoI-lkp@intel.com/config)
compiler: powerpc-linux-gcc (GCC) 11.2.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/intel-lab-lkp/linux/commit/9934df989e22e2a0da9c61c9c47da9839220570e
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review mingzhe-zou-easystack-cn/bcache-dynamic-incremental-gc/20220421-201917
git checkout 9934df989e22e2a0da9c61c9c47da9839220570e
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.2.0 make.cross W=1 O=build_dir ARCH=powerpc SHELL=/bin/bash
If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>
All errors (new ones prefixed by >>):
powerpc-linux-ld: drivers/md/bcache/btree.o: in function `bch_btree_gc':
>> drivers/md/bcache/btree.c:1870: undefined reference to `__udivdi3'
vim +1870 drivers/md/bcache/btree.c
1832
1833 static void bch_btree_gc(struct cache_set *c)
1834 {
1835 int ret;
1836 struct gc_stat stats;
1837 struct closure writes;
1838 struct btree_op op;
1839 uint64_t sleep_time;
1840
1841 trace_bcache_gc_start(c);
1842
1843 memset(&stats, 0, sizeof(struct gc_stat));
1844 closure_init_stack(&writes);
1845 bch_btree_op_init(&op, SHRT_MAX);
1846
1847 btree_gc_start(c);
1848 stats.start_time = local_clock();
1849
1850 /* if CACHE_SET_IO_DISABLE set, gc thread should stop too */
1851 do {
1852 stats.times++;
1853 ret = bcache_btree_root(gc_root, c, &op, &writes, &stats);
1854 closure_sync(&writes);
1855 cond_resched();
1856
1857 sleep_time = btree_gc_sleep_ms(c, &stats);
1858 if (ret == -EAGAIN) {
1859 stats.sleep_cost += sleep_time * NSEC_PER_MSEC;
1860 schedule_timeout_interruptible(msecs_to_jiffies
1861 (sleep_time));
1862 } else if (ret)
1863 pr_warn("gc failed!\n");
1864 } while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags));
1865
1866 bch_btree_gc_finish(c);
1867 wake_up_allocators(c);
1868
1869 bch_time_stats_update(&c->btree_gc_time, stats.start_time);
> 1870 stats.average_cost = stats.gc_cost / stats.nodes;
1871 pr_info("gc %llu times with %llu nodes, sleep %llums, "
1872 "average gc cost %lluus per node",
1873 (uint64_t)stats.times, (uint64_t)stats.nodes,
1874 div_u64(stats.sleep_cost, NSEC_PER_MSEC),
1875 div_u64(stats.average_cost, NSEC_PER_USEC));
1876
1877 stats.key_bytes *= sizeof(uint64_t);
1878 stats.data <<= 9;
1879 bch_update_bucket_in_use(c, &stats);
1880 memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
1881
1882 trace_bcache_gc_end(c);
1883
1884 bch_moving_gc(c);
1885 }
1886
--
0-DAY CI Kernel Test Service
https://01.org/lkp
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH] bcache: dynamic incremental gc
2022-04-21 13:28 ` Coly Li
@ 2022-04-22 5:58 ` Zou Mingzhe
0 siblings, 0 replies; 5+ messages in thread
From: Zou Mingzhe @ 2022-04-22 5:58 UTC (permalink / raw)
To: Coly Li; +Cc: zoumingzhe, linux-bcache
在 2022/4/21 21:28, Coly Li 写道:
> On 4/21/22 8:17 PM, mingzhe.zou@easystack.cn wrote:
>> From: ZouMingzhe <mingzhe.zou@easystack.cn>
>>
>> During GC, IO performance would be reduced by half or more.
>> According to our test data, when nvme is used as the cache,
>> it takes about 1ms for GC to handle each node (block 4k and
>> bucket 512k).
>>
>> So, GC process at least 100 nodes each time, resulting in
>> IOPS decreasing by half and latency increasing.
>>
>> This patch add some cost statistics and hold the inflight peak.
>> When IO depth up to maximum, gc sleep and handle these requests.
>> GC sleep time dynamically calculate based on gc_cost.
>
> Hi Mingzhe,
>
> What the problem this patch intends to solve, and what is the result
> of the change?
>
>
> Thanks.
>
> Coly Li
>
Hi Coly
We regularly use FIO to test our storage performance briefly at night.
Recently, we frequently found abnormal FIO test results where IOPS decreased
and latency increased. We checked dmesg and found that bcache was always doing
GC during these abnormal results. So we manually triggered GC and confirmed it.
Currently, GC runs no more than 100 times, with at least 100 nodes processed
each time; the maximum number of nodes per run is not limited. According to
our test data, when nvme is used as the cache, it takes about 1ms for GC
to handle each node. This means that the latency during GC is at least
100ms.
We should better balance GC and requests, but I don't have a good plan
for all application scenarios at present. So I want to optimize the
performance under high load first. At the same time, I added some
statistical items, hoping to provide useful information for future work.
This patch will cause the GC to stop immediately when inflight reaches its
peak. Because each GC pass may only process very few nodes, GC would last a
long time if it slept 100ms each time. So the sleep time should be
calculated dynamically.
mingzhe
>
>> Signed-off-by: Mingzhe Zou <mingzhe.zou@easystack.cn>
>> ---
>> drivers/md/bcache/bcache.h | 8 ++++
>> drivers/md/bcache/btree.c | 83 ++++++++++++++++++++++++++++++++------
>> 2 files changed, 78 insertions(+), 13 deletions(-)
>>
>> diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
>> index 9ed9c955add7..065a1137db68 100644
>> --- a/drivers/md/bcache/bcache.h
>> +++ b/drivers/md/bcache/bcache.h
>> @@ -471,6 +471,14 @@ struct cache {
>> };
>> struct gc_stat {
>> + uint64_t gc_cost;
>> + uint64_t sleep_cost;
>> + uint64_t average_cost;
>> + uint64_t start_time;
>> + uint64_t finish_time;
>> + size_t max_inflight;
>> +
>> + size_t times;
>> size_t nodes;
>> size_t nodes_pre;
>> size_t key_bytes;
>> diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
>> index f5f2718e03e5..fc721f216eb7 100644
>> --- a/drivers/md/bcache/btree.c
>> +++ b/drivers/md/bcache/btree.c
>> @@ -88,11 +88,12 @@
>> * Test module load/unload
>> */
>> -#define MAX_NEED_GC 64
>> -#define MAX_SAVE_PRIO 72
>> #define MAX_GC_TIMES 100
>> #define MIN_GC_NODES 100
>> -#define GC_SLEEP_MS 100
>> +#define MAX_GC_NODES 1000
>> +#define MAX_GC_PERCENT 10
>> +#define MIN_GC_SLEEP_MS 10
>> +#define MAX_GC_SLEEP_MS 1000
>> #define PTR_DIRTY_BIT (((uint64_t) 1 << 36))
>> @@ -1542,12 +1543,56 @@ static unsigned int
>> btree_gc_count_keys(struct btree *b)
>> return ret;
>> }
>> -static size_t btree_gc_min_nodes(struct cache_set *c)
>> +static uint64_t btree_gc_sleep_ms(struct cache_set *c, struct
>> gc_stat *gc)
>> +{
>> + uint64_t now = local_clock();
>> + uint64_t expect_time, sleep_time = 0;
>> +
>> + /*
>> + * GC maybe process very few nodes when IO requests are very
>> frequent.
>> + * If the sleep time is constant (100ms) each time, whole GC
>> would last
>> + * a long time.
>> + * The IO performance also decline if a single GC takes a long time
>> + * (such as single GC 100ms and sleep 100ms, IOPS is only half).
>> + * So GC sleep time should be calculated dynamically based on
>> gc_cost.
>> + */
>> + gc->finish_time = time_after64(now, gc->start_time)
>> + ? now - gc->start_time : 0;
>> + gc->gc_cost = gc->finish_time > gc->sleep_cost
>> + ? gc->finish_time - gc->sleep_cost : 0;
>> + expect_time = div_u64(gc->gc_cost * (100 - MAX_GC_PERCENT),
>> MAX_GC_PERCENT);
>> + if (expect_time > gc->sleep_cost)
>> + sleep_time = div_u64(expect_time - gc->sleep_cost,
>> NSEC_PER_MSEC);
>> +
>> + if (sleep_time < MIN_GC_SLEEP_MS)
>> + sleep_time = MIN_GC_SLEEP_MS;
>> + if (sleep_time > MAX_GC_SLEEP_MS)
>> + sleep_time = MAX_GC_SLEEP_MS;
>> +
>> + return sleep_time;
>> +}
>> +
>> +static size_t btree_gc_min_nodes(struct cache_set *c, struct gc_stat
>> *gc)
>> {
>> size_t min_nodes;
>> + size_t inflight;
>> +
>> + /*
>> + * If there are no requests, the GC is not stopped. Also, we
>> hope to
>> + * process the increasing number of IO requests immediately and
>> hold
>> + * the inflight peak. When IO depth up to maximum, gc sleep and
>> handle
>> + * these requests.
>> + */
>> + inflight = atomic_read(&c->search_inflight);
>> + if (inflight <= 0)
>> + return max(c->gc_stats.nodes, gc->nodes) + 1;
>> + if (inflight > gc->max_inflight)
>> + gc->max_inflight = inflight;
>> + if (inflight >= gc->max_inflight)
>> + return 1;
>> /*
>> - * Since incremental GC would stop 100ms when front
>> + * Since incremental GC would dynamic sleep when front
>> * side I/O comes, so when there are many btree nodes,
>> * if GC only processes constant (100) nodes each time,
>> * GC would last a long time, and the front side I/Os
>> @@ -1558,11 +1603,14 @@ static size_t btree_gc_min_nodes(struct
>> cache_set *c)
>> * realized by dividing GC into constant(100) times,
>> * so when there are many btree nodes, GC can process
>> * more nodes each time, otherwise, GC will process less
>> - * nodes each time (but no less than MIN_GC_NODES)
>> + * nodes each time (but no less than MIN_GC_NODES and
>> + * no more than MAX_GC_NODES)
>> */
>> min_nodes = c->gc_stats.nodes / MAX_GC_TIMES;
>> if (min_nodes < MIN_GC_NODES)
>> min_nodes = MIN_GC_NODES;
>> + if (min_nodes > MAX_GC_NODES)
>> + min_nodes = MAX_GC_NODES;
>> return min_nodes;
>> }
>> @@ -1633,8 +1681,7 @@ static int btree_gc_recurse(struct btree *b,
>> struct btree_op *op,
>> memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1));
>> r->b = NULL;
>> - if (atomic_read(&b->c->search_inflight) &&
>> - gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c)) {
>> + if (gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c,
>> gc)) {
>> gc->nodes_pre = gc->nodes;
>> ret = -EAGAIN;
>> break;
>> @@ -1789,7 +1836,7 @@ static void bch_btree_gc(struct cache_set *c)
>> struct gc_stat stats;
>> struct closure writes;
>> struct btree_op op;
>> - uint64_t start_time = local_clock();
>> + uint64_t sleep_time;
>> trace_bcache_gc_start(c);
>> @@ -1798,24 +1845,34 @@ static void bch_btree_gc(struct cache_set *c)
>> bch_btree_op_init(&op, SHRT_MAX);
>> btree_gc_start(c);
>> + stats.start_time = local_clock();
>> /* if CACHE_SET_IO_DISABLE set, gc thread should stop too */
>> do {
>> + stats.times++;
>> ret = bcache_btree_root(gc_root, c, &op, &writes, &stats);
>> closure_sync(&writes);
>> cond_resched();
>> - if (ret == -EAGAIN)
>> + sleep_time = btree_gc_sleep_ms(c, &stats);
>> + if (ret == -EAGAIN) {
>> + stats.sleep_cost += sleep_time * NSEC_PER_MSEC;
>> schedule_timeout_interruptible(msecs_to_jiffies
>> - (GC_SLEEP_MS));
>> - else if (ret)
>> + (sleep_time));
>> + } else if (ret)
>> pr_warn("gc failed!\n");
>> } while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags));
>> bch_btree_gc_finish(c);
>> wake_up_allocators(c);
>> - bch_time_stats_update(&c->btree_gc_time, start_time);
>> + bch_time_stats_update(&c->btree_gc_time, stats.start_time);
>> + stats.average_cost = stats.gc_cost / stats.nodes;
>> + pr_info("gc %llu times with %llu nodes, sleep %llums, "
>> + "average gc cost %lluus per node",
>> + (uint64_t)stats.times, (uint64_t)stats.nodes,
>> + div_u64(stats.sleep_cost, NSEC_PER_MSEC),
>> + div_u64(stats.average_cost, NSEC_PER_USEC));
>> stats.key_bytes *= sizeof(uint64_t);
>> stats.data <<= 9;
>
>
>
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2022-04-22 6:03 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-04-21 12:17 [PATCH] bcache: dynamic incremental gc mingzhe.zou
2022-04-21 13:28 ` Coly Li
2022-04-22 5:58 ` Zou Mingzhe
2022-04-21 18:12 ` kernel test robot
2022-04-21 19:34 ` kernel test robot
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.