* [PATCH v3] bcache: dynamic incremental gc
@ 2022-05-11  7:39 mingzhe.zou
  2022-05-12 13:41 ` Coly Li
  0 siblings, 1 reply; 8+ messages in thread
From: mingzhe.zou @ 2022-05-11  7:39 UTC (permalink / raw)
  To: colyli, linux-bcache; +Cc: dongsheng.yang, zoumingzhe

From: ZouMingzhe <mingzhe.zou@easystack.cn>

Currently, GC is expected to run in no more than 100 passes, with at
least 100 nodes processed per pass; the number of nodes handled in a
single pass has no upper limit.

```
static size_t btree_gc_min_nodes(struct cache_set *c)
{
        ......
        min_nodes = c->gc_stats.nodes / MAX_GC_TIMES;
        if (min_nodes < MIN_GC_NODES)
                min_nodes = MIN_GC_NODES;

        return min_nodes;
}
```

According to our test data, when an NVMe device is used as the cache,
GC takes about 1ms to handle each node (with a 4k block size and 512k
buckets). This means the latency added during GC is at least 100ms,
and I/O performance drops by half or more while GC runs.

I want to optimize IOPS and latency under high I/O pressure. This
patch tracks the inflight peak: when the I/O depth reaches that
maximum, GC processes only very few (10) nodes, then sleeps
immediately so those requests can be handled first.
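
Roughly, the per-pass node budget becomes the following (a sketch only;
the helper name here is illustrative, and the real change is the
btree_gc_min_nodes() hunk in the diff below):

```
/*
 * Sketch: pick the node batch size for one GC pass based on the
 * current front-side I/O depth.
 */
static size_t gc_batch_size_sketch(struct cache_set *c, struct gc_stat *gc)
{
	size_t inflight = atomic_read(&c->search_inflight);

	/* No front-side I/O, or an allocator is blocked on GC: don't stop. */
	if (inflight == 0 || atomic_read(&c->bucket_alloc_inflight) > 0)
		return max(c->gc_stats.nodes, gc->nodes) + 1;

	/* Remember the peak I/O depth observed during this GC. */
	if (inflight > gc->max_inflight)
		gc->max_inflight = inflight;

	/* At the peak: process only a handful of nodes, then yield. */
	if (inflight >= gc->max_inflight || inflight >= c->gc_stats.max_inflight)
		return 10;

	/* Otherwise: nodes/MAX_GC_TIMES, clamped to [MIN_GC_NODES, MAX_GC_NODES]. */
	return clamp_t(size_t, c->gc_stats.nodes / 100, 100, 1000);
}
```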

bch_bucket_alloc() may have to wait for bch_allocator_thread() to
wake it up, and bch_allocator_thread() in turn needs to wait for GC
to complete; in that case GC needs to finish quickly. So, v3 adds
bucket_alloc_inflight to struct cache_set.

```
long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait)
{
        ......
        do {
                prepare_to_wait(&ca->set->bucket_wait, &w,
                                TASK_UNINTERRUPTIBLE);

                mutex_unlock(&ca->set->bucket_lock);
                schedule();
                mutex_lock(&ca->set->bucket_lock);
        } while (!fifo_pop(&ca->free[RESERVE_NONE], r) &&
                 !fifo_pop(&ca->free[reserve], r));
        ......
}

static int bch_allocator_thread(void *arg)
{
	......
	allocator_wait(ca, bch_allocator_push(ca, bucket));
	wake_up(&ca->set->btree_cache_wait);
	wake_up(&ca->set->bucket_wait);
	......
}

static void bch_btree_gc(struct cache_set *c)
{
	......
	bch_btree_gc_finish(c);
	wake_up_allocators(c);
	......
}
```

With this patch applied, each GC pass may process only very few
nodes, so the whole GC would last a long time if it slept a fixed
100ms between passes. The sleep time is therefore calculated
dynamically based on gc_cost.
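
As a rough illustration of that calculation (a sketch using the
constants this patch introduces, MAX_GC_PERCENT=10, MIN_GC_SLEEP_MS=10
and MAX_GC_SLEEP_MS=1000; the helper name is illustrative):

```
/*
 * Sketch: keep GC at roughly MAX_GC_PERCENT (10%) of wall-clock time.
 * gc_cost_ns and sleep_cost_ns are the cumulative GC work and sleep
 * so far, in nanoseconds, as tracked in struct gc_stat by this patch.
 */
static uint64_t gc_sleep_ms_sketch(uint64_t gc_cost_ns, uint64_t sleep_cost_ns)
{
	uint64_t expect_ns = gc_cost_ns * (100 - 10) / 10;	/* 9x the GC cost */
	uint64_t sleep_ms = 0;

	if (expect_ns > sleep_cost_ns)
		sleep_ms = (expect_ns - sleep_cost_ns) / 1000000;	/* ns -> ms */

	if (sleep_ms < 10)	/* MIN_GC_SLEEP_MS */
		sleep_ms = 10;
	if (sleep_ms > 1000)	/* MAX_GC_SLEEP_MS */
		sleep_ms = 1000;

	return sleep_ms;
}
```

For example, if the GC passes so far cost 10ms of work and GC has
already slept 50ms, the target total sleep is 90ms, so the next sleep
is about 40ms; if only a few nodes were processed, it sleeps the 10ms
minimum.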

At the same time, I added some cost statistics to gc_stat, hoping to
provide useful information for future work.

Signed-off-by: Mingzhe Zou <mingzhe.zou@easystack.cn>
---
 drivers/md/bcache/alloc.c  |   2 +
 drivers/md/bcache/bcache.h |   9 ++++
 drivers/md/bcache/btree.c  | 100 ++++++++++++++++++++++++++++++++-----
 3 files changed, 98 insertions(+), 13 deletions(-)

diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 097577ae3c47..bb8b16cc2189 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -415,7 +415,9 @@ long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait)
 				TASK_UNINTERRUPTIBLE);
 
 		mutex_unlock(&ca->set->bucket_lock);
+		atomic_inc(&ca->set->bucket_alloc_inflight);
 		schedule();
+		atomic_dec(&ca->set->bucket_alloc_inflight);
 		mutex_lock(&ca->set->bucket_lock);
 	} while (!fifo_pop(&ca->free[RESERVE_NONE], r) &&
 		 !fifo_pop(&ca->free[reserve], r));
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 9ed9c955add7..a113a3bc7356 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -471,6 +471,14 @@ struct cache {
 };
 
 struct gc_stat {
+	uint64_t		gc_cost;
+	uint64_t		sleep_cost;
+	uint64_t		average_cost;
+	uint64_t		start_time;
+	uint64_t		finish_time;
+	size_t			max_inflight;
+
+	size_t			times;
 	size_t			nodes;
 	size_t			nodes_pre;
 	size_t			key_bytes;
@@ -595,6 +603,7 @@ struct cache_set {
 	 * written.
 	 */
 	atomic_t		prio_blocked;
+	atomic_t		bucket_alloc_inflight;
 	wait_queue_head_t	bucket_wait;
 
 	/*
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index ad9f16689419..bc37fac0eac4 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -88,11 +88,14 @@
  * Test module load/unload
  */
 
-#define MAX_NEED_GC		64
-#define MAX_SAVE_PRIO		72
 #define MAX_GC_TIMES		100
 #define MIN_GC_NODES		100
-#define GC_SLEEP_MS		100
+#define MAX_GC_NODES		1000
+#define MAX_GC_PERCENT		10
+#define MIN_GC_SLEEP_MS		10
+#define MAX_GC_SLEEP_MS		1000
+#define MAX_INFLIGHT_FACTOR	60
+#define MAX_INFLIGHT(b, d, p)	div_u64((b)*(100-(p)) + (d)*(p), 100)
 
 #define PTR_DIRTY_BIT		(((uint64_t) 1 << 36))
 
@@ -1542,12 +1545,65 @@ static unsigned int btree_gc_count_keys(struct btree *b)
 	return ret;
 }
 
-static size_t btree_gc_min_nodes(struct cache_set *c)
+static uint64_t btree_gc_sleep_ms(struct cache_set *c, struct gc_stat *gc)
+{
+	uint64_t now = local_clock();
+	uint64_t expect_time, sleep_time = 0;
+
+	/*
+	 * bch_bucket_alloc() waits for gc to finish
+	 */
+	if (atomic_read(&c->bucket_alloc_inflight) > 0)
+		return MIN_GC_SLEEP_MS;
+
+	/*
+	 * GC may process only very few nodes when I/O requests are very
+	 * frequent. If the sleep time were a constant 100ms each time, the
+	 * whole GC would last a long time.
+	 * I/O performance also declines if a single GC pass takes long
+	 * (e.g. 100ms of GC plus 100ms of sleep cuts IOPS in half).
+	 * So the GC sleep time should be calculated dynamically from gc_cost.
+	 */
+	gc->finish_time = time_after64(now, gc->start_time)
+				? now - gc->start_time : 0;
+	gc->gc_cost = gc->finish_time > gc->sleep_cost
+			? gc->finish_time - gc->sleep_cost : 0;
+	expect_time = div_u64(gc->gc_cost * (100 - MAX_GC_PERCENT), MAX_GC_PERCENT);
+	if (expect_time > gc->sleep_cost)
+		sleep_time = div_u64(expect_time - gc->sleep_cost, NSEC_PER_MSEC);
+
+	if (sleep_time < MIN_GC_SLEEP_MS)
+		sleep_time = MIN_GC_SLEEP_MS;
+	if (sleep_time > MAX_GC_SLEEP_MS)
+		sleep_time = MAX_GC_SLEEP_MS;
+
+	return sleep_time;
+}
+
+static size_t btree_gc_min_nodes(struct cache_set *c, struct gc_stat *gc)
 {
 	size_t min_nodes;
+	size_t inflight;
 
 	/*
-	 * Since incremental GC would stop 100ms when front
+	 * If there are no requests, or someone is waiting in
+	 * bucket_wait, GC is not throttled. Otherwise we want to
+	 * handle the growing number of I/O requests promptly and
+	 * track the inflight peak: when the I/O depth reaches that
+	 * maximum, this GC pass processes only very few (10) nodes,
+	 * then sleeps so those requests can be handled.
+	 */
+	inflight = atomic_read(&c->search_inflight);
+	if (inflight <= 0 || atomic_read(&c->bucket_alloc_inflight) > 0)
+		return max(c->gc_stats.nodes, gc->nodes) + 1;
+	if (inflight > gc->max_inflight)
+		gc->max_inflight = inflight;
+	if (inflight >= gc->max_inflight ||
+	    inflight >= c->gc_stats.max_inflight)
+		return 10;
+
+	/*
+	 * Since incremental GC now sleeps dynamically when front
 	 * side I/O comes, so when there are many btree nodes,
 	 * if GC only processes constant (100) nodes each time,
 	 * GC would last a long time, and the front side I/Os
@@ -1558,11 +1614,14 @@ static size_t btree_gc_min_nodes(struct cache_set *c)
 	 * realized by dividing GC into constant(100) times,
 	 * so when there are many btree nodes, GC can process
 	 * more nodes each time, otherwise, GC will process less
-	 * nodes each time (but no less than MIN_GC_NODES)
+	 * nodes each time (but no less than MIN_GC_NODES and
+	 * no more than MAX_GC_NODES)
 	 */
 	min_nodes = c->gc_stats.nodes / MAX_GC_TIMES;
 	if (min_nodes < MIN_GC_NODES)
 		min_nodes = MIN_GC_NODES;
+	if (min_nodes > MAX_GC_NODES)
+		min_nodes = MAX_GC_NODES;
 
 	return min_nodes;
 }
@@ -1633,8 +1692,7 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
 		memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1));
 		r->b = NULL;
 
-		if (atomic_read(&b->c->search_inflight) &&
-		    gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c)) {
+		if (gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c, gc)) {
 			gc->nodes_pre =  gc->nodes;
 			ret = -EAGAIN;
 			break;
@@ -1789,7 +1847,7 @@ static void bch_btree_gc(struct cache_set *c)
 	struct gc_stat stats;
 	struct closure writes;
 	struct btree_op op;
-	uint64_t start_time = local_clock();
+	uint64_t sleep_time;
 
 	trace_bcache_gc_start(c);
 
@@ -1798,24 +1856,40 @@ static void bch_btree_gc(struct cache_set *c)
 	bch_btree_op_init(&op, SHRT_MAX);
 
 	btree_gc_start(c);
+	stats.start_time = local_clock();
 
 	/* if CACHE_SET_IO_DISABLE set, gc thread should stop too */
 	do {
+		stats.times++;
 		ret = bcache_btree_root(gc_root, c, &op, &writes, &stats);
 		closure_sync(&writes);
 		cond_resched();
 
-		if (ret == -EAGAIN)
+		sleep_time = btree_gc_sleep_ms(c, &stats);
+		if (ret == -EAGAIN) {
+			stats.sleep_cost += sleep_time * NSEC_PER_MSEC;
 			schedule_timeout_interruptible(msecs_to_jiffies
-						       (GC_SLEEP_MS));
-		else if (ret)
+						       (sleep_time));
+		} else if (ret)
 			pr_warn("gc failed!\n");
 	} while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags));
 
 	bch_btree_gc_finish(c);
 	wake_up_allocators(c);
 
-	bch_time_stats_update(&c->btree_gc_time, start_time);
+	bch_time_stats_update(&c->btree_gc_time, stats.start_time);
+	stats.average_cost = div_u64(stats.gc_cost, stats.nodes);
+	pr_info("gc %llu times with %llu nodes, sleep %llums, "
+		"average gc cost %lluus per node, max inflight %llu",
+		(uint64_t)stats.times, (uint64_t)stats.nodes,
+		div_u64(stats.sleep_cost, NSEC_PER_MSEC),
+		div_u64(stats.average_cost, NSEC_PER_USEC),
+		(uint64_t)stats.max_inflight);
+	stats.max_inflight = MAX_INFLIGHT(c->gc_stats.max_inflight,
+					  stats.max_inflight,
+					  MAX_INFLIGHT_FACTOR);
+	pr_info("max inflight updated to %llu",
+		(uint64_t)stats.max_inflight);
 
 	stats.key_bytes *= sizeof(uint64_t);
 	stats.data	<<= 9;
-- 
2.17.1



* Re: [PATCH v3] bcache: dynamic incremental gc
  2022-05-11  7:39 [PATCH v3] bcache: dynamic incremental gc mingzhe.zou
@ 2022-05-12 13:41 ` Coly Li
  2022-05-20  8:22   ` Zou Mingzhe
  0 siblings, 1 reply; 8+ messages in thread
From: Coly Li @ 2022-05-12 13:41 UTC (permalink / raw)
  To: mingzhe.zou; +Cc: dongsheng.yang, linux-bcache, zoumingzhe

On 5/11/22 3:39 PM, mingzhe.zou@easystack.cn wrote:
> From: ZouMingzhe <mingzhe.zou@easystack.cn>
>
> Currently, GC wants no more than 100 times, with at least
> 100 nodes each time, the maximum number of nodes each time
> is not limited.
>
> ```
> static size_t btree_gc_min_nodes(struct cache_set *c)
> {
>          ......
>          min_nodes = c->gc_stats.nodes / MAX_GC_TIMES;
>          if (min_nodes < MIN_GC_NODES)
>                  min_nodes = MIN_GC_NODES;
>
>          return min_nodes;
> }
> ```
>
> According to our test data, when nvme is used as the cache,
> it takes about 1ms for GC to handle each node (block 4k and
> bucket 512k). This means that the latency during GC is at
> least 100ms. During GC, IO performance would be reduced by
> half or more.
>
> I want to optimize the IOPS and latency under high pressure.
> This patch hold the inflight peak. When IO depth up to maximum,
> GC only process very few(10) nodes, then sleep immediately and
> handle these requests.
>
> bch_bucket_alloc() maybe wait for bch_allocator_thread() to
> wake up, and and bch_allocator_thread() needs to wait for gc
> to complete, in which case gc needs to end quickly. So, add
> bucket_alloc_inflight to cache_set in v3.
>
> ```
> long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait)
> {
>          ......
>          do {
>                  prepare_to_wait(&ca->set->bucket_wait, &w,
>                                  TASK_UNINTERRUPTIBLE);
>
>                  mutex_unlock(&ca->set->bucket_lock);
>                  schedule();
>                  mutex_lock(&ca->set->bucket_lock);
>          } while (!fifo_pop(&ca->free[RESERVE_NONE], r) &&
>                   !fifo_pop(&ca->free[reserve], r));
>          ......
> }
>
> static int bch_allocator_thread(void *arg)
> {
> 	......
> 	allocator_wait(ca, bch_allocator_push(ca, bucket));
> 	wake_up(&ca->set->btree_cache_wait);
> 	wake_up(&ca->set->bucket_wait);
> 	......
> }
>
> static void bch_btree_gc(struct cache_set *c)
> {
> 	......
> 	bch_btree_gc_finish(c);
> 	wake_up_allocators(c);
> 	......
> }
> ```
>
> Apply this patch, each GC maybe only process very few nodes,
> GC would last a long time if sleep 100ms each time. So, the
> sleep time should be calculated dynamically based on gc_cost.
>
> At the same time, I added some cost statistics in gc_stat,
> hoping to provide useful information for future work.


Hi Mingzhe,

At first glance, I feel this change may delay the small GC periods
and finally result in a large GC period, which is not expected.

But it is possible that my feeling is incorrect. Do you have detailed
performance numbers for both I/O latency and GC period, so that I can
get a better understanding of this effort?

BTW, I will add this patch to my testing set and try it myself.


Thanks.


Coly Li




[snipped]




* Re: [PATCH v3] bcache: dynamic incremental gc
  2022-05-12 13:41 ` Coly Li
@ 2022-05-20  8:22   ` Zou Mingzhe
  2022-05-20 18:24     ` Eric Wheeler
  0 siblings, 1 reply; 8+ messages in thread
From: Zou Mingzhe @ 2022-05-20  8:22 UTC (permalink / raw)
  To: linux-bcache; +Cc: zoumingzhe


On 2022/5/12 21:41, Coly Li wrote:
> On 5/11/22 3:39 PM, mingzhe.zou@easystack.cn wrote:
>> From: ZouMingzhe <mingzhe.zou@easystack.cn>
>>
>> Currently, GC wants no more than 100 times, with at least
>> 100 nodes each time, the maximum number of nodes each time
>> is not limited.
>>
>> ```
>> static size_t btree_gc_min_nodes(struct cache_set *c)
>> {
>>          ......
>>          min_nodes = c->gc_stats.nodes / MAX_GC_TIMES;
>>          if (min_nodes < MIN_GC_NODES)
>>                  min_nodes = MIN_GC_NODES;
>>
>>          return min_nodes;
>> }
>> ```
>>
>> According to our test data, when nvme is used as the cache,
>> it takes about 1ms for GC to handle each node (block 4k and
>> bucket 512k). This means that the latency during GC is at
>> least 100ms. During GC, IO performance would be reduced by
>> half or more.
>>
>> I want to optimize the IOPS and latency under high pressure.
>> This patch hold the inflight peak. When IO depth up to maximum,
>> GC only process very few(10) nodes, then sleep immediately and
>> handle these requests.
>>
>> bch_bucket_alloc() maybe wait for bch_allocator_thread() to
>> wake up, and and bch_allocator_thread() needs to wait for gc
>> to complete, in which case gc needs to end quickly. So, add
>> bucket_alloc_inflight to cache_set in v3.
>>
>> ```
>> long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait)
>> {
>>          ......
>>          do {
>> prepare_to_wait(&ca->set->bucket_wait, &w,
>>                                  TASK_UNINTERRUPTIBLE);
>>
>>                  mutex_unlock(&ca->set->bucket_lock);
>>                  schedule();
>>                  mutex_lock(&ca->set->bucket_lock);
>>          } while (!fifo_pop(&ca->free[RESERVE_NONE], r) &&
>>                   !fifo_pop(&ca->free[reserve], r));
>>          ......
>> }
>>
>> static int bch_allocator_thread(void *arg)
>> {
>>     ......
>>     allocator_wait(ca, bch_allocator_push(ca, bucket));
>>     wake_up(&ca->set->btree_cache_wait);
>>     wake_up(&ca->set->bucket_wait);
>>     ......
>> }
>>
>> static void bch_btree_gc(struct cache_set *c)
>> {
>>     ......
>>     bch_btree_gc_finish(c);
>>     wake_up_allocators(c);
>>     ......
>> }
>> ```
>>
>> Apply this patch, each GC maybe only process very few nodes,
>> GC would last a long time if sleep 100ms each time. So, the
>> sleep time should be calculated dynamically based on gc_cost.
>>
>> At the same time, I added some cost statistics in gc_stat,
>> hoping to provide useful information for future work.
>
>
> Hi Mingzhe,
>
> From the first glance, I feel this change may delay the small GC 
> period, and finally result a large GC period, which is not expected.
>
> But it is possible that my feeling is incorrect. Do you have detailed 
> performance number about both I/O latency  and GC period, then I can 
> have more understanding for this effort.
>
> BTW, I will add this patch to my testing set and experience myself.
>
>
> Thanks.
>
>
> Coly Li
>
>
Hi Coly,

First, your feeling is right. I have some performance numbers from
before and after this patch.
Since the mailing list does not accept attachments, I put them on a gist.

Please visit the page for details:
https://gist.github.com/zoumingzhe/69a353e7c6fffe43142c2f42b94a67b5
mingzhe
>
>
> [snipped]
>
>
>


* Re: [PATCH v3] bcache: dynamic incremental gc
  2022-05-20  8:22   ` Zou Mingzhe
@ 2022-05-20 18:24     ` Eric Wheeler
  2022-05-23  2:52       ` Zou Mingzhe
  0 siblings, 1 reply; 8+ messages in thread
From: Eric Wheeler @ 2022-05-20 18:24 UTC (permalink / raw)
  To: Zou Mingzhe; +Cc: linux-bcache, zoumingzhe

[-- Attachment #1: Type: text/plain, Size: 4922 bytes --]

On Fri, 20 May 2022, Zou Mingzhe wrote:
> On 2022/5/12 21:41, Coly Li wrote:
> > On 5/11/22 3:39 PM, mingzhe.zou@easystack.cn wrote:
> >> From: ZouMingzhe <mingzhe.zou@easystack.cn>
> >>
> >> Currently, GC wants no more than 100 times, with at least
> >> 100 nodes each time, the maximum number of nodes each time
> >> is not limited.
> >>
> >> ```
> >> static size_t btree_gc_min_nodes(struct cache_set *c)
> >> {
> >>          ......
> >>          min_nodes = c->gc_stats.nodes / MAX_GC_TIMES;
> >>          if (min_nodes < MIN_GC_NODES)
> >>                  min_nodes = MIN_GC_NODES;
> >>
> >>          return min_nodes;
> >> }
> >> ```
> >>
> >> According to our test data, when nvme is used as the cache,
> >> it takes about 1ms for GC to handle each node (block 4k and
> >> bucket 512k). This means that the latency during GC is at
> >> least 100ms. During GC, IO performance would be reduced by
> >> half or more.
> >>
> >> I want to optimize the IOPS and latency under high pressure.
> >> This patch hold the inflight peak. When IO depth up to maximum,
> >> GC only process very few(10) nodes, then sleep immediately and
> >> handle these requests.
> >>
> >> bch_bucket_alloc() maybe wait for bch_allocator_thread() to
> >> wake up, and and bch_allocator_thread() needs to wait for gc
> >> to complete, in which case gc needs to end quickly. So, add
> >> bucket_alloc_inflight to cache_set in v3.
> >>
> >> ```
> >> long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait)
> >> {
> >>          ......
> >>          do {
> >> prepare_to_wait(&ca->set->bucket_wait, &w,
> >>                                  TASK_UNINTERRUPTIBLE);
> >>
> >>                  mutex_unlock(&ca->set->bucket_lock);
> >>                  schedule();
> >>                  mutex_lock(&ca->set->bucket_lock);
> >>          } while (!fifo_pop(&ca->free[RESERVE_NONE], r) &&
> >>                   !fifo_pop(&ca->free[reserve], r));
> >>          ......
> >> }
> >>
> >> static int bch_allocator_thread(void *arg)
> >> {
> >>     ......
> >>     allocator_wait(ca, bch_allocator_push(ca, bucket));
> >>     wake_up(&ca->set->btree_cache_wait);
> >>     wake_up(&ca->set->bucket_wait);
> >>     ......
> >> }
> >>
> >> static void bch_btree_gc(struct cache_set *c)
> >> {
> >>     ......
> >>     bch_btree_gc_finish(c);
> >>     wake_up_allocators(c);
> >>     ......
> >> }
> >> ```
> >>
> >> Apply this patch, each GC maybe only process very few nodes,
> >> GC would last a long time if sleep 100ms each time. So, the
> >> sleep time should be calculated dynamically based on gc_cost.
> >>
> >> At the same time, I added some cost statistics in gc_stat,
> >> hoping to provide useful information for future work.
> >
> >
> > Hi Mingzhe,
> >
> > From the first glance, I feel this change may delay the small GC period, and
> > finally result a large GC period, which is not expected.
> >
> > But it is possible that my feeling is incorrect. Do you have detailed
> > performance number about both I/O latency  and GC period, then I can have
> > more understanding for this effort.
> >
> > BTW, I will add this patch to my testing set and experience myself.
> >
> >
> > Thanks.
> >
> >
> > Coly Li
> >
> >
> Hi Coly,
> 
> First, your feeling is right. Then, I have some performance number abort
> before and after this patch.
> Since the mailing list does not accept attachments, I put them on the gist.
> 
> Please visit the page for details:
> https://gist.github.com/zoumingzhe/69a353e7c6fffe43142c2f42b94a67b5
> mingzhe

The graphs certainly show that peak latency is much lower, which is an
improvement, and dmesg shows that avail_nbuckets stays about the same, so
GC is keeping up.

Questions: 

1. Why is the after-"BW NO GC" graph so much flatter than the before-"BW 
   NO GC" graph?  I would expect your control measurements to be about the 
   same before vs after.  You might `blkdiscard` the cachedev and 
   re-format between runs in case the FTL is getting in the way, or maybe 
   something in the patch is affecting the "NO GC" graphs.

2. I wonder how the latency looks if you zoom into to the latency graph: 
   If you truncate the before-"LATENCY DO GC" graph at 3000 us then how 
   does the average latency look between the two?

3. This may be solved if you can fix the control graph issue in #1, but 
   the before vs after of "BW DO GC" shows about a 30% decrease in 
   bandwidth performance outside of the GC spikes.  "IOPS DO GC" is lower 
   with the patch too.  Do you think that your dynamic incremental gc 
   algorithm can be tuned to deal with GC latency and still provide nearly 
   the same IOPS and bandwidth as before?


--
Eric Wheeler



> >
> >
> > [snipped]
> >
> >
> >
> 
> 


* Re: [PATCH v3] bcache: dynamic incremental gc
  2022-05-20 18:24     ` Eric Wheeler
@ 2022-05-23  2:52       ` Zou Mingzhe
  2022-05-23 12:54         ` Zou Mingzhe
  0 siblings, 1 reply; 8+ messages in thread
From: Zou Mingzhe @ 2022-05-23  2:52 UTC (permalink / raw)
  To: linux-bcache; +Cc: zoumingzhe


On 2022/5/21 02:24, Eric Wheeler wrote:
> On Fri, 20 May 2022, Zou Mingzhe wrote:
>> On 2022/5/12 21:41, Coly Li wrote:
>>> On 5/11/22 3:39 PM, mingzhe.zou@easystack.cn wrote:
>>> Hi Mingzhe,
>>>
>>>  From the first glance, I feel this change may delay the small GC period, and
>>> finally result a large GC period, which is not expected.
>>>
>>> But it is possible that my feeling is incorrect. Do you have detailed
>>> performance number about both I/O latency  and GC period, then I can have
>>> more understanding for this effort.
>>>
>>> BTW, I will add this patch to my testing set and experience myself.
>>>
>>>
>>> Thanks.
>>>
>>>
>>> Coly Li
>>>
>>>
>> Hi Coly,
>>
>> First, your feeling is right. Then, I have some performance number abort
>> before and after this patch.
>> Since the mailing list does not accept attachments, I put them on the gist.
>>
>> Please visit the page for details:
>> https://gist.github.com/zoumingzhe/69a353e7c6fffe43142c2f42b94a67b5
>> mingzhe
> The graphs certainly show that peak latency is much lower, that is
> improvement, and dmesg shows the avail_nbuckets stays about the same so GC
> is keeping up.
>
> Questions:
>
> 1. Why is the after-"BW NO GC" graph so much flatter than the before-"BW
>     NO GC" graph?  I would expect your control measurements to be about the
>     same before vs after.  You might `blkdiscard` the cachedev and
>     re-format between runs in case the FTL is getting in the way, or maybe
>     something in the patch is affecting the "NO GC" graphs.
Hi Eric,

First, I re-format the disk with make-bcache before each fio run. As for
the after-"BW NO GC" graph being much flatter than the before-"BW NO GC"
graph, I think you may have seen the effect of another patch (bcache:
allow allocator invalidate bucket in gc) that I pushed. I also noticed a
drop in IOPS of at least 20%, but we added a lot of patches between before
and after, so it will take some time to figure out which patch caused it.
>
> 2. I wonder how the latency looks if you zoom into to the latency graph:
>     If you truncate the before-"LATENCY DO GC" graph at 3000 us then how
>     does the average latency look between the two?
I will test the performance of each patch one by one, and provide more
detailed graphs and numbers later.

mingzhe
>
> 3. This may be solved if you can fix the control graph issue in #1, but
>     the before vs after of "BW DO GC" shows about a 30% decrease in
>     bandwidth performance outside of the GC spikes.  "IOPS DO GC" is lower
>     with the patch too.  Do you think that your dynamic incremental gc
>     algorithm be tuned to deal with GC latency and still provide nearly the
>     same IOPS and bandwidth as before?
>
>
> --
> Eric Wheeler
>
>
>
>>>
>>> [snipped]
>>>
>>>
>>>


* Re: [PATCH v3] bcache: dynamic incremental gc
  2022-05-23  2:52       ` Zou Mingzhe
@ 2022-05-23 12:54         ` Zou Mingzhe
  2022-05-23 17:55           ` Eric Wheeler
  0 siblings, 1 reply; 8+ messages in thread
From: Zou Mingzhe @ 2022-05-23 12:54 UTC (permalink / raw)
  To: Eric Wheeler; +Cc: linux-bcache, zoumingzhe


On 2022/5/23 10:52, Zou Mingzhe wrote:
> On 2022/5/21 02:24, Eric Wheeler wrote:
>> On Fri, 20 May 2022, Zou Mingzhe wrote:
>>
>> Questions:
>>
>> 1. Why is the after-"BW NO GC" graph so much flatter than the before-"BW
>>     NO GC" graph?  I would expect your control measurements to be 
>> about the
>>     same before vs after.  You might `blkdiscard` the cachedev and
>>     re-format between runs in case the FTL is getting in the way, or 
>> maybe
>>     something in the patch is affecting the "NO GC" graphs.
>> 2. I wonder how the latency looks if you zoom into to the latency graph:
>>     If you truncate the before-"LATENCY DO GC" graph at 3000 us then how
>>     does the average latency look between the two?
>> 3. This may be solved if you can fix the control graph issue in #1, but
>>     the before vs after of "BW DO GC" shows about a 30% decrease in
>>     bandwidth performance outside of the GC spikes.  "IOPS DO GC" is 
>> lower
>>     with the patch too.  Do you think that your dynamic incremental gc
>>     algorithm be tuned to deal with GC latency and still provide 
>> nearly the
>>     same IOPS and bandwidth as before?
>>
>>
>> -- 
>> Eric Wheeler

Hi Eric,

I have done a round of retesting and updated all the data at
"https://gist.github.com/zoumingzhe/69a353e7c6fffe43142c2f42b94a67b5".

First, there is only this patch between before and after, and I re-format
the disk with make-bcache before each fio run. Each case was tested 5
times, and the results (IOPS) are as follows:

             before                      after
      NO GC        DO GC        NO GC        DO GC
1     99162.29     97366.28     99970.89     98290.81
2     99897.80     97879.99     96829.14     95548.88
3     98183.49     98834.29    101508.06     98581.53
4     98563.17     98476.61     96866.40     96676.87
5     97059.91     98356.50     96248.10     94442.61

Some details are also shown in the new graph, in addition to the raw 
data available for download.

I confirm that this patch does not cause a drop in iops. We have some 
other patches that may have affected the previous test, but this patch 
works fine.

In fact, we mostly modified the gc handling.

mingzhe









* Re: [PATCH v3] bcache: dynamic incremental gc
  2022-05-23 12:54         ` Zou Mingzhe
@ 2022-05-23 17:55           ` Eric Wheeler
  2022-05-24  2:47             ` Zou Mingzhe
  0 siblings, 1 reply; 8+ messages in thread
From: Eric Wheeler @ 2022-05-23 17:55 UTC (permalink / raw)
  To: Zou Mingzhe; +Cc: linux-bcache, zoumingzhe

[-- Attachment #1: Type: text/plain, Size: 2715 bytes --]

On Mon, 23 May 2022, Zou Mingzhe wrote:
> On 2022/5/23 10:52, Zou Mingzhe wrote:
> > On 2022/5/21 02:24, Eric Wheeler wrote:
> >> On Fri, 20 May 2022, Zou Mingzhe wrote:
> >>
> >> Questions:
> >>
> >> 1. Why is the after-"BW NO GC" graph so much flatter than the before-"BW
> >>     NO GC" graph?  I would expect your control measurements to be about the
> >>     same before vs after.  You might `blkdiscard` the cachedev and
> >>     re-format between runs in case the FTL is getting in the way, or maybe
> >>     something in the patch is affecting the "NO GC" graphs.
> >> 2. I wonder how the latency looks if you zoom into to the latency graph:
> >>     If you truncate the before-"LATENCY DO GC" graph at 3000 us then how
> >>     does the average latency look between the two?
> >> 3. This may be solved if you can fix the control graph issue in #1, but
> >>     the before vs after of "BW DO GC" shows about a 30% decrease in
> >>     bandwidth performance outside of the GC spikes.  "IOPS DO GC" is lower
> >>     with the patch too.  Do you think that your dynamic incremental gc
> >>     algorithm be tuned to deal with GC latency and still provide nearly the
> >>     same IOPS and bandwidth as before?
> >>
> >>
> >> -- 
> >> Eric Wheeler
> 
> Hi Eric,
> 
> I have done a retest round and update all data on
> "https://gist.github.com/zoumingzhe/69a353e7c6fffe43142c2f42b94a67b5".
> 
> First, there is only this patch between before and after, I re-format the disk
> with make-bcache before each fio. Each case was tested 5 times, and the
> results are as follows:
> 
>                     before after
>       NO GC           DO GC          NO GC          DO GC
> 1    99162.29     97366.28     99970.89     98290.81
> 2    99897.80     97879.99     96829.14     95548.88
> 3    98183.49     98834.29     101508.06   98581.53
> 4    98563.17     98476.61     96866.40     96676.87
> 5    97059.91     98356.50     96248.10     94442.61
> 
> Some details are also shown in the new graph, in addition to the raw data
> available for download.
> 
> I confirm that this patch does not cause a drop in iops. We have some other
> patches that may have affected the previous test, but this patch works fine.

Looks great, glad to see that it performs well!

What is the 3000us spike on the "after" line of "LATENCY DO GC" at about 
t=40s? There is a drop in IOPS and BW on the other graphs at t=40s, too, 
but I don't see the same behavior on the "NO GC" side.

-Eric

> 
> 
> In fact, we mostly modified the gc handling.
> 
> mingzhe
> 
> 
> 
> 
> 
> 
> 
> 
> 


* Re: [PATCH v3] bcache: dynamic incremental gc
  2022-05-23 17:55           ` Eric Wheeler
@ 2022-05-24  2:47             ` Zou Mingzhe
  0 siblings, 0 replies; 8+ messages in thread
From: Zou Mingzhe @ 2022-05-24  2:47 UTC (permalink / raw)
  To: Eric Wheeler; +Cc: linux-bcache, zoumingzhe


On 2022/5/24 01:55, Eric Wheeler wrote:
> On Mon, 23 May 2022, Zou Mingzhe wrote:
>> On 2022/5/23 10:52, Zou Mingzhe wrote:
>>> On 2022/5/21 02:24, Eric Wheeler wrote:
>>>> On Fri, 20 May 2022, Zou Mingzhe wrote:
>>>>
>>>> Questions:
>>>>
>>>> 1. Why is the after-"BW NO GC" graph so much flatter than the before-"BW
>>>>      NO GC" graph?  I would expect your control measurements to be about the
>>>>      same before vs after.  You might `blkdiscard` the cachedev and
>>>>      re-format between runs in case the FTL is getting in the way, or maybe
>>>>      something in the patch is affecting the "NO GC" graphs.
>>>> 2. I wonder how the latency looks if you zoom into to the latency graph:
>>>>      If you truncate the before-"LATENCY DO GC" graph at 3000 us then how
>>>>      does the average latency look between the two?
>>>> 3. This may be solved if you can fix the control graph issue in #1, but
>>>>      the before vs after of "BW DO GC" shows about a 30% decrease in
>>>>      bandwidth performance outside of the GC spikes.  "IOPS DO GC" is lower
>>>>      with the patch too.  Do you think that your dynamic incremental gc
>>>>      algorithm be tuned to deal with GC latency and still provide nearly the
>>>>      same IOPS and bandwidth as before?
>>>>
>>>>
>>>> -- 
>>>> Eric Wheeler
>> Hi Eric,
>>
>> I have done a retest round and update all data on
>> "https://gist.github.com/zoumingzhe/69a353e7c6fffe43142c2f42b94a67b5".
>>
>> First, there is only this patch between before and after, I re-format the disk
>> with make-bcache before each fio. Each case was tested 5 times, and the
>> results are as follows:
>>
>>                      before after
>>        NO GC           DO GC          NO GC          DO GC
>> 1    99162.29     97366.28     99970.89     98290.81
>> 2    99897.80     97879.99     96829.14     95548.88
>> 3    98183.49     98834.29     101508.06   98581.53
>> 4    98563.17     98476.61     96866.40     96676.87
>> 5    97059.91     98356.50     96248.10     94442.61
>>
>> Some details are also shown in the new graph, in addition to the raw data
>> available for download.
>>
>> I confirm that this patch does not cause a drop in iops. We have some other
>> patches that may have affected the previous test, but this patch works fine.
> Looks great, glad to see that it performs well!
>
> What is the 3000us spike on the "after" line of "LATENCY DO GC" at about
> t=40s ? There is a drop in IOPS and BW on the other graphs at t=40s, too,
> but I don't see the same behavior on the "NO GC" side.
>
> -Eric
>
Hi, Eric

I tested each case 5 times and used the last result to make this graph;
you can check the raw data.

The first 4 runs also have graphs in the raw data, and you can find some
spikes over 2000us on the "NO GC" side there as well.

So, I don't think the 3000us "spike" at 40s is a problem; it just looks
like normal system fluctuation.

mingzhe
>> In fact, we mostly modified the gc handling.
>>
>> mingzhe
>>
>>
>>
>>
>>
>>
>>
>>

