* [PATCH] bcache: allow allocator to invalidate bucket in gc
@ 2020-09-10 11:21 Dongsheng Yang
  2020-09-10 11:28 ` [PATCH v2] " Dongsheng Yang
  0 siblings, 1 reply; 16+ messages in thread
From: Dongsheng Yang @ 2020-09-10 11:21 UTC (permalink / raw)
  To: colyli; +Cc: linux-bcache, Dongsheng Yang

Currently, when gc is running and the allocator finds that free_inc
is empty, the allocator has to wait for gc to finish. Until then, IO
is blocked.

But some buckets are already reclaimable before gc starts, and gc will
never mark this kind of bucket as unreclaimable.

So we can put these buckets into free_inc while gc is running, to
avoid blocking IO.

Signed-off-by: Dongsheng Yang <dongsheng.yang@easystack.cn>
---
 drivers/md/bcache/alloc.c  | 10 ++++------
 drivers/md/bcache/bcache.h |  1 +
 drivers/md/bcache/btree.c  | 10 +++++++++-
 3 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 52035a7..265fa05 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -130,12 +130,11 @@ static inline bool can_inc_bucket_gen(struct bucket *b)
 
 bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b)
 {
-	BUG_ON(!ca->set->gc_mark_valid);
-
-	return (!GC_MARK(b) ||
+	return ((b->reclaimable_in_gc || ca->set->gc_mark_valid) &&
+		((!GC_MARK(b) ||
 		GC_MARK(b) == GC_MARK_RECLAIMABLE) &&
 		!atomic_read(&b->pin) &&
-		can_inc_bucket_gen(b);
+		can_inc_bucket_gen(b)));
 }
 
 void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
@@ -353,8 +352,7 @@ static int bch_allocator_thread(void *arg)
 		 */
 
 retry_invalidate:
-		allocator_wait(ca, ca->set->gc_mark_valid &&
-			       !ca->invalidate_needs_gc);
+		allocator_wait(ca, !ca->invalidate_needs_gc);
 		invalidate_buckets(ca);
 
 		/*
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 4fd03d2..870f146 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -200,6 +200,7 @@ struct bucket {
 	uint8_t		gen;
 	uint8_t		last_gc; /* Most out of date gen in the btree */
 	uint16_t	gc_mark; /* Bitfield used by GC. See below for field */
+	uint16_t	reclaimable_in_gc:1;
 };
 
 /*
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 3d8bd06..d45a1dd 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1702,18 +1702,21 @@ static void btree_gc_start(struct cache_set *c)
 
 	mutex_lock(&c->bucket_lock);
 
-	c->gc_mark_valid = 0;
 	c->gc_done = ZERO_KEY;
 
 	for_each_cache(ca, c, i)
 		for_each_bucket(b, ca) {
 			b->last_gc = b->gen;
+			if (bch_can_invalidate_bucket(ca, b))
+				b->reclaimable_in_gc = 1;
+
 			if (!atomic_read(&b->pin)) {
 				SET_GC_MARK(b, 0);
 				SET_GC_SECTORS_USED(b, 0);
 			}
 		}
 
+	c->gc_mark_valid = 0;
 	mutex_unlock(&c->bucket_lock);
 }
 
@@ -1729,6 +1732,11 @@ static void bch_btree_gc_finish(struct cache_set *c)
 	c->gc_mark_valid = 1;
 	c->need_gc	= 0;
 
+	for_each_cache(ca, c, i)
+		for_each_bucket(b, ca)
+			if (b->reclaimable_in_gc)
+				b->reclaimable_in_gc = 0;
+
 	for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++)
 		SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),
 			    GC_MARK_METADATA);
-- 
1.8.3.1



* [PATCH v2] bcache: allow allocator to invalidate bucket in gc
  2020-09-10 11:21 [PATCH] bcache: allow allocator to invalidate bucket in gc Dongsheng Yang
@ 2020-09-10 11:28 ` Dongsheng Yang
  2020-09-18  9:53   ` Coly Li
  0 siblings, 1 reply; 16+ messages in thread
From: Dongsheng Yang @ 2020-09-10 11:28 UTC (permalink / raw)
  To: colyli; +Cc: linux-bcache, Dongsheng Yang

Currently, when gc is running and the allocator finds that free_inc
is empty, the allocator has to wait for gc to finish. Until then, IO
is blocked.

But some buckets are already reclaimable before gc starts, and gc will
never mark this kind of bucket as unreclaimable.

So we can put these buckets into free_inc while gc is running, to
avoid blocking IO.

Signed-off-by: Dongsheng Yang <dongsheng.yang@easystack.cn>
---
 drivers/md/bcache/alloc.c  | 11 +++++------
 drivers/md/bcache/bcache.h |  1 +
 drivers/md/bcache/btree.c  | 10 +++++++++-
 3 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index 52035a7..faa5a5d 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -130,12 +130,11 @@ static inline bool can_inc_bucket_gen(struct bucket *b)
 
 bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b)
 {
-	BUG_ON(!ca->set->gc_mark_valid);
-
-	return (!GC_MARK(b) ||
+	return ((b->reclaimable_in_gc || ca->set->gc_mark_valid) &&
+		((!GC_MARK(b) ||
 		GC_MARK(b) == GC_MARK_RECLAIMABLE) &&
 		!atomic_read(&b->pin) &&
-		can_inc_bucket_gen(b);
+		can_inc_bucket_gen(b)));
 }
 
 void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
@@ -149,6 +148,7 @@ void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
 	bch_inc_gen(ca, b);
 	b->prio = INITIAL_PRIO;
 	atomic_inc(&b->pin);
+	b->reclaimable_in_gc = 0;
 }
 
 static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
@@ -353,8 +353,7 @@ static int bch_allocator_thread(void *arg)
 		 */
 
 retry_invalidate:
-		allocator_wait(ca, ca->set->gc_mark_valid &&
-			       !ca->invalidate_needs_gc);
+		allocator_wait(ca, !ca->invalidate_needs_gc);
 		invalidate_buckets(ca);
 
 		/*
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 4fd03d2..870f146 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -200,6 +200,7 @@ struct bucket {
 	uint8_t		gen;
 	uint8_t		last_gc; /* Most out of date gen in the btree */
 	uint16_t	gc_mark; /* Bitfield used by GC. See below for field */
+	uint16_t	reclaimable_in_gc:1;
 };
 
 /*
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 3d8bd06..d45a1dd 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1702,18 +1702,21 @@ static void btree_gc_start(struct cache_set *c)
 
 	mutex_lock(&c->bucket_lock);
 
-	c->gc_mark_valid = 0;
 	c->gc_done = ZERO_KEY;
 
 	for_each_cache(ca, c, i)
 		for_each_bucket(b, ca) {
 			b->last_gc = b->gen;
+			if (bch_can_invalidate_bucket(ca, b))
+				b->reclaimable_in_gc = 1;
+
 			if (!atomic_read(&b->pin)) {
 				SET_GC_MARK(b, 0);
 				SET_GC_SECTORS_USED(b, 0);
 			}
 		}
 
+	c->gc_mark_valid = 0;
 	mutex_unlock(&c->bucket_lock);
 }
 
@@ -1729,6 +1732,11 @@ static void bch_btree_gc_finish(struct cache_set *c)
 	c->gc_mark_valid = 1;
 	c->need_gc	= 0;
 
+	for_each_cache(ca, c, i)
+		for_each_bucket(b, ca)
+			if (b->reclaimable_in_gc)
+				b->reclaimable_in_gc = 0;
+
 	for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++)
 		SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),
 			    GC_MARK_METADATA);
-- 
1.8.3.1



* Re: [PATCH v2] bcache: allow allocator to invalidate bucket in gc
  2020-09-10 11:28 ` [PATCH v2] " Dongsheng Yang
@ 2020-09-18  9:53   ` Coly Li
  2024-03-15 22:45     ` Robert Pang
  0 siblings, 1 reply; 16+ messages in thread
From: Coly Li @ 2020-09-18  9:53 UTC (permalink / raw)
  To: Dongsheng Yang; +Cc: linux-bcache

On 2020/9/10 19:28, Dongsheng Yang wrote:
> Currently, if the gc is running, when the allocator found free_inc
> is empty, allocator has to wait the gc finish. Before that, the
> IO is blocked.
> 
> But actually, there would be some buckets is reclaimable before gc,
> and gc will never mark this kind of bucket to be  unreclaimable.
> 
> So we can put these buckets into free_inc in gc running to avoid
> IO being blocked.
> 
> Signed-off-by: Dongsheng Yang <dongsheng.yang@easystack.cn>

Hi Dongsheng,

This is not a simple change :-)

Let's do more testing for this patch, and give me more time to
understand the new code path.

Thanks for the idea.

Coly Li


> ---
>  drivers/md/bcache/alloc.c  | 11 +++++------
>  drivers/md/bcache/bcache.h |  1 +
>  drivers/md/bcache/btree.c  | 10 +++++++++-
>  3 files changed, 15 insertions(+), 7 deletions(-)
> 
[snipped]



* Re: [PATCH v2] bcache: allow allocator to invalidate bucket in gc
  2020-09-18  9:53   ` Coly Li
@ 2024-03-15 22:45     ` Robert Pang
  2024-03-16  2:48       ` Coly Li
  0 siblings, 1 reply; 16+ messages in thread
From: Robert Pang @ 2024-03-15 22:45 UTC (permalink / raw)
  To: colyli; +Cc: dongsheng.yang, linux-bcache

Hi all

We found this patch via google.

We have a setup that uses bcache to cache network-attached storage on a local SSD drive. Under heavy traffic, IO on the cached device stalls every hour or so for tens of seconds. When we track the latency continuously with the "fio" utility, we can see the max IO latency shoot up when a stall happens:

latency_test: (groupid=0, jobs=1): err= 0: pid=50416: Fri Mar 15 21:14:18 2024
  read: IOPS=62.3k, BW=486MiB/s (510MB/s)(11.4GiB/24000msec)
    slat (nsec): min=1377, max=98964, avg=4567.31, stdev=1330.69
    clat (nsec): min=367, max=43682, avg=429.77, stdev=234.70
     lat (nsec): min=1866, max=105301, avg=5068.60, stdev=1383.14
    clat percentiles (nsec):
     |  1.00th=[  386],  5.00th=[  406], 10.00th=[  406], 20.00th=[  410],
     | 30.00th=[  414], 40.00th=[  414], 50.00th=[  414], 60.00th=[  418],
     | 70.00th=[  418], 80.00th=[  422], 90.00th=[  426], 95.00th=[  462],
     | 99.00th=[  652], 99.50th=[  708], 99.90th=[ 3088], 99.95th=[ 5600],
     | 99.99th=[11328]
   bw (  KiB/s): min=318192, max=627591, per=99.97%, avg=497939.04, stdev=81923.63, samples=47
   iops        : min=39774, max=78448, avg=62242.15, stdev=10240.39, samples=47
...
 
<IO stall>

latency_test: (groupid=0, jobs=1): err= 0: pid=50416: Fri Mar 15 21:21:23 2024
  read: IOPS=26.0k, BW=203MiB/s (213MB/s)(89.1GiB/448867msec)
    slat (nsec): min=958, max=40745M, avg=15596.66, stdev=13650543.09
    clat (nsec): min=364, max=104599, avg=435.81, stdev=302.81
     lat (nsec): min=1416, max=40745M, avg=16104.06, stdev=13650546.77
    clat percentiles (nsec):
     |  1.00th=[  378],  5.00th=[  390], 10.00th=[  406], 20.00th=[  410],
     | 30.00th=[  414], 40.00th=[  414], 50.00th=[  418], 60.00th=[  418],
     | 70.00th=[  418], 80.00th=[  422], 90.00th=[  426], 95.00th=[  494],
     | 99.00th=[  772], 99.50th=[  916], 99.90th=[ 3856], 99.95th=[ 5920],
     | 99.99th=[10816]
   bw (  KiB/s): min=    1, max=627591, per=100.00%, avg=244393.77, stdev=103534.74, samples=765
   iops        : min=    0, max=78448, avg=30549.06, stdev=12941.82, samples=765

When we track per-second max latency in fio, we see something like this:

<time-ms>,<max-latency-ns>,,,
...
777000, 5155548, 0, 0, 0
778000, 105551, 1, 0, 0
802615, 24276019570, 0, 0, 0
802615, 82134, 1, 0, 0
804000, 9944554, 0, 0, 0
805000, 7424638, 1, 0, 0

fio --time_based --runtime=3600s --ramp_time=2s --ioengine=libaio --name=latency_test --filename=fio --bs=8k --iodepth=1 --size=900G  --readwrite=randrw --verify=0 --filename=fio --write_lat_log=lat --log_avg_msec=1000 --log_max_value=1
 
We saw a similar issue reported in https://www.spinics.net/lists/linux-bcache/msg09578.html, which suggests an issue in garbage collection. When we trigger GC manually via "echo 1 > /sys/fs/bcache/a356bdb0-...-64f794387488/internal/trigger_gc", the stall is always reproduced. That thread points to this patch (https://www.spinics.net/lists/linux-bcache/msg08870.html), which we tested, and the stall no longer happens.

AFAIK, this patch marks buckets reclaimable at the beginning of GC to unblock the allocator so it does not need to wait for GC to finish. This periodic stall is a serious issue. Can the community look at this issue and this patch if possible?
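
For reference, a quick way to pull these stall windows out of the fio per-second latency log (the lat_lat.1.log file that --write_lat_log=lat produces; just a rough sketch, with an arbitrary 1-second threshold):

$ awk -F', ' '$2 > 1000000000 { printf "%d ms: max latency %.1f s\n", $1, $2 / 1e9 }' lat_lat.1.log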

We are running Linux kernel versions 5.10 and 6.1.

Thank you.


* Re: [PATCH v2] bcache: allow allocator to invalidate bucket in gc
  2024-03-15 22:45     ` Robert Pang
@ 2024-03-16  2:48       ` Coly Li
  2024-03-17  5:41         ` Robert Pang
  0 siblings, 1 reply; 16+ messages in thread
From: Coly Li @ 2024-03-16  2:48 UTC (permalink / raw)
  To: Robert Pang; +Cc: Dongsheng Yang, linux-bcache

Hi Robert,

Thanks for your email.

> 2024年3月16日 06:45,Robert Pang <robertpang@google.com> 写道:
> 
> Hi all
> 
> We found this patch via google.
> 
> We have a setup that uses bcache to cache a network attached storage in a local SSD drive. Under heavy traffic, IO on the cached device stalls every hour or so for tens of seconds. When we track the latency with "fio" utility continuously, we can see the max IO latency shoots up when stall happens,  
> 
> latency_test: (groupid=0, jobs=1): err= 0: pid=50416: Fri Mar 15 21:14:18 2024
>  read: IOPS=62.3k, BW=486MiB/s (510MB/s)(11.4GiB/24000msec)
>    slat (nsec): min=1377, max=98964, avg=4567.31, stdev=1330.69
>    clat (nsec): min=367, max=43682, avg=429.77, stdev=234.70
>     lat (nsec): min=1866, max=105301, avg=5068.60, stdev=1383.14
>    clat percentiles (nsec):
>     |  1.00th=[  386],  5.00th=[  406], 10.00th=[  406], 20.00th=[  410],
>     | 30.00th=[  414], 40.00th=[  414], 50.00th=[  414], 60.00th=[  418],
>     | 70.00th=[  418], 80.00th=[  422], 90.00th=[  426], 95.00th=[  462],
>     | 99.00th=[  652], 99.50th=[  708], 99.90th=[ 3088], 99.95th=[ 5600],
>     | 99.99th=[11328]
>   bw (  KiB/s): min=318192, max=627591, per=99.97%, avg=497939.04, stdev=81923.63, samples=47
>   iops        : min=39774, max=78448, avg=62242.15, stdev=10240.39, samples=47
> ...
> 
> <IO stall>
> 
> latency_test: (groupid=0, jobs=1): err= 0: pid=50416: Fri Mar 15 21:21:23 2024
>  read: IOPS=26.0k, BW=203MiB/s (213MB/s)(89.1GiB/448867msec)
>    slat (nsec): min=958, max=40745M, avg=15596.66, stdev=13650543.09
>    clat (nsec): min=364, max=104599, avg=435.81, stdev=302.81
>     lat (nsec): min=1416, max=40745M, avg=16104.06, stdev=13650546.77
>    clat percentiles (nsec):
>     |  1.00th=[  378],  5.00th=[  390], 10.00th=[  406], 20.00th=[  410],
>     | 30.00th=[  414], 40.00th=[  414], 50.00th=[  418], 60.00th=[  418],
>     | 70.00th=[  418], 80.00th=[  422], 90.00th=[  426], 95.00th=[  494],
>     | 99.00th=[  772], 99.50th=[  916], 99.90th=[ 3856], 99.95th=[ 5920],
>     | 99.99th=[10816]
>   bw (  KiB/s): min=    1, max=627591, per=100.00%, avg=244393.77, stdev=103534.74, samples=765
>   iops        : min=    0, max=78448, avg=30549.06, stdev=12941.82, samples=765
> 
> When we track per-second max latency in fio, we see something like this:
> 
> <time-ms>,<max-latency-ns>,,,
> ...
> 777000, 5155548, 0, 0, 0
> 778000, 105551, 1, 0, 0
> 802615, 24276019570, 0, 0, 0
> 802615, 82134, 1, 0, 0
> 804000, 9944554, 0, 0, 0
> 805000, 7424638, 1, 0, 0
> 
> fio --time_based --runtime=3600s --ramp_time=2s --ioengine=libaio --name=latency_test --filename=fio --bs=8k --iodepth=1 --size=900G  --readwrite=randrw --verify=0 --filename=fio --write_lat_log=lat --log_avg_msec=1000 --log_max_value=1
> 
> We saw a smiliar issue reported in https://www.spinics.net/lists/linux-bcache/msg09578.html, which suggests an issue in garbage collection. When we trigger GC manually via "echo 1 > /sys/fs/bcache/a356bdb0-...-64f794387488/internal/trigger_gc", the stall is always reproduced. That thread points to this patch (https://www.spinics.net/lists/linux-bcache/msg08870.html) that we tested and the stall no longer happens.
> 
> AFAIK, this patch marks buckets reclaimable at the beginning of GC to unblock the allocator so it does not need to wait for GC to finish. This periodic stall is a serious issue. Can the community look at this issue and this patch if possible?
> 

Could you please share more performance information about this patch? And how many nodes, and how much time, does the testing cover so far?

Last time I tested the patch, it looked fine. But I was not confident about how large a scale and for how long this patch had been tested. If you can provide more testing information, it will be helpful.
 

Coly Li


* Re: [PATCH v2] bcache: allow allocator to invalidate bucket in gc
  2024-03-16  2:48       ` Coly Li
@ 2024-03-17  5:41         ` Robert Pang
  2024-03-17 13:59           ` Coly Li
  0 siblings, 1 reply; 16+ messages in thread
From: Robert Pang @ 2024-03-17  5:41 UTC (permalink / raw)
  To: Coly Li; +Cc: Dongsheng Yang, linux-bcache

Hi Coly

Thank you for looking into this issue.

We tested this patch on 5 machines with local SSD sizes ranging from
375 GB to 9 TB, and ran tests for 10 to 12 hours each. We observed no
stalls nor other issues. Performance was comparable before and after
the patch. Hope this info will be helpful.

Yours
Robert


On Fri, Mar 15, 2024 at 7:49 PM Coly Li <colyli@suse.de> wrote:
>
> Hi Robert,
>
> Thanks for your email.
>
> > 2024年3月16日 06:45,Robert Pang <robertpang@google.com> 写道:
> >
> > Hi all
> >
> > We found this patch via google.
> >
> > We have a setup that uses bcache to cache a network attached storage in a local SSD drive. Under heavy traffic, IO on the cached device stalls every hour or so for tens of seconds. When we track the latency with "fio" utility continuously, we can see the max IO latency shoots up when stall happens,
> >
> > latency_test: (groupid=0, jobs=1): err= 0: pid=50416: Fri Mar 15 21:14:18 2024
> >  read: IOPS=62.3k, BW=486MiB/s (510MB/s)(11.4GiB/24000msec)
> >    slat (nsec): min=1377, max=98964, avg=4567.31, stdev=1330.69
> >    clat (nsec): min=367, max=43682, avg=429.77, stdev=234.70
> >     lat (nsec): min=1866, max=105301, avg=5068.60, stdev=1383.14
> >    clat percentiles (nsec):
> >     |  1.00th=[  386],  5.00th=[  406], 10.00th=[  406], 20.00th=[  410],
> >     | 30.00th=[  414], 40.00th=[  414], 50.00th=[  414], 60.00th=[  418],
> >     | 70.00th=[  418], 80.00th=[  422], 90.00th=[  426], 95.00th=[  462],
> >     | 99.00th=[  652], 99.50th=[  708], 99.90th=[ 3088], 99.95th=[ 5600],
> >     | 99.99th=[11328]
> >   bw (  KiB/s): min=318192, max=627591, per=99.97%, avg=497939.04, stdev=81923.63, samples=47
> >   iops        : min=39774, max=78448, avg=62242.15, stdev=10240.39, samples=47
> > ...
> >
> > <IO stall>
> >
> > latency_test: (groupid=0, jobs=1): err= 0: pid=50416: Fri Mar 15 21:21:23 2024
> >  read: IOPS=26.0k, BW=203MiB/s (213MB/s)(89.1GiB/448867msec)
> >    slat (nsec): min=958, max=40745M, avg=15596.66, stdev=13650543.09
> >    clat (nsec): min=364, max=104599, avg=435.81, stdev=302.81
> >     lat (nsec): min=1416, max=40745M, avg=16104.06, stdev=13650546.77
> >    clat percentiles (nsec):
> >     |  1.00th=[  378],  5.00th=[  390], 10.00th=[  406], 20.00th=[  410],
> >     | 30.00th=[  414], 40.00th=[  414], 50.00th=[  418], 60.00th=[  418],
> >     | 70.00th=[  418], 80.00th=[  422], 90.00th=[  426], 95.00th=[  494],
> >     | 99.00th=[  772], 99.50th=[  916], 99.90th=[ 3856], 99.95th=[ 5920],
> >     | 99.99th=[10816]
> >   bw (  KiB/s): min=    1, max=627591, per=100.00%, avg=244393.77, stdev=103534.74, samples=765
> >   iops        : min=    0, max=78448, avg=30549.06, stdev=12941.82, samples=765
> >
> > When we track per-second max latency in fio, we see something like this:
> >
> > <time-ms>,<max-latency-ns>,,,
> > ...
> > 777000, 5155548, 0, 0, 0
> > 778000, 105551, 1, 0, 0
> > 802615, 24276019570, 0, 0, 0
> > 802615, 82134, 1, 0, 0
> > 804000, 9944554, 0, 0, 0
> > 805000, 7424638, 1, 0, 0
> >
> > fio --time_based --runtime=3600s --ramp_time=2s --ioengine=libaio --name=latency_test --filename=fio --bs=8k --iodepth=1 --size=900G  --readwrite=randrw --verify=0 --filename=fio --write_lat_log=lat --log_avg_msec=1000 --log_max_value=1
> >
> > We saw a smiliar issue reported in https://www.spinics.net/lists/linux-bcache/msg09578.html, which suggests an issue in garbage collection. When we trigger GC manually via "echo 1 > /sys/fs/bcache/a356bdb0-...-64f794387488/internal/trigger_gc", the stall is always reproduced. That thread points to this patch (https://www.spinics.net/lists/linux-bcache/msg08870.html) that we tested and the stall no longer happens.
> >
> > AFAIK, this patch marks buckets reclaimable at the beginning of GC to unblock the allocator so it does not need to wait for GC to finish. This periodic stall is a serious issue. Can the community look at this issue and this patch if possible?
> >
>
> Could you please share more performance information of this patch? And how many nodes/how long time does the test cover so far?
>
> Last time I test the patch, it looked fine. But I was not confident how large scale and how long time this patch was tested. If you may provide more testing information, it will be helpful.
>
>
> Coly Li


* Re: [PATCH v2] bcache: allow allocator to invalidate bucket in gc
  2024-03-17  5:41         ` Robert Pang
@ 2024-03-17 13:59           ` Coly Li
  2024-03-18  6:16             ` Robert Pang
  0 siblings, 1 reply; 16+ messages in thread
From: Coly Li @ 2024-03-17 13:59 UTC (permalink / raw)
  To: Robert Pang; +Cc: Dongsheng Yang, Bcache Linux



> 2024年3月17日 13:41,Robert Pang <robertpang@google.com> 写道:
> 
> Hi Coly
> 

Hi Robert,

> Thank you for looking into this issue.
> 
> We tested this patch in 5 machines with local SSD size ranging from
> 375 GB to 9 TB, and ran tests for 10 to 12 hours each. We observed no
> stall nor other issues. Performance was comparable before and after
> the patch. Hope this info will be helpful.

Thanks for the information.

Also, I was told this patch has been deployed and shipped in easystack products for 1+ year, and it works well.

The above information makes me feel confident about this patch. I will submit it in the next merge window if some extra testing loops pass.

Coly Li


> 
> 
> On Fri, Mar 15, 2024 at 7:49 PM Coly Li <colyli@suse.de> wrote:
>> 
>> Hi Robert,
>> 
>> Thanks for your email.
>> 
>>> 2024年3月16日 06:45,Robert Pang <robertpang@google.com> 写道:
>>> 
>>> Hi all
>>> 
>>> We found this patch via google.
>>> 
>>> We have a setup that uses bcache to cache a network attached storage in a local SSD drive. Under heavy traffic, IO on the cached device stalls every hour or so for tens of seconds. When we track the latency with "fio" utility continuously, we can see the max IO latency shoots up when stall happens,
>>> 
>>> latency_test: (groupid=0, jobs=1): err= 0: pid=50416: Fri Mar 15 21:14:18 2024
>>> read: IOPS=62.3k, BW=486MiB/s (510MB/s)(11.4GiB/24000msec)
>>>   slat (nsec): min=1377, max=98964, avg=4567.31, stdev=1330.69
>>>   clat (nsec): min=367, max=43682, avg=429.77, stdev=234.70
>>>    lat (nsec): min=1866, max=105301, avg=5068.60, stdev=1383.14
>>>   clat percentiles (nsec):
>>>    |  1.00th=[  386],  5.00th=[  406], 10.00th=[  406], 20.00th=[  410],
>>>    | 30.00th=[  414], 40.00th=[  414], 50.00th=[  414], 60.00th=[  418],
>>>    | 70.00th=[  418], 80.00th=[  422], 90.00th=[  426], 95.00th=[  462],
>>>    | 99.00th=[  652], 99.50th=[  708], 99.90th=[ 3088], 99.95th=[ 5600],
>>>    | 99.99th=[11328]
>>>  bw (  KiB/s): min=318192, max=627591, per=99.97%, avg=497939.04, stdev=81923.63, samples=47
>>>  iops        : min=39774, max=78448, avg=62242.15, stdev=10240.39, samples=47
>>> ...
>>> 
>>> <IO stall>
>>> 
>>> latency_test: (groupid=0, jobs=1): err= 0: pid=50416: Fri Mar 15 21:21:23 2024
>>> read: IOPS=26.0k, BW=203MiB/s (213MB/s)(89.1GiB/448867msec)
>>>   slat (nsec): min=958, max=40745M, avg=15596.66, stdev=13650543.09
>>>   clat (nsec): min=364, max=104599, avg=435.81, stdev=302.81
>>>    lat (nsec): min=1416, max=40745M, avg=16104.06, stdev=13650546.77
>>>   clat percentiles (nsec):
>>>    |  1.00th=[  378],  5.00th=[  390], 10.00th=[  406], 20.00th=[  410],
>>>    | 30.00th=[  414], 40.00th=[  414], 50.00th=[  418], 60.00th=[  418],
>>>    | 70.00th=[  418], 80.00th=[  422], 90.00th=[  426], 95.00th=[  494],
>>>    | 99.00th=[  772], 99.50th=[  916], 99.90th=[ 3856], 99.95th=[ 5920],
>>>    | 99.99th=[10816]
>>>  bw (  KiB/s): min=    1, max=627591, per=100.00%, avg=244393.77, stdev=103534.74, samples=765
>>>  iops        : min=    0, max=78448, avg=30549.06, stdev=12941.82, samples=765
>>> 
>>> When we track per-second max latency in fio, we see something like this:
>>> 
>>> <time-ms>,<max-latency-ns>,,,
>>> ...
>>> 777000, 5155548, 0, 0, 0
>>> 778000, 105551, 1, 0, 0
>>> 802615, 24276019570, 0, 0, 0
>>> 802615, 82134, 1, 0, 0
>>> 804000, 9944554, 0, 0, 0
>>> 805000, 7424638, 1, 0, 0
>>> 
>>> fio --time_based --runtime=3600s --ramp_time=2s --ioengine=libaio --name=latency_test --filename=fio --bs=8k --iodepth=1 --size=900G  --readwrite=randrw --verify=0 --filename=fio --write_lat_log=lat --log_avg_msec=1000 --log_max_value=1
>>> 
>>> We saw a smiliar issue reported in https://www.spinics.net/lists/linux-bcache/msg09578.html, which suggests an issue in garbage collection. When we trigger GC manually via "echo 1 > /sys/fs/bcache/a356bdb0-...-64f794387488/internal/trigger_gc", the stall is always reproduced. That thread points to this patch (https://www.spinics.net/lists/linux-bcache/msg08870.html) that we tested and the stall no longer happens.
>>> 
>>> AFAIK, this patch marks buckets reclaimable at the beginning of GC to unblock the allocator so it does not need to wait for GC to finish. This periodic stall is a serious issue. Can the community look at this issue and this patch if possible?
>>> 
>> 
>> Could you please share more performance information of this patch? And how many nodes/how long time does the test cover so far?
>> 
>> Last time I test the patch, it looked fine. But I was not confident how large scale and how long time this patch was tested. If you may provide more testing information, it will be helpful.
>> 
>> 
>> Coly Li
> 



* Re: [PATCH v2] bcache: allow allocator to invalidate bucket in gc
  2024-03-17 13:59           ` Coly Li
@ 2024-03-18  6:16             ` Robert Pang
  2024-03-28 18:05               ` Robert Pang
  0 siblings, 1 reply; 16+ messages in thread
From: Robert Pang @ 2024-03-18  6:16 UTC (permalink / raw)
  To: Coly Li; +Cc: Dongsheng Yang, Bcache Linux

Hi Coly

Thank you for confirming. It looks like the 6.9 merge window just
opened last week, so we hope this patch can catch it. Please post an
update in this thread when it gets submitted.

https://lore.kernel.org/lkml/CAHk-=wiehc0DfPtL6fC2=bFuyzkTnuiuYSQrr6JTQxQao6pq1Q@mail.gmail.com/T/

BTW, speaking of testing, would you mind pointing us to the bcache test
suite? We would like to have a look and maybe give it a try as well.

Thanks
Robert

On Sun, Mar 17, 2024 at 7:00 AM Coly Li <colyli@suse.de> wrote:
>
>
>
> > 2024年3月17日 13:41,Robert Pang <robertpang@google.com> 写道:
> >
> > Hi Coly
> >
>
> Hi Robert,
>
> > Thank you for looking into this issue.
> >
> > We tested this patch in 5 machines with local SSD size ranging from
> > 375 GB to 9 TB, and ran tests for 10 to 12 hours each. We observed no
> > stall nor other issues. Performance was comparable before and after
> > the patch. Hope this info will be helpful.
>
> Thanks for the information.
>
> Also I was told this patch has been deployed and shipped for 1+ year in easystack products, works well.
>
> The above information makes me feel confident for this patch. I will submit it in next merge window if some ultra testing loop passes.
>
> Coly Li
>
>
> >
> >
> > On Fri, Mar 15, 2024 at 7:49 PM Coly Li <colyli@suse.de> wrote:
> >>
> >> Hi Robert,
> >>
> >> Thanks for your email.
> >>
> >>> 2024年3月16日 06:45,Robert Pang <robertpang@google.com> 写道:
> >>>
> >>> Hi all
> >>>
> >>> We found this patch via google.
> >>>
> >>> We have a setup that uses bcache to cache a network attached storage in a local SSD drive. Under heavy traffic, IO on the cached device stalls every hour or so for tens of seconds. When we track the latency with "fio" utility continuously, we can see the max IO latency shoots up when stall happens,
> >>>
> >>> latency_test: (groupid=0, jobs=1): err= 0: pid=50416: Fri Mar 15 21:14:18 2024
> >>> read: IOPS=62.3k, BW=486MiB/s (510MB/s)(11.4GiB/24000msec)
> >>>   slat (nsec): min=1377, max=98964, avg=4567.31, stdev=1330.69
> >>>   clat (nsec): min=367, max=43682, avg=429.77, stdev=234.70
> >>>    lat (nsec): min=1866, max=105301, avg=5068.60, stdev=1383.14
> >>>   clat percentiles (nsec):
> >>>    |  1.00th=[  386],  5.00th=[  406], 10.00th=[  406], 20.00th=[  410],
> >>>    | 30.00th=[  414], 40.00th=[  414], 50.00th=[  414], 60.00th=[  418],
> >>>    | 70.00th=[  418], 80.00th=[  422], 90.00th=[  426], 95.00th=[  462],
> >>>    | 99.00th=[  652], 99.50th=[  708], 99.90th=[ 3088], 99.95th=[ 5600],
> >>>    | 99.99th=[11328]
> >>>  bw (  KiB/s): min=318192, max=627591, per=99.97%, avg=497939.04, stdev=81923.63, samples=47
> >>>  iops        : min=39774, max=78448, avg=62242.15, stdev=10240.39, samples=47
> >>> ...
> >>>
> >>> <IO stall>
> >>>
> >>> latency_test: (groupid=0, jobs=1): err= 0: pid=50416: Fri Mar 15 21:21:23 2024
> >>> read: IOPS=26.0k, BW=203MiB/s (213MB/s)(89.1GiB/448867msec)
> >>>   slat (nsec): min=958, max=40745M, avg=15596.66, stdev=13650543.09
> >>>   clat (nsec): min=364, max=104599, avg=435.81, stdev=302.81
> >>>    lat (nsec): min=1416, max=40745M, avg=16104.06, stdev=13650546.77
> >>>   clat percentiles (nsec):
> >>>    |  1.00th=[  378],  5.00th=[  390], 10.00th=[  406], 20.00th=[  410],
> >>>    | 30.00th=[  414], 40.00th=[  414], 50.00th=[  418], 60.00th=[  418],
> >>>    | 70.00th=[  418], 80.00th=[  422], 90.00th=[  426], 95.00th=[  494],
> >>>    | 99.00th=[  772], 99.50th=[  916], 99.90th=[ 3856], 99.95th=[ 5920],
> >>>    | 99.99th=[10816]
> >>>  bw (  KiB/s): min=    1, max=627591, per=100.00%, avg=244393.77, stdev=103534.74, samples=765
> >>>  iops        : min=    0, max=78448, avg=30549.06, stdev=12941.82, samples=765
> >>>
> >>> When we track per-second max latency in fio, we see something like this:
> >>>
> >>> <time-ms>,<max-latency-ns>,,,
> >>> ...
> >>> 777000, 5155548, 0, 0, 0
> >>> 778000, 105551, 1, 0, 0
> >>> 802615, 24276019570, 0, 0, 0
> >>> 802615, 82134, 1, 0, 0
> >>> 804000, 9944554, 0, 0, 0
> >>> 805000, 7424638, 1, 0, 0
> >>>
> >>> fio --time_based --runtime=3600s --ramp_time=2s --ioengine=libaio --name=latency_test --filename=fio --bs=8k --iodepth=1 --size=900G  --readwrite=randrw --verify=0 --filename=fio --write_lat_log=lat --log_avg_msec=1000 --log_max_value=1
> >>>
> >>> We saw a smiliar issue reported in https://www.spinics.net/lists/linux-bcache/msg09578.html, which suggests an issue in garbage collection. When we trigger GC manually via "echo 1 > /sys/fs/bcache/a356bdb0-...-64f794387488/internal/trigger_gc", the stall is always reproduced. That thread points to this patch (https://www.spinics.net/lists/linux-bcache/msg08870.html) that we tested and the stall no longer happens.
> >>>
> >>> AFAIK, this patch marks buckets reclaimable at the beginning of GC to unblock the allocator so it does not need to wait for GC to finish. This periodic stall is a serious issue. Can the community look at this issue and this patch if possible?
> >>>
> >>
> >> Could you please share more performance information of this patch? And how many nodes/how long time does the test cover so far?
> >>
> >> Last time I test the patch, it looked fine. But I was not confident how large scale and how long time this patch was tested. If you may provide more testing information, it will be helpful.
> >>
> >>
> >> Coly Li
> >
>


* Re: [PATCH v2] bcache: allow allocator to invalidate bucket in gc
  2024-03-18  6:16             ` Robert Pang
@ 2024-03-28 18:05               ` Robert Pang
  2024-03-29 13:00                 ` Coly Li
  0 siblings, 1 reply; 16+ messages in thread
From: Robert Pang @ 2024-03-28 18:05 UTC (permalink / raw)
  To: Coly Li; +Cc: Dongsheng Yang, Bcache Linux

Hi bcache developers

Greetings. Any update on this patch? How are things going with the
testing and submission upstream?

Thanks
Robert


On Sun, Mar 17, 2024 at 11:16 PM Robert Pang <robertpang@google.com> wrote:
>
> Hi Coly
>
> Thank you for confirming. It looks like the 6.9 merge window just
> opened last week so we hope it can catch it. Please update in this
> thread when it gets submitted.
>
> https://lore.kernel.org/lkml/CAHk-=wiehc0DfPtL6fC2=bFuyzkTnuiuYSQrr6JTQxQao6pq1Q@mail.gmail.com/T/
>
> BTW, speaking of testing, mind if you point us to the bcache test
> suite? We would like to have a look and maybe give it a try also.
>
> Thanks
> Robert
>
> On Sun, Mar 17, 2024 at 7:00 AM Coly Li <colyli@suse.de> wrote:
> >
> >
> >
> > > 2024年3月17日 13:41,Robert Pang <robertpang@google.com> 写道:
> > >
> > > Hi Coly
> > >
> >
> > Hi Robert,
> >
> > > Thank you for looking into this issue.
> > >
> > > We tested this patch in 5 machines with local SSD size ranging from
> > > 375 GB to 9 TB, and ran tests for 10 to 12 hours each. We observed no
> > > stall nor other issues. Performance was comparable before and after
> > > the patch. Hope this info will be helpful.
> >
> > Thanks for the information.
> >
> > Also I was told this patch has been deployed and shipped for 1+ year in easystack products, works well.
> >
> > The above information makes me feel confident for this patch. I will submit it in next merge window if some ultra testing loop passes.
> >
> > Coly Li
> >
> >
> > >
> > >
> > > On Fri, Mar 15, 2024 at 7:49 PM Coly Li <colyli@suse.de> wrote:
> > >>
> > >> Hi Robert,
> > >>
> > >> Thanks for your email.
> > >>
> > >>> 2024年3月16日 06:45,Robert Pang <robertpang@google.com> 写道:
> > >>>
> > >>> Hi all
> > >>>
> > >>> We found this patch via google.
> > >>>
> > >>> We have a setup that uses bcache to cache a network attached storage in a local SSD drive. Under heavy traffic, IO on the cached device stalls every hour or so for tens of seconds. When we track the latency with "fio" utility continuously, we can see the max IO latency shoots up when stall happens,
> > >>>
> > >>> latency_test: (groupid=0, jobs=1): err= 0: pid=50416: Fri Mar 15 21:14:18 2024
> > >>> read: IOPS=62.3k, BW=486MiB/s (510MB/s)(11.4GiB/24000msec)
> > >>>   slat (nsec): min=1377, max=98964, avg=4567.31, stdev=1330.69
> > >>>   clat (nsec): min=367, max=43682, avg=429.77, stdev=234.70
> > >>>    lat (nsec): min=1866, max=105301, avg=5068.60, stdev=1383.14
> > >>>   clat percentiles (nsec):
> > >>>    |  1.00th=[  386],  5.00th=[  406], 10.00th=[  406], 20.00th=[  410],
> > >>>    | 30.00th=[  414], 40.00th=[  414], 50.00th=[  414], 60.00th=[  418],
> > >>>    | 70.00th=[  418], 80.00th=[  422], 90.00th=[  426], 95.00th=[  462],
> > >>>    | 99.00th=[  652], 99.50th=[  708], 99.90th=[ 3088], 99.95th=[ 5600],
> > >>>    | 99.99th=[11328]
> > >>>  bw (  KiB/s): min=318192, max=627591, per=99.97%, avg=497939.04, stdev=81923.63, samples=47
> > >>>  iops        : min=39774, max=78448, avg=62242.15, stdev=10240.39, samples=47
> > >>> ...
> > >>>
> > >>> <IO stall>
> > >>>
> > >>> latency_test: (groupid=0, jobs=1): err= 0: pid=50416: Fri Mar 15 21:21:23 2024
> > >>> read: IOPS=26.0k, BW=203MiB/s (213MB/s)(89.1GiB/448867msec)
> > >>>   slat (nsec): min=958, max=40745M, avg=15596.66, stdev=13650543.09
> > >>>   clat (nsec): min=364, max=104599, avg=435.81, stdev=302.81
> > >>>    lat (nsec): min=1416, max=40745M, avg=16104.06, stdev=13650546.77
> > >>>   clat percentiles (nsec):
> > >>>    |  1.00th=[  378],  5.00th=[  390], 10.00th=[  406], 20.00th=[  410],
> > >>>    | 30.00th=[  414], 40.00th=[  414], 50.00th=[  418], 60.00th=[  418],
> > >>>    | 70.00th=[  418], 80.00th=[  422], 90.00th=[  426], 95.00th=[  494],
> > >>>    | 99.00th=[  772], 99.50th=[  916], 99.90th=[ 3856], 99.95th=[ 5920],
> > >>>    | 99.99th=[10816]
> > >>>  bw (  KiB/s): min=    1, max=627591, per=100.00%, avg=244393.77, stdev=103534.74, samples=765
> > >>>  iops        : min=    0, max=78448, avg=30549.06, stdev=12941.82, samples=765
> > >>>
> > >>> When we track per-second max latency in fio, we see something like this:
> > >>>
> > >>> <time-ms>,<max-latency-ns>,,,
> > >>> ...
> > >>> 777000, 5155548, 0, 0, 0
> > >>> 778000, 105551, 1, 0, 0
> > >>> 802615, 24276019570, 0, 0, 0
> > >>> 802615, 82134, 1, 0, 0
> > >>> 804000, 9944554, 0, 0, 0
> > >>> 805000, 7424638, 1, 0, 0
> > >>>
> > >>> fio --time_based --runtime=3600s --ramp_time=2s --ioengine=libaio --name=latency_test --filename=fio --bs=8k --iodepth=1 --size=900G  --readwrite=randrw --verify=0 --filename=fio --write_lat_log=lat --log_avg_msec=1000 --log_max_value=1
> > >>>
> > >>> We saw a smiliar issue reported in https://www.spinics.net/lists/linux-bcache/msg09578.html, which suggests an issue in garbage collection. When we trigger GC manually via "echo 1 > /sys/fs/bcache/a356bdb0-...-64f794387488/internal/trigger_gc", the stall is always reproduced. That thread points to this patch (https://www.spinics.net/lists/linux-bcache/msg08870.html) that we tested and the stall no longer happens.
> > >>>
> > >>> AFAIK, this patch marks buckets reclaimable at the beginning of GC to unblock the allocator so it does not need to wait for GC to finish. This periodic stall is a serious issue. Can the community look at this issue and this patch if possible?
> > >>>
> > >>
> > >> Could you please share more performance information of this patch? And how many nodes/how long time does the test cover so far?
> > >>
> > >> Last time I test the patch, it looked fine. But I was not confident how large scale and how long time this patch was tested. If you may provide more testing information, it will be helpful.
> > >>
> > >>
> > >> Coly Li
> > >
> >


* Re: [PATCH v2] bcache: allow allocator to invalidate bucket in gc
  2024-03-28 18:05               ` Robert Pang
@ 2024-03-29 13:00                 ` Coly Li
  2024-04-11  6:44                   ` Robert Pang
  0 siblings, 1 reply; 16+ messages in thread
From: Coly Li @ 2024-03-29 13:00 UTC (permalink / raw)
  To: Robert Pang; +Cc: Dongsheng Yang, Bcache Linux



> 2024年3月29日 02:05,Robert Pang <robertpang@google.com> 写道:
> 
> Hi bcache developers
> 
> Greetings. Any update on this patch? How are things going with the
> testing and submission upstream?

Hi Peng,

As I said, it will be in the next merge window, not this one. If any help is necessary, I will ask :-)

Thanks.

Coly Li


> 
> 
> On Sun, Mar 17, 2024 at 11:16 PM Robert Pang <robertpang@google.com> wrote:
>> 
>> Hi Coly
>> 
>> Thank you for confirming. It looks like the 6.9 merge window just
>> opened last week so we hope it can catch it. Please update in this
>> thread when it gets submitted.
>> 
>> https://lore.kernel.org/lkml/CAHk-=wiehc0DfPtL6fC2=bFuyzkTnuiuYSQrr6JTQxQao6pq1Q@mail.gmail.com/T/
>> 
>> BTW, speaking of testing, mind if you point us to the bcache test
>> suite? We would like to have a look and maybe give it a try also.
>> 
>> Thanks
>> Robert
>> 
>> On Sun, Mar 17, 2024 at 7:00 AM Coly Li <colyli@suse.de> wrote:
>>> 
>>> 
>>> 
>>>> 2024年3月17日 13:41,Robert Pang <robertpang@google.com> 写道:
>>>> 
>>>> Hi Coly
>>>> 
>>> 
>>> Hi Robert,
>>> 
>>>> Thank you for looking into this issue.
>>>> 
>>>> We tested this patch in 5 machines with local SSD size ranging from
>>>> 375 GB to 9 TB, and ran tests for 10 to 12 hours each. We observed no
>>>> stall nor other issues. Performance was comparable before and after
>>>> the patch. Hope this info will be helpful.
>>> 
>>> Thanks for the information.
>>> 
>>> Also I was told this patch has been deployed and shipped for 1+ year in easystack products, works well.
>>> 
>>> The above information makes me feel confident for this patch. I will submit it in next merge window if some ultra testing loop passes.
>>> 
>>> Coly Li
>>> 
> 

[snipped]



* Re: [PATCH v2] bcache: allow allocator to invalidate bucket in gc
  2024-03-29 13:00                 ` Coly Li
@ 2024-04-11  6:44                   ` Robert Pang
  2024-05-03 18:23                     ` Coly Li
  0 siblings, 1 reply; 16+ messages in thread
From: Robert Pang @ 2024-04-11  6:44 UTC (permalink / raw)
  To: Coly Li; +Cc: Dongsheng Yang, Bcache Linux

HI Coly

Thank you for planning to submit it in the next merge window. This patch
is critical, because the long IO stall, measured in tens of seconds every
hour, is a serious issue that makes bcache unusable when it happens.
So we look forward to this patch.

Speaking of this GC issue, we gathered the bcache btree GC stats after
our fio benchmark on a 375GB SSD cache device with 256kB bucket size:

$ grep . /sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_gc_*
/sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_gc_average_duration_ms:45293
/sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_gc_average_frequency_sec:286
/sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_gc_last_sec:212
/sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_gc_max_duration_ms:61986
$ more /sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_nodes
5876
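
For anyone who wants to watch these counters evolve across GC runs, a
simple loop over the same sysfs attributes works (replace the UUID with
your own cache set's; just a rough sketch):

$ while sleep 60; do date; grep . /sys/fs/bcache/<cache-set-uuid>/internal/btree_gc_*; done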

However, fio directly on the SSD device itself shows pretty good performance:

Read IOPS 14,100 (110MiB/s)
Write IOPS 42,200 (330MiB/s)
Latency: 106.64 microseconds

Can you shed some light on why GC takes so long (avg 45 seconds) given
the SSD speed? And is there any way or setting to reduce the GC time
or lower the GC frequency?

One interesting thing we observed is that when the SSD is encrypted via
dm-crypt, the GC time is shortened by ~80%, to under 10 seconds. Is it
possible that GC writes the blocks one by one synchronously, and
dm-crypt's internal queuing and buffering mitigate the GC IO latency?

Thanks
Robert


On Fri, Mar 29, 2024 at 6:00 AM Coly Li <colyli@suse.de> wrote:
>
>
>
> > 2024年3月29日 02:05,Robert Pang <robertpang@google.com> 写道:
> >
> > Hi bcache developers
> >
> > Greetings. Any update on this patch? How are things going with the
> > testing and submission upstream?
>
> Hi Peng,
>
> As I said, it will be in next merge window, not this one. If there is help necessary, I will ask :-)
>
> Thanks.
>
> Coly Li
>
>
> >
> >
> > On Sun, Mar 17, 2024 at 11:16 PM Robert Pang <robertpang@google.com> wrote:
> >>
> >> Hi Coly
> >>
> >> Thank you for confirming. It looks like the 6.9 merge window just
> >> opened last week so we hope it can catch it. Please update in this
> >> thread when it gets submitted.
> >>
> >> https://lore.kernel.org/lkml/CAHk-=wiehc0DfPtL6fC2=bFuyzkTnuiuYSQrr6JTQxQao6pq1Q@mail.gmail.com/T/
> >>
> >> BTW, speaking of testing, mind if you point us to the bcache test
> >> suite? We would like to have a look and maybe give it a try also.
> >>
> >> Thanks
> >> Robert
> >>
> >> On Sun, Mar 17, 2024 at 7:00 AM Coly Li <colyli@suse.de> wrote:
> >>>
> >>>
> >>>
> >>>> 2024年3月17日 13:41,Robert Pang <robertpang@google.com> 写道:
> >>>>
> >>>> Hi Coly
> >>>>
> >>>
> >>> Hi Robert,
> >>>
> >>>> Thank you for looking into this issue.
> >>>>
> >>>> We tested this patch in 5 machines with local SSD size ranging from
> >>>> 375 GB to 9 TB, and ran tests for 10 to 12 hours each. We observed no
> >>>> stall nor other issues. Performance was comparable before and after
> >>>> the patch. Hope this info will be helpful.
> >>>
> >>> Thanks for the information.
> >>>
> >>> Also I was told this patch has been deployed and shipped for 1+ year in easystack products, works well.
> >>>
> >>> The above information makes me feel confident for this patch. I will submit it in next merge window if some ultra testing loop passes.
> >>>
> >>> Coly Li
> >>>
> >
>
> [snipped]
>


* Re: [PATCH v2] bcache: allow allocator to invalidate bucket in gc
  2024-04-11  6:44                   ` Robert Pang
@ 2024-05-03 18:23                     ` Coly Li
  2024-05-03 18:28                       ` Coly Li
  0 siblings, 1 reply; 16+ messages in thread
From: Coly Li @ 2024-05-03 18:23 UTC (permalink / raw)
  To: Robert Pang, Dongsheng Yang; +Cc: Bcache Linux



> 2024年4月11日 14:44,Robert Pang <robertpang@google.com> 写道:
> 
> HI Coly
> 
> Thank you for submitting it in the next merge window. This patch is
> very critical because the long IO stall measured in tens of seconds
> every hour is a serious issue making bcache unusable when it happens.
> So we look forward to this patch.
> 
> Speaking of this GC issue, we gathered the bcache btree GC stats after
> our fio benchmark on a 375GB SSD cache device with 256kB bucket size:
> 
> $ grep . /sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_gc_*
> /sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_gc_average_duration_ms:45293
> /sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_gc_average_frequency_sec:286
> /sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_gc_last_sec:212
> /sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_gc_max_duration_ms:61986
> $ more /sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_nodes
> 5876
> 
> However, fio directly on the SSD device itself shows pretty good performance:
> 
> Read IOPS 14,100 (110MiB/s)
> Write IOPS 42,200 (330MiB/s)
> Latency: 106.64 microseconds
> 
> Can you shed some light on why CG takes so long (avg 45 seconds) given
> the SSD speed? And is there any way or setting to reduce the CG time
> or lower the GC frequency?
> 
> One interesting thing we observed is when the SSD is encrypted via
> dm-crypt, the GC time is shortened ~80% to be under 10 seconds. Is it
> possible that GC writes the blocks one-by-one synchronously, and
> dm-crypt's internal queuing and buffering mitigates the GC IO latency?

Hi Robert,

Can I know in which kernel version you tested the patch?

I did a patch rebase and applied it on Linux v6.9. With a 4TB SSD as the cache device, I didn't observe an obvious performance advantage from this patch.
And occasionally I saw a bit more GC time. It might be from my rebase modification in bch_btree_gc_finish():
@@ -1769,6 +1771,11 @@ static void bch_btree_gc_finish(struct cache_set *c)
        c->gc_mark_valid = 1;
        c->need_gc      = 0;

+       ca = c->cache;
+       for_each_bucket(b, ca)
+               if (b->reclaimable_in_gc)
+                       b->reclaimable_in_gc = 0;
+
        for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++)
                SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),
                            GC_MARK_METADATA);

for_each_bucket() runs twice in bch_btree_gc_finish(). I guess it is probably not the cause of the fluctuating GC time, but iterating over all the buckets twice in this patch looks a bit uncomfortable to me.


Hi Dongsheng,

Maybe my rebase is incorrect. Could you please post a new version which applies to the latest upstream bcache code?

Thanks in advance.


Coly Li



* Re: [PATCH v2] bcache: allow allocator to invalidate bucket in gc
  2024-05-03 18:23                     ` Coly Li
@ 2024-05-03 18:28                       ` Coly Li
  2024-05-04  2:04                         ` Robert Pang
  0 siblings, 1 reply; 16+ messages in thread
From: Coly Li @ 2024-05-03 18:28 UTC (permalink / raw)
  To: Robert Pang, Dongsheng Yang; +Cc: Bcache Linux



> 2024年5月4日 02:23,Coly Li <colyli@suse.de> 写道:
> 
> 
> 
>> 2024年4月11日 14:44,Robert Pang <robertpang@google.com> 写道:
>> 
>> HI Coly
>> 
>> Thank you for submitting it in the next merge window. This patch is
>> very critical because the long IO stall measured in tens of seconds
>> every hour is a serious issue making bcache unusable when it happens.
>> So we look forward to this patch.
>> 
>> Speaking of this GC issue, we gathered the bcache btree GC stats after
>> our fio benchmark on a 375GB SSD cache device with 256kB bucket size:
>> 
>> $ grep . /sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_gc_*
>> /sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_gc_average_duration_ms:45293
>> /sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_gc_average_frequency_sec:286
>> /sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_gc_last_sec:212
>> /sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_gc_max_duration_ms:61986
>> $ more /sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_nodes
>> 5876
>> 
>> However, fio directly on the SSD device itself shows pretty good performance:
>> 
>> Read IOPS 14,100 (110MiB/s)
>> Write IOPS 42,200 (330MiB/s)
>> Latency: 106.64 microseconds
>> 
>> Can you shed some light on why CG takes so long (avg 45 seconds) given
>> the SSD speed? And is there any way or setting to reduce the CG time
>> or lower the GC frequency?
>> 
>> One interesting thing we observed is when the SSD is encrypted via
>> dm-crypt, the GC time is shortened ~80% to be under 10 seconds. Is it
>> possible that GC writes the blocks one-by-one synchronously, and
>> dm-crypt's internal queuing and buffering mitigates the GC IO latency?
> 
> Hi Robert,
> 
> Can I know In which kernel version did you test the patch?
> 

Sorry, I left out a bit of information here.

> I do a patch rebase and apply it on Linux v6.9. With a 4TB SSD as cache device, I didn’t observe obvious performance advantage of this patch.

When I said I didn't see an obvious performance advantage, the testing was on 512G of Intel Optane memory (with the pmem driver) as the cache device.


> And occasionally I a bit more GC time. It might be from my rebase modification in bch_btree_gc_finish(),

And for the above situation, it was on a 4TB NVMe SSD.


I guess maybe it was from my improper patch rebase. Once Dongsheng posts a new version for the latest upstream kernel bcache code, I will test the patch again.


Thanks.

Coly Li


* Re: [PATCH v2] bcache: allow allocator to invalidate bucket in gc
  2024-05-03 18:28                       ` Coly Li
@ 2024-05-04  2:04                         ` Robert Pang
  2024-05-04  3:08                           ` Coly Li
  0 siblings, 1 reply; 16+ messages in thread
From: Robert Pang @ 2024-05-04  2:04 UTC (permalink / raw)
  To: Coly Li; +Cc: Dongsheng Yang, Bcache Linux


[-- Attachment #1.1: Type: text/plain, Size: 5690 bytes --]

Hi Coly,

> Can I know In which kernel version did you test the patch?

I tested in both Linux kernels 5.10 and 6.1.

> I didn’t observe obvious performance advantage of this patch.

This patch doesn't improve bcache performance. Instead, it eliminates the
IO stall in bcache that happens due to bch_allocator_thread() getting
blocked and waiting on GC to finish when GC happens.

		/*
		 * We've run out of free buckets, we need to find some buckets
		 * we can invalidate. First, invalidate them in memory and add
		 * them to the free_inc list:
		 */
retry_invalidate:
		allocator_wait(ca, ca->set->gc_mark_valid &&  <--------
			       !ca->invalidate_needs_gc);
		invalidate_buckets(ca);

From what you showed, it looks like your rebase is good. As you
already noticed, the original patch was based on a 4.x kernel, so the bucket
traversal in btree.c needs to be adapted for 5.x and 6.x kernels. I
attached the patch rebased to 6.9 HEAD for your reference.

But to observe the IO stall before the patch, please test with a read-write
workload, so GC will happen often enough (a read-only or read-mostly
workload doesn't show the problem). For me, I used the "fio" utility to
generate a random read-write workload as follows.

# Pre-generate a 900GB test file
$ truncate -s 900G test

# Run random read-write workload for 1 hour
$ fio --time_based --runtime=3600s --ramp_time=2s --ioengine=libaio
--name=latency_test --filename=test --bs=8k --iodepth=1 --size=900G
 --readwrite=randrw --verify=0 --filename=fio --write_lat_log=lat
--log_avg_msec=1000 --log_max_value=1

We include the flags "--write_lat_log=lat --log_avg_msec=1000
--log_max_value=1" so fio will dump the second-by-second max latency into a
log file at the end of the test, so we can see when a stall happens and for how long:

E.g.

$ more lat_lat.1.log
(format: <time-ms>,<max-latency-ns>,,,)
...
777000, 5155548, 0, 0, 0
778000, 105551, 1, 0, 0
802615, 24276019570, 0, 0, 0 <---- stalls for 24s with no IO possible
802615, 82134, 1, 0, 0
804000, 9944554, 0, 0, 0
805000, 7424638, 1, 0, 0

I used a 375 GB local SSD (cache device) and a 1 TB network-attached
storage (backing device). In the 1-hr run, GC starts happening about 10
minutes into the run and then happens at ~ 5 minute intervals. The stall
duration ranges from a few seconds at the beginning to close to 40 seconds
towards the end. Only about 1/2 to 2/3 of the cache is used by the end.

Note that this patch doesn't shorten the GC either. Instead, it just keeps
GC from blocking the allocator thread by first sweeping the buckets and
marking reclaimable ones quickly at the beginning of GC so the allocator
can proceed while GC continues its actual job.

We are eagerly looking forward to this patch being merged in the coming
merge window, which is expected to open in a week or two.

Thanks
Robert


On Fri, May 3, 2024 at 11:28 AM Coly Li <colyli@suse.de> wrote:

>
>
> > 2024年5月4日 02:23,Coly Li <colyli@suse.de> 写道:
> >
> >
> >
> >> 2024年4月11日 14:44,Robert Pang <robertpang@google.com> 写道:
> >>
> >> HI Coly
> >>
> >> Thank you for submitting it in the next merge window. This patch is
> >> very critical because the long IO stall measured in tens of seconds
> >> every hour is a serious issue making bcache unusable when it happens.
> >> So we look forward to this patch.
> >>
> >> Speaking of this GC issue, we gathered the bcache btree GC stats after
> >> our fio benchmark on a 375GB SSD cache device with 256kB bucket size:
> >>
> >> $ grep .
> /sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_gc_*
> >>
> /sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_gc_average_duration_ms:45293
> >>
> /sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_gc_average_frequency_sec:286
> >>
> /sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_gc_last_sec:212
> >>
> /sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_gc_max_duration_ms:61986
> >> $ more
> /sys/fs/bcache/31c945a7-d96c-499b-945c-d76a1ab0beda/internal/btree_nodes
> >> 5876
> >>
> >> However, fio directly on the SSD device itself shows pretty good
> performance:
> >>
> >> Read IOPS 14,100 (110MiB/s)
> >> Write IOPS 42,200 (330MiB/s)
> >> Latency: 106.64 microseconds
> >>
> >> Can you shed some light on why CG takes so long (avg 45 seconds) given
> >> the SSD speed? And is there any way or setting to reduce the CG time
> >> or lower the GC frequency?
> >>
> >> One interesting thing we observed is when the SSD is encrypted via
> >> dm-crypt, the GC time is shortened ~80% to be under 10 seconds. Is it
> >> possible that GC writes the blocks one-by-one synchronously, and
> >> dm-crypt's internal queuing and buffering mitigates the GC IO latency?
> >
> > Hi Robert,
> >
> > Can I know In which kernel version did you test the patch?
> >
>
> Sorry I missed a bit more information here.
>
> > I do a patch rebase and apply it on Linux v6.9. With a 4TB SSD as cache
> device, I didn’t observe obvious performance advantage of this patch.
>
> When I didn’t see obvious performance advantage, the testing was on a 512G
> Intel Optane memory (with pmem driver) as cache device.
>
>
> > And occasionally I a bit more GC time. It might be from my rebase
> modification in bch_btree_gc_finish(),
>
> And for the above situation, it was on a 4TB NVMe SSD.
>
>
> I guess maybe it was from my improper patch rebase. Once Dongsheng posts a
> new version for the latest upstream kernel bcache code, I will test the
> patch again.
>
>
> Thanks.
>
> Coly Li

[-- Attachment #1.2: Type: text/html, Size: 6648 bytes --]

[-- Attachment #2: 0001-bcache-allow-allocator-to-invalidate-bucket-in-gc.patch --]
[-- Type: text/x-patch, Size: 2963 bytes --]

---
 drivers/md/bcache/alloc.c  | 11 +++++------
 drivers/md/bcache/bcache.h |  1 +
 drivers/md/bcache/btree.c  | 11 +++++++++--
 3 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index ce13c272c387..982b36d12907 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -129,12 +129,11 @@ static inline bool can_inc_bucket_gen(struct bucket *b)
 
 bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b)
 {
-	BUG_ON(!ca->set->gc_mark_valid);
-
-	return (!GC_MARK(b) ||
+	return ((b->reclaimable_in_gc || ca->set->gc_mark_valid) &&
+		((!GC_MARK(b) ||
 		GC_MARK(b) == GC_MARK_RECLAIMABLE) &&
 		!atomic_read(&b->pin) &&
-		can_inc_bucket_gen(b);
+		can_inc_bucket_gen(b)));
 }
 
 void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
@@ -148,6 +147,7 @@ void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
 	bch_inc_gen(ca, b);
 	b->prio = INITIAL_PRIO;
 	atomic_inc(&b->pin);
+	b->reclaimable_in_gc = 0;
 }
 
 static void bch_invalidate_one_bucket(struct cache *ca, struct bucket *b)
@@ -352,8 +352,7 @@ static int bch_allocator_thread(void *arg)
 		 */
 
 retry_invalidate:
-		allocator_wait(ca, ca->set->gc_mark_valid &&
-			       !ca->invalidate_needs_gc);
+		allocator_wait(ca, !ca->invalidate_needs_gc);
 		invalidate_buckets(ca);
 
 		/*
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 4e6afa89921f..1d33e40d26ea 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -200,6 +200,7 @@ struct bucket {
 	uint8_t		gen;
 	uint8_t		last_gc; /* Most out of date gen in the btree */
 	uint16_t	gc_mark; /* Bitfield used by GC. See below for field */
+	uint16_t	reclaimable_in_gc:1;
 };
 
 /*
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 196cdacce38f..ded55958782d 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1740,18 +1740,21 @@ static void btree_gc_start(struct cache_set *c)
 
 	mutex_lock(&c->bucket_lock);
 
-	c->gc_mark_valid = 0;
 	c->gc_done = ZERO_KEY;
 
 	ca = c->cache;
 	for_each_bucket(b, ca) {
 		b->last_gc = b->gen;
+		if (bch_can_invalidate_bucket(ca, b))
+			b->reclaimable_in_gc = 1;
+
 		if (!atomic_read(&b->pin)) {
 			SET_GC_MARK(b, 0);
 			SET_GC_SECTORS_USED(b, 0);
 		}
 	}
 
+	c->gc_mark_valid = 0;
 	mutex_unlock(&c->bucket_lock);
 }
 
@@ -1768,6 +1771,11 @@ static void bch_btree_gc_finish(struct cache_set *c)
 	c->gc_mark_valid = 1;
 	c->need_gc	= 0;
 
+	ca = c->cache;
+	for_each_bucket(b, ca)
+		if (b->reclaimable_in_gc)
+			b->reclaimable_in_gc = 0;
+
 	for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++)
 		SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),
 			    GC_MARK_METADATA);
@@ -1795,7 +1803,6 @@ static void bch_btree_gc_finish(struct cache_set *c)
 
 	c->avail_nbuckets = 0;
 
-	ca = c->cache;
 	ca->invalidate_needs_gc = 0;
 
 	for (k = ca->sb.d; k < ca->sb.d + ca->sb.keys; k++)
-- 

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] bcache: allow allocator to invalidate bucket in gc
  2024-05-04  2:04                         ` Robert Pang
@ 2024-05-04  3:08                           ` Coly Li
  2024-05-08  2:34                             ` Dongsheng Yang
  0 siblings, 1 reply; 16+ messages in thread
From: Coly Li @ 2024-05-04  3:08 UTC (permalink / raw)
  To: Robert Pang; +Cc: Dongsheng Yang, Bcache Linux



> On May 4, 2024 at 10:04, Robert Pang <robertpang@google.com> wrote:
> 
> Hi Coly,
> 
> > Can I know in which kernel version you tested the patch?
> 
> I tested in both Linux kernels 5.10 and 6.1.
> 
> > I didn’t observe obvious performance advantage of this patch.
> 
> This patch doesn't improve bcache performance. Instead, it eliminates the IO stall in bcache that happens due to bch_allocator_thread() getting blocked and waiting on GC to finish when GC happens.
> 
> /*
> * We've run out of free buckets, we need to find some buckets
> * we can invalidate. First, invalidate them in memory and add
> * them to the free_inc list:
> */
> retry_invalidate:
> allocator_wait(ca, ca->set->gc_mark_valid &&  <--------
>        !ca->invalidate_needs_gc);
> invalidate_buckets(ca);
> 
> From what you showed, it looks like your rebase is good. As you already noticed, the original patch was based on 4.x kernel so the bucket traversal in btree.c needs to be adapted for 5.x and 6.x kernels. I attached the patch rebased to 6.9 HEAD for your reference.
> 
> But to observe the IO stall before the patch, please test with a read-write workload so GC will happen periodically enough (read-only or read-mostly workload doesn't show the problem). For me, I used the "fio" utility to generate a random read-write workload as follows.
> 
> # Pre-generate a 900GB test file
> $ truncate -s 900G test
> 
> # Run random read-write workload for 1 hour
> $ fio --time_based --runtime=3600s --ramp_time=2s --ioengine=libaio --name=latency_test --filename=test --bs=8k --iodepth=1 --size=900G  --readwrite=randrw --verify=0 --filename=fio --write_lat_log=lat --log_avg_msec=1000 --log_max_value=1 
> 
> We include the flags "--write_lat_log=lat --log_avg_msec=1000 --log_max_value=1" so fio will dump the second-by-second max latency into a log file at the end of the test, so we can see when a stall happens and for how long:
> 

Copied. Thanks for the information. Let me try the above command lines on my local machine for a longer time.
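
In case it helps while going over the resulting latency logs, a quick throwaway scanner like the one below makes the stall windows easy to spot. This is only a sketch written for illustration: the lat_lat.1.log default file name and the 1-second threshold are assumptions, not anything fio or the patch mandates.

/*
 * Print every fio latency-log sample whose max latency exceeds a
 * threshold, so that multi-second stalls stand out.
 *
 * Build and run (example): cc -O2 -o lat_stalls lat_stalls.c
 *                          ./lat_stalls lat_lat.1.log
 */
#include <stdio.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "lat_lat.1.log";
	const double threshold_ns = 1e9;	/* flag anything slower than 1 second */
	FILE *f = fopen(path, "r");
	char line[256];

	if (!f) {
		perror(path);
		return 1;
	}

	while (fgets(line, sizeof(line), f)) {
		unsigned long long t_ms, lat_ns;

		/* fio log line: <time-ms>, <max-latency-ns>, <dir>, <bs>, <prio> */
		if (sscanf(line, "%llu, %llu", &t_ms, &lat_ns) != 2)
			continue;

		if ((double)lat_ns > threshold_ns)
			printf("stall at t=%llus: %.1fs max latency\n",
			       t_ms / 1000, lat_ns / 1e9);
	}

	fclose(f);
	return 0;
}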



> E.g.
> 
> $ more lat_lat.1.log
> (format: <time-ms>,<max-latency-ns>,,,)
> ...
> 777000, 5155548, 0, 0, 0
> 778000, 105551, 1, 0, 0
> 802615, 24276019570, 0, 0, 0 <---- stalls for 24s with no IO possible
> 802615, 82134, 1, 0, 0
> 804000, 9944554, 0, 0, 0
> 805000, 7424638, 1, 0, 0
> 
> I used a 375 GB local SSD (cache device) and a 1 TB network-attached storage (backing device). In the 1-hr run, GC starts happening about 10 minutes into the run and then happens at ~ 5 minute intervals. The stall duration ranges from a few seconds at the beginning to close to 40 seconds towards the end. Only about 1/2 to 2/3 of the cache is used by the end.
> 
> Note that this patch doesn't shorten the GC either. Instead, it just keeps GC from blocking the allocator thread by first sweeping the buckets and marking reclaimable ones quickly at the beginning of GC, so the allocator can proceed while GC continues its actual job.
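
The two-phase idea is small enough to model in plain C. The snippet below is a userspace toy written only to illustrate the flow described above, not kernel code; the toy_* names are invented for this sketch, and it merely mirrors the condition logic of the attached patch (flag the buckets that are already reclaimable during the sweep, then let the invalidation check accept flagged buckets even while gc_mark_valid is 0).

#include <stdbool.h>
#include <stdio.h>

#define TOY_NBUCKETS		8
#define GC_MARK_RECLAIMABLE	1

struct toy_bucket {
	unsigned pin;			/* >0 means in use, never reclaim */
	unsigned gc_mark;		/* 0 or GC_MARK_RECLAIMABLE in this toy */
	bool	 reclaimable_in_gc;	/* set by the sweep at GC start */
};

struct toy_cache {
	bool gc_mark_valid;
	struct toy_bucket bucket[TOY_NBUCKETS];
};

/* Mirrors the shape of the reworked bch_can_invalidate_bucket() check. */
static bool toy_can_invalidate(struct toy_cache *c, struct toy_bucket *b)
{
	return (b->reclaimable_in_gc || c->gc_mark_valid) &&
	       (b->gc_mark == 0 || b->gc_mark == GC_MARK_RECLAIMABLE) &&
	       b->pin == 0;
}

static void toy_gc_start(struct toy_cache *c)
{
	/* Sweep while the old GC marks are still valid... */
	for (int i = 0; i < TOY_NBUCKETS; i++)
		if (toy_can_invalidate(c, &c->bucket[i]))
			c->bucket[i].reclaimable_in_gc = true;

	/* ...and only then drop gc_mark_valid for the rest of the GC run. */
	c->gc_mark_valid = false;
}

static void toy_gc_finish(struct toy_cache *c)
{
	for (int i = 0; i < TOY_NBUCKETS; i++)
		c->bucket[i].reclaimable_in_gc = false;
	c->gc_mark_valid = true;
}

int main(void)
{
	struct toy_cache c = { .gc_mark_valid = true };

	toy_gc_start(&c);
	/* Allocator side: flagged buckets remain invalidatable during GC. */
	printf("during GC, bucket 0 invalidatable: %d\n",
	       toy_can_invalidate(&c, &c.bucket[0]));
	toy_gc_finish(&c);
	return 0;
}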
> 
> We are eagerly looking forward to this patch being merged in the coming merge window, which is expected to open in a week or two.

In order to avoid the no-space deadlock, normally around 10% of the space will not be allocated out. I need to look more closely into this patch.
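
To make that concrete, the hold-back works roughly like the toy below. This is only a sketch of the general idea, not the actual bcache allocator code; the toy_* names are invented here and the 10% figure is simply taken from the sentence above. The point is that normal data allocations stop at a reserve watermark, while metadata and GC writes, which are what eventually free more buckets, may still proceed, so the cache cannot wedge itself with zero free space.

#include <stdbool.h>

struct toy_alloc {
	unsigned long nbuckets;		/* total buckets in the cache */
	unsigned long free_buckets;	/* buckets currently free */
};

/* Hold roughly 10% of all buckets back for internal use. */
static unsigned long toy_reserve(const struct toy_alloc *a)
{
	return a->nbuckets / 10;
}

/* User-data writes must leave the reserve untouched... */
static bool toy_can_alloc_data(const struct toy_alloc *a)
{
	return a->free_buckets > toy_reserve(a);
}

/* ...while metadata/GC allocations may dip into it. */
static bool toy_can_alloc_metadata(const struct toy_alloc *a)
{
	return a->free_buckets > 0;
}

int main(void)
{
	struct toy_alloc a = { .nbuckets = 1000, .free_buckets = 90 };

	/* 90 free out of 1000 is below the 100-bucket reserve: data
	 * allocation must wait, but metadata can still make progress. */
	return (toy_can_alloc_data(&a) == false &&
		toy_can_alloc_metadata(&a) == true) ? 0 : 1;
}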


Dongsheng Yang,

Could you please post a new version based on the current mainline kernel code?

Thanks.

Coly Li



^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] bcache: allow allocator to invalidate bucket in gc
  2024-05-04  3:08                           ` Coly Li
@ 2024-05-08  2:34                             ` Dongsheng Yang
  0 siblings, 0 replies; 16+ messages in thread
From: Dongsheng Yang @ 2024-05-08  2:34 UTC (permalink / raw)
  To: Coly Li, Robert Pang, mingzhe.zou; +Cc: Bcache Linux



On Saturday, May 4, 2024 at 11:08 AM, Coly Li wrote:
> 
> 
>> On May 4, 2024 at 10:04, Robert Pang <robertpang@google.com> wrote:
>>
>> Hi Coly,
>>
>>> Can I know in which kernel version you tested the patch?
>>
>> I tested in both Linux kernels 5.10 and 6.1.
>>
>>> I didn’t observe obvious performance advantage of this patch.
>>
>> This patch doesn't improve bcache performance. Instead, it eliminates the IO stall in bcache that happens due to bch_allocator_thread() getting blocked and waiting on GC to finish when GC happens.
>>
>> /*
>> * We've run out of free buckets, we need to find some buckets
>> * we can invalidate. First, invalidate them in memory and add
>> * them to the free_inc list:
>> */
>> retry_invalidate:
>> allocator_wait(ca, ca->set->gc_mark_valid &&  <--------
>>         !ca->invalidate_needs_gc);
>> invalidate_buckets(ca);
>>
>>  From what you showed, it looks like your rebase is good. As you already noticed, the original patch was based on 4.x kernel so the bucket traversal in btree.c needs to be adapted for 5.x and 6.x kernels. I attached the patch rebased to 6.9 HEAD for your reference.
>>
>> But to observe the IO stall before the patch, please test with a read-write workload so GC will happen periodically enough (read-only or read-mostly workload doesn't show the problem). For me, I used the "fio" utility to generate a random read-write workload as follows.
>>
>> # Pre-generate a 900GB test file
>> $ truncate -s 900G test
>>
>> # Run random read-write workload for 1 hour
>> $ fio --time_based --runtime=3600s --ramp_time=2s --ioengine=libaio --name=latency_test --filename=test --bs=8k --iodepth=1 --size=900G  --readwrite=randrw --verify=0 --filename=fio --write_lat_log=lat --log_avg_msec=1000 --log_max_value=1
>>
>> We include the flags "--write_lat_log=lat --log_avg_msec=1000 --log_max_value=1" so fio will dump the second-by-second max latency into a log file at the end of the test, so we can see when a stall happens and for how long:
>>
> 
> Copied. Thanks for the information. Let me try the above command lines on my local machine for a longer time.
> 
> 
> 
>> E.g.
>>
>> $ more lat_lat.1.log
>> (format: <time-ms>,<max-latency-ns>,,,)
>> ...
>> 777000, 5155548, 0, 0, 0
>> 778000, 105551, 1, 0, 0
>> 802615, 24276019570, 0, 0, 0 <---- stalls for 24s with no IO possible
>> 802615, 82134, 1, 0, 0
>> 804000, 9944554, 0, 0, 0
>> 805000, 7424638, 1, 0, 0
>>
>> I used a 375 GB local SSD (cache device) and a 1 TB network-attached storage (backing device). In the 1-hr run, GC starts happening about 10 minutes into the run and then happens at ~ 5 minute intervals. The stall duration ranges from a few seconds at the beginning to close to 40 seconds towards the end. Only about 1/2 to 2/3 of the cache is used by the end.
>>
>> Note that this patch doesn't shorten the GC either. Instead, it just keeps GC from blocking the allocator thread by first sweeping the buckets and marking reclaimable ones quickly at the beginning of GC, so the allocator can proceed while GC continues its actual job.
>>
>> We are eagerly looking forward to this patch being merged in the coming merge window, which is expected to open in a week or two.
> 
> In order to avoid the no-space deadlock, normally around 10% of the space will not be allocated out. I need to look more closely into this patch.
> 
> 
> Dongsheng Yang,
> 
> Could you please post a new version based on the current mainline kernel code?

Hi Coly,
	Mingzhe will send a new version based on mainline.

Thanx
> 
> Thanks.
> 
> Coly Li
> 
> 
> 

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2024-05-08 15:32 UTC | newest]

Thread overview: 16+ messages
2020-09-10 11:21 [PATCH] bcache: allow allocator to invalidate bucket in gc Dongsheng Yang
2020-09-10 11:28 ` [PATCH v2] " Dongsheng Yang
2020-09-18  9:53   ` Coly Li
2024-03-15 22:45     ` Robert Pang
2024-03-16  2:48       ` Coly Li
2024-03-17  5:41         ` Robert Pang
2024-03-17 13:59           ` Coly Li
2024-03-18  6:16             ` Robert Pang
2024-03-28 18:05               ` Robert Pang
2024-03-29 13:00                 ` Coly Li
2024-04-11  6:44                   ` Robert Pang
2024-05-03 18:23                     ` Coly Li
2024-05-03 18:28                       ` Coly Li
2024-05-04  2:04                         ` Robert Pang
2024-05-04  3:08                           ` Coly Li
2024-05-08  2:34                             ` Dongsheng Yang
