* [PATCH v2 0/4] bcache: incremental GC and dirty data init
From: tang.junhui @ 2018-04-16  6:33 UTC (permalink / raw)
  To: kent.overstreet, colyli, mlyle; +Cc: linux-bcache, linux-block, tang.junhui

Hi maintainers and folks,

Some patches in this set have been sent before but are not merged yet, and I
have added two new patches to fix issues I found while testing. Since the
patches are interdependent, I have collected them into one patch set and am
resending them together.

[PATCH 1/4] bcache: finish incremental GC
[PATCH 2/4] bcache: calculate the number of incremental GC nodes according to
			the total of btree nodes
[PATCH 3/4] bcache: notify allocator to prepare for GC
[PATCH 4/4] bcache: fix I/O significant decline while backend devices registering

These patches prevent I/O fluctuations, or even drops to zero, while GC is
running or while cached devices are registering. I have tested them for some
time; I hope somebody could review these patches, and any comments are welcome.

Patch v2: modified the patches according to Coly's suggestions.


* [PATCH v2 1/4] bcache: finish incremental GC
From: tang.junhui @ 2018-04-16  6:33 UTC (permalink / raw)
  To: kent.overstreet, colyli, mlyle; +Cc: linux-bcache, linux-block, tang.junhui

From: Tang Junhui <tang.junhui@zte.com.cn>

In the GC thread, we record the latest GC key in gc_done, which is expected
to be used for incremental GC, but the current code does not implement it.
When GC runs, front side I/O is blocked until GC finishes, which can take a
long time if there are a lot of btree nodes.

This patch implements incremental GC. The main idea is: when there are
front side I/Os, after GC'ing some number of nodes (100), we stop GC,
release the btree node lock, and let the front side I/Os proceed for a
while (100 ms), then go back to GC again.

With this patch, I/Os are no longer blocked for the whole duration of GC,
and the obvious drops of I/O to zero are gone.
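
To make the mechanism easier to follow before reading the diff, below is a
minimal user-space sketch of the same throttling pattern (plain C11 with
pthreads; all names and numbers here are illustrative, this is not the bcache
code itself): a background scanner checks an atomic in-flight counter every
MIN_GC_NODES units of work and sleeps GC_SLEEP_MS when front side requests
are pending, just as the patch does in btree_gc_recurse()/bch_btree_gc() and
in search_alloc()/search_free().

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

#define MIN_GC_NODES	100
#define GC_SLEEP_MS	100
#define TOTAL_NODES	1000

static atomic_int search_inflight;	/* front side requests in flight */

/* Background scan: yield to front side I/O every MIN_GC_NODES nodes. */
static void *gc_thread(void *arg)
{
	int nodes = 0, nodes_pre = 0;

	(void)arg;
	while (nodes < TOTAL_NODES) {
		nodes++;		/* "GC one btree node" */

		if (atomic_load(&search_inflight) &&
		    nodes >= nodes_pre + MIN_GC_NODES) {
			nodes_pre = nodes;
			/* the real code also releases the btree lock here */
			usleep(GC_SLEEP_MS * 1000);
		}
	}
	return NULL;
}

int main(void)
{
	pthread_t gc;

	pthread_create(&gc, NULL, gc_thread, NULL);

	/* Front side: bump the counter around each request. */
	for (int i = 0; i < 5; i++) {
		atomic_fetch_add(&search_inflight, 1);
		usleep(2000);		/* "serve one request" */
		atomic_fetch_sub(&search_inflight, 1);
	}

	pthread_join(gc, NULL);
	printf("GC finished without monopolizing the btree\n");
	return 0;
}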

Patch v2: Renamed some variables and macros as Coly suggested.

Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
---
 drivers/md/bcache/bcache.h  |  5 +++++
 drivers/md/bcache/btree.c   | 15 ++++++++++++++-
 drivers/md/bcache/request.c |  3 +++
 3 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 843877e..5d52be8 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -445,6 +445,7 @@ struct cache {
 
 struct gc_stat {
 	size_t			nodes;
+	size_t			nodes_pre;
 	size_t			key_bytes;
 
 	size_t			nkeys;
@@ -568,6 +569,10 @@ struct cache_set {
 	 */
 	atomic_t		rescale;
 	/*
+	 * used for GC, to identify whether any front side I/O is in flight
+	 */
+	atomic_t		search_inflight;
+	/*
 	 * When we invalidate buckets, we use both the priority and the amount
 	 * of good data to determine which buckets to reuse first - to weight
 	 * those together consistently we keep track of the smallest nonzero
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 81e8dc3..6edb00e 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -90,6 +90,9 @@
 
 #define MAX_NEED_GC		64
 #define MAX_SAVE_PRIO		72
+#define MIN_GC_NODES		100
+#define GC_SLEEP_MS		100
+
 
 #define PTR_DIRTY_BIT		(((uint64_t) 1 << 36))
 
@@ -1581,6 +1584,13 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op,
 		memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1));
 		r->b = NULL;
 
+		if (atomic_read(&b->c->search_inflight) &&
+		    gc->nodes >= gc->nodes_pre + MIN_GC_NODES) {
+			gc->nodes_pre =  gc->nodes;
+			ret = -EAGAIN;
+			break;
+		}
+
 		if (need_resched()) {
 			ret = -EAGAIN;
 			break;
@@ -1748,7 +1758,10 @@ static void bch_btree_gc(struct cache_set *c)
 		closure_sync(&writes);
 		cond_resched();
 
-		if (ret && ret != -EAGAIN)
+		if (ret == -EAGAIN)
+			schedule_timeout_interruptible(msecs_to_jiffies
+						       (GC_SLEEP_MS));
+		else if (ret)
 			pr_warn("gc failed!");
 	} while (ret);
 
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 643c3021..a12afbc 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -637,7 +637,9 @@ static void do_bio_hook(struct search *s, struct bio *orig_bio)
 static void search_free(struct closure *cl)
 {
 	struct search *s = container_of(cl, struct search, cl);
+
 	bio_complete(s);
+	atomic_dec(&s->d->c->search_inflight);
 
 	if (s->iop.bio)
 		bio_put(s->iop.bio);
@@ -655,6 +657,7 @@ static inline struct search *search_alloc(struct bio *bio,
 
 	closure_init(&s->cl, NULL);
 	do_bio_hook(s, bio);
+	atomic_inc(&d->c->search_inflight);
 
 	s->orig_bio		= bio;
 	s->cache_miss		= NULL;
-- 
1.8.3.1


* [PATCH v2 3/4] bcache: notify allocator to prepare for GC
From: tang.junhui @ 2018-04-16  6:33 UTC (permalink / raw)
  To: kent.overstreet, colyli, mlyle; +Cc: linux-bcache, linux-block, tang.junhui

From: Tang Junhui <tang.junhui@zte.com.cn>

Since no new buckets can be allocated during GC, front side I/Os could run
out of buckets. So notify the allocator to fill the free_inc queue with
buckets before GC starts; then we have enough buckets for front side I/Os
during the GC period.

The main idea of this patch is:

              GC thread                 |        allocator thread
==> triggered by sectors_to_gc          |
    set ca->prepare_gc to GC_PREPARING  |
    to notify the allocator thread to   |
    prepare for GC                      |==> detect that ca->prepare_gc is
                                        |    GC_PREPARING,
                                        |    do invalidate_buckets()
==> wait for the allocator thread       |    and fill the free_inc queue
    to finish preparing                 |    with reclaimable buckets;
                                        |    after that, set ca->prepare_gc
                                        |    to GC_PREPARED to notify the
                                        |    GC thread it is prepared
==> detect that ca->prepare_gc is       |
    GC_PREPARED, set ca->prepare_gc     |
    back to GC_PREPARE_NONE, and        |
    continue GC                         |
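
As a rough illustration of this handshake, here is a small user-space sketch
(made-up names, a single cache, plain C11 with pthreads; the real patch
drives the same three states from gc_should_run() and bch_allocator_thread()
under bucket_lock):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

enum { GC_PREPARE_NONE, GC_PREPARING, GC_PREPARED };

static atomic_int prepare_gc = GC_PREPARE_NONE;

/* Allocator side: when asked, fill free_inc before GC starts. */
static void *allocator_thread(void *arg)
{
	(void)arg;
	for (;;) {
		if (atomic_load(&prepare_gc) == GC_PREPARING) {
			/* "invalidate_buckets(): fill free_inc with buckets" */
			usleep(10000);
			atomic_store(&prepare_gc, GC_PREPARED);
			return NULL;	/* keeps the demo finite */
		}
		usleep(1000);
	}
}

/* GC side: request preparation, wait, then run. */
static int gc_should_run(void)
{
	switch (atomic_load(&prepare_gc)) {
	case GC_PREPARE_NONE:
		atomic_store(&prepare_gc, GC_PREPARING);
		return 0;		/* ask allocator to prepare */
	case GC_PREPARED:
		atomic_store(&prepare_gc, GC_PREPARE_NONE);
		return 1;		/* buckets ready, GC may start */
	default:
		return 0;		/* still preparing, keep waiting */
	}
}

int main(void)
{
	pthread_t alloc;

	pthread_create(&alloc, NULL, allocator_thread, NULL);
	while (!gc_should_run())
		usleep(1000);
	printf("allocator prepared, starting GC\n");
	pthread_join(alloc, NULL);
	return 0;
}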

Patch v2: Refactored the code.

Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
---
 drivers/md/bcache/alloc.c  | 11 ++++++++-
 drivers/md/bcache/bcache.h |  2 ++
 drivers/md/bcache/btree.c  | 59 +++++++++++++++++++++++++++++++++++++++++++---
 drivers/md/bcache/btree.h  |  4 ++++
 4 files changed, 72 insertions(+), 4 deletions(-)

diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index a0cc1bc..85020cc 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -323,7 +323,8 @@ static int bch_allocator_thread(void *arg)
 		 * possibly issue discards to them, then we add the bucket to
 		 * the free list:
 		 */
-		while (!fifo_empty(&ca->free_inc)) {
+		while (!fifo_empty(&ca->free_inc) &&
+		       ca->prepare_gc != GC_PREPARING) {
 			long bucket;
 
 			fifo_pop(&ca->free_inc, bucket);
@@ -353,6 +354,14 @@ static int bch_allocator_thread(void *arg)
 		invalidate_buckets(ca);
 
 		/*
+		 * Let GC continue
+		 */
+		if (ca->prepare_gc == GC_PREPARING) {
+			ca->prepare_gc = GC_PREPARED;
+			wake_up_gc(ca->set);
+		}
+
+		/*
 		 * Now, we write their new gens to disk so we can start writing
 		 * new stuff to them:
 		 */
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 5d52be8..e6d5391 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -428,6 +428,8 @@ struct cache {
 	 * cpu
 	 */
 	unsigned		invalidate_needs_gc;
+	/* used to notify allocator to prepare for GC */
+	unsigned int		prepare_gc;
 
 	bool			discard; /* Get rid of? */
 
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 2ad0731e..0478821 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1805,19 +1805,72 @@ static void bch_btree_gc(struct cache_set *c)
 	bch_moving_gc(c);
 }
 
+unsigned int get_cache_gc_prepare_status(struct cache_set *c)
+{
+	struct cache *ca;
+	unsigned int i;
+	unsigned int status = GC_PREPARE_NONE;
+
+	for_each_cache(ca, c, i) {
+		if (ca->prepare_gc == GC_PREPARING)
+			return GC_PREPARING;
+
+		status = ca->prepare_gc;
+	}
+
+	return status;
+}
+
+static void set_cache_gc_prepare_status(struct cache_set *c,
+					unsigned int status)
+{
+	struct cache *ca;
+	unsigned int i;
+
+	for_each_cache(ca, c, i)
+		ca->prepare_gc = status;
+}
+
 static bool gc_should_run(struct cache_set *c)
 {
 	struct cache *ca;
 	unsigned i;
+	bool ret = false;
 
 	for_each_cache(ca, c, i)
 		if (ca->invalidate_needs_gc)
 			return true;
 
-	if (atomic_read(&c->sectors_to_gc) < 0)
-		return true;
+	if (atomic_read(&c->sectors_to_gc) < 0) {
+		unsigned int status;
 
-	return false;
+		mutex_lock(&c->bucket_lock);
+		status = get_cache_gc_prepare_status(c);
+		switch (status) {
+		case GC_PREPARE_NONE:
+			/*
+			 * notify allocator thread to prepare for GC
+			 */
+			set_cache_gc_prepare_status(c, GC_PREPARING);
+			break;
+		case GC_PREPARED:
+			/*
+			 * allocator thread finished preparing,
+			 * continue with GC
+			 */
+			set_cache_gc_prepare_status(c, GC_PREPARE_NONE);
+			ret = true;
+			break;
+		default:
+			/*
+			 * still waiting for allocator to finish preparing
+			 */
+			break;
+		}
+		mutex_unlock(&c->bucket_lock);
+	}
+
+	return ret;
 }
 
 static int bch_gc_thread(void *arg)
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index d211e2c..e60bd7a 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -102,6 +102,10 @@
 #include "bset.h"
 #include "debug.h"
 
+#define GC_PREPARE_NONE		0
+#define GC_PREPARING		1
+#define GC_PREPARED		2
+
 struct btree_write {
 	atomic_t		*journal;
 
-- 
1.8.3.1


* [PATCH v2 4/4] bcache: fix I/O significant decline while backend devices registering
From: tang.junhui @ 2018-04-16  6:33 UTC (permalink / raw)
  To: kent.overstreet, colyli, mlyle; +Cc: linux-bcache, linux-block, tang.junhui

From: Tang Junhui <tang.junhui@zte.com.cn>

I attached several backing devices to the same cache set and produced lots
of dirty data by running small random writes for a long time. Then I kept
I/O running on the other cached devices and stopped one cached device; after
a while I registered the stopped device again, and I saw the running I/O on
the other cached devices drop significantly, sometimes even to zero.

In the current code, bcache traverses all the keys and btree nodes to count
the dirty data while holding the read lock, so writer threads cannot take
the btree write lock. When the registering device owns a lot of keys and
btree nodes, this can last several seconds, and the write I/Os on the other
cached devices are blocked and decline significantly.

With this patch, when a device registers to a cache set in which other
cached devices have I/O running, we count the device's dirty data
incrementally, so the other cached devices are not blocked the whole time.
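
Reduced to a user-space sketch (made-up names; the actual patch remembers
its position with bkey_copy_key(&op->start, k) and also checks
search_inflight before pausing, which is left out here), the pattern looks
like this:

#include <errno.h>
#include <stdio.h>
#include <unistd.h>

#define INIT_KEYS_EACH_TIME	500000
#define INIT_KEYS_SLEEP_MS	100
#define TOTAL_KEYS		1200000

/*
 * Walk keys starting at *start; after every INIT_KEYS_EACH_TIME keys,
 * remember where we stopped and return -EAGAIN so the caller can yield.
 */
static int map_keys_from(long *start, long *count)
{
	for (long k = *start; k < TOTAL_KEYS; k++) {
		(*count)++;		/* "count dirty sectors of one key" */
		if (!(*count % INIT_KEYS_EACH_TIME)) {
			*start = k + 1;	/* resume point, like op.start */
			return -EAGAIN;
		}
	}
	return 0;
}

int main(void)
{
	long start = 0, count = 0;
	int ret;

	do {
		ret = map_keys_from(&start, &count);
		if (ret == -EAGAIN)
			usleep(INIT_KEYS_SLEEP_MS * 1000); /* let other I/O run */
	} while (ret == -EAGAIN);

	printf("dirty data init walked %ld keys in chunks\n", count);
	return 0;
}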

Patch v2: Renamed some variables and macros as Coly suggested.

Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
Reviewed-by: Coly Li <colyli@suse.de>
---
 drivers/md/bcache/writeback.c | 26 +++++++++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 56a3788..71ba861 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -488,10 +488,14 @@ static int bch_writeback_thread(void *arg)
 }
 
 /* Init */
+#define INIT_KEYS_EACH_TIME		500000
+#define INIT_KEYS_SLEEP_MS			100
 
 struct sectors_dirty_init {
 	struct btree_op	op;
 	unsigned	inode;
+	size_t		count;
+	struct bkey	start;
 };
 
 static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
@@ -506,18 +510,38 @@ static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
 		bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
 					     KEY_START(k), KEY_SIZE(k));
 
+	op->count++;
+	if (atomic_read(&b->c->search_inflight) &&
+	   !(op->count % INIT_KEYS_EACH_TIME)) {
+		bkey_copy_key(&op->start, k);
+		return -EAGAIN;
+	}
+
 	return MAP_CONTINUE;
 }
 
 void bch_sectors_dirty_init(struct bcache_device *d)
 {
 	struct sectors_dirty_init op;
+	int ret;
 
 	bch_btree_op_init(&op.op, -1);
 	op.inode = d->id;
+	op.count = 0;
+	op.start = KEY(op.inode, 0, 0);
 
-	bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0),
+	do {
+		ret = bch_btree_map_keys(&op.op, d->c, &op.start,
 			   sectors_dirty_init_fn, 0);
+		if (ret == -EAGAIN)
+			schedule_timeout_interruptible(
+				msecs_to_jiffies(INIT_KEYS_SLEEP_MS));
+		else if (ret < 0) {
+			pr_warn("sectors dirty init failed, ret=%d!", ret);
+			break;
+		}
+	} while (ret == -EAGAIN);
+
 }
 
 void bch_cached_dev_writeback_init(struct cached_dev *dc)
-- 
1.8.3.1


* Re: [PATCH v2 1/4] bcache: finish incremental GC
From: Coly Li @ 2018-04-16  7:27 UTC (permalink / raw)
  To: tang.junhui, kent.overstreet, mlyle; +Cc: linux-bcache, linux-block

On 2018/4/16 2:33 PM, tang.junhui@zte.com.cn wrote:
> From: Tang Junhui <tang.junhui@zte.com.cn>
> 
> In the GC thread, we record the latest GC key in gc_done, which is expected
> to be used for incremental GC, but the current code does not implement it.
> When GC runs, front side I/O is blocked until GC finishes, which can take a
> long time if there are a lot of btree nodes.
> 
> This patch implements incremental GC. The main idea is: when there are
> front side I/Os, after GC'ing some number of nodes (100), we stop GC,
> release the btree node lock, and let the front side I/Os proceed for a
> while (100 ms), then go back to GC again.
> 
> With this patch, I/Os are no longer blocked for the whole duration of GC,
> and the obvious drops of I/O to zero are gone.
> 
> Patch v2: Renamed some variables and macros as Coly suggested.
> 
> Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>

Reviewed-by: Coly Li <colyli@suse.de>

Thanks.

Coly Li

