All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/6] fix for a race in btree_flush_write()
@ 2019-06-05  7:27 Coly Li
  2019-06-05  7:27 ` [PATCH 1/6] bcache: Revert "bcache: fix high CPU occupancy during journal" Coly Li
                   ` (5 more replies)
  0 siblings, 6 replies; 7+ messages in thread
From: Coly Li @ 2019-06-05  7:27 UTC (permalink / raw)
  To: linux-bcache; +Cc: linux-block, Coly Li

There is a race happens in btree_flush_write() which may cause
system hang or panic. This patch set is an effor to fix such race,
and it is also necessary for the jouranl-no-space deadlock fixes. 

Any review or comments is welcome.

Thanks in advance.

Coly Li
---

Coly Li (6):
  bcache: Revert "bcache: fix high CPU occupancy during journal"
  bcache: Revert "bcache: free heap cache_set->flush_btree in
    bch_journal_free"
  bcache: set largest seq to ja->seq[bucket_index] in
    journal_read_bucket()
  bcache: remove retry_flush_write from struct cache_set
  bcache: fix race in btree_flush_write()
  bcache: add reclaimed_journal_buckets to struct cache_set

 drivers/md/bcache/bcache.h  |   4 +-
 drivers/md/bcache/btree.c   |  15 +++++-
 drivers/md/bcache/btree.h   |   2 +
 drivers/md/bcache/journal.c | 111 ++++++++++++++++++++++++++++----------------
 drivers/md/bcache/journal.h |   4 ++
 drivers/md/bcache/sysfs.c   |  10 ++--
 drivers/md/bcache/util.h    |   2 -
 7 files changed, 97 insertions(+), 51 deletions(-)

-- 
2.16.4


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH 1/6] bcache: Revert "bcache: fix high CPU occupancy during journal"
  2019-06-05  7:27 [PATCH 0/6] fix for a race in btree_flush_write() Coly Li
@ 2019-06-05  7:27 ` Coly Li
  2019-06-05  7:27 ` [PATCH 2/6] bcache: Revert "bcache: free heap cache_set->flush_btree in bch_journal_free" Coly Li
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Coly Li @ 2019-06-05  7:27 UTC (permalink / raw)
  To: linux-bcache; +Cc: linux-block, Coly Li, stable, Tang Junhui

This reverts commit c4dc2497d50d9c6fb16aa0d07b6a14f3b2adb1e0.

This patch enlarges a race between normal btree flush code path and
flush_btree_write(), which causes deadlock when journal space is
exhausted. Reverts this patch makes the race window from 128 btree
nodes to only 1 btree nodes.

Fixes: c4dc2497d50d ("bcache: fix high CPU occupancy during journal")
Signed-off-by: Coly Li <colyli@suse.de>
Cc: stable@vger.kernel.org
Cc: Tang Junhui <tang.junhui.linux@gmail.com>
---
 drivers/md/bcache/bcache.h  |  2 --
 drivers/md/bcache/journal.c | 47 +++++++++++++++------------------------------
 drivers/md/bcache/util.h    |  2 --
 3 files changed, 15 insertions(+), 36 deletions(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 73a97586a2ef..cb268d7c6cea 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -726,8 +726,6 @@ struct cache_set {
 
 #define BUCKET_HASH_BITS	12
 	struct hlist_head	bucket_hash[1 << BUCKET_HASH_BITS];
-
-	DECLARE_HEAP(struct btree *, flush_btree);
 };
 
 struct bbio {
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index d3f2331fc559..ce176bbef3fe 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -415,12 +415,6 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
 }
 
 /* Journalling */
-#define journal_max_cmp(l, r) \
-	(fifo_idx(&c->journal.pin, btree_current_write(l)->journal) < \
-	 fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal))
-#define journal_min_cmp(l, r) \
-	(fifo_idx(&c->journal.pin, btree_current_write(l)->journal) > \
-	 fifo_idx(&(c)->journal.pin, btree_current_write(r)->journal))
 
 static void btree_flush_write(struct cache_set *c)
 {
@@ -428,35 +422,25 @@ static void btree_flush_write(struct cache_set *c)
 	 * Try to find the btree node with that references the oldest journal
 	 * entry, best is our current candidate and is locked if non NULL:
 	 */
-	struct btree *b;
-	int i;
+	struct btree *b, *best;
+	unsigned int i;
 
 	atomic_long_inc(&c->flush_write);
-
 retry:
-	spin_lock(&c->journal.lock);
-	if (heap_empty(&c->flush_btree)) {
-		for_each_cached_btree(b, c, i)
-			if (btree_current_write(b)->journal) {
-				if (!heap_full(&c->flush_btree))
-					heap_add(&c->flush_btree, b,
-						 journal_max_cmp);
-				else if (journal_max_cmp(b,
-					 heap_peek(&c->flush_btree))) {
-					c->flush_btree.data[0] = b;
-					heap_sift(&c->flush_btree, 0,
-						  journal_max_cmp);
-				}
+	best = NULL;
+
+	for_each_cached_btree(b, c, i)
+		if (btree_current_write(b)->journal) {
+			if (!best)
+				best = b;
+			else if (journal_pin_cmp(c,
+					btree_current_write(best)->journal,
+					btree_current_write(b)->journal)) {
+				best = b;
 			}
+		}
 
-		for (i = c->flush_btree.used / 2 - 1; i >= 0; --i)
-			heap_sift(&c->flush_btree, i, journal_min_cmp);
-	}
-
-	b = NULL;
-	heap_pop(&c->flush_btree, b, journal_min_cmp);
-	spin_unlock(&c->journal.lock);
-
+	b = best;
 	if (b) {
 		mutex_lock(&b->write_lock);
 		if (!btree_current_write(b)->journal) {
@@ -898,8 +882,7 @@ int bch_journal_alloc(struct cache_set *c)
 	j->w[0].c = c;
 	j->w[1].c = c;
 
-	if (!(init_heap(&c->flush_btree, 128, GFP_KERNEL)) ||
-	    !(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
+	if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
 	    !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) ||
 	    !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)))
 		return -ENOMEM;
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 1fbced94e4cc..c029f7443190 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -113,8 +113,6 @@ do {									\
 
 #define heap_full(h)	((h)->used == (h)->size)
 
-#define heap_empty(h)	((h)->used == 0)
-
 #define DECLARE_FIFO(type, name)					\
 	struct {							\
 		size_t front, back, size, mask;				\
-- 
2.16.4


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 2/6] bcache: Revert "bcache: free heap cache_set->flush_btree in bch_journal_free"
  2019-06-05  7:27 [PATCH 0/6] fix for a race in btree_flush_write() Coly Li
  2019-06-05  7:27 ` [PATCH 1/6] bcache: Revert "bcache: fix high CPU occupancy during journal" Coly Li
@ 2019-06-05  7:27 ` Coly Li
  2019-06-05  7:27 ` [PATCH 3/6] bcache: set largest seq to ja->seq[bucket_index] in journal_read_bucket() Coly Li
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Coly Li @ 2019-06-05  7:27 UTC (permalink / raw)
  To: linux-bcache; +Cc: linux-block, Coly Li, stable, Shenghui Wang

This reverts commit 6268dc2c4703aabfb0b35681be709acf4c2826c6.

This patch depends on commit c4dc2497d50d ("bcache: fix high CPU
occupancy during journal") which is reverted in previous patch. So
revert this one too.

Fixes: 6268dc2c4703 ("bcache: free heap cache_set->flush_btree in bch_journal_free")
Signed-off-by: Coly Li <colyli@suse.de>
Cc: stable@vger.kernel.org
Cc: Shenghui Wang <shhuiw@foxmail.com>
---
 drivers/md/bcache/journal.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index ce176bbef3fe..76d3770ce484 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -867,7 +867,6 @@ void bch_journal_free(struct cache_set *c)
 	free_pages((unsigned long) c->journal.w[1].data, JSET_BITS);
 	free_pages((unsigned long) c->journal.w[0].data, JSET_BITS);
 	free_fifo(&c->journal.pin);
-	free_heap(&c->flush_btree);
 }
 
 int bch_journal_alloc(struct cache_set *c)
-- 
2.16.4


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 3/6] bcache: set largest seq to ja->seq[bucket_index] in journal_read_bucket()
  2019-06-05  7:27 [PATCH 0/6] fix for a race in btree_flush_write() Coly Li
  2019-06-05  7:27 ` [PATCH 1/6] bcache: Revert "bcache: fix high CPU occupancy during journal" Coly Li
  2019-06-05  7:27 ` [PATCH 2/6] bcache: Revert "bcache: free heap cache_set->flush_btree in bch_journal_free" Coly Li
@ 2019-06-05  7:27 ` Coly Li
  2019-06-05  7:27 ` [PATCH 4/6] bcache: remove retry_flush_write from struct cache_set Coly Li
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Coly Li @ 2019-06-05  7:27 UTC (permalink / raw)
  To: linux-bcache; +Cc: linux-block, Coly Li

In journal_read_bucket() when setting ja->seq[bucket_index], there might
be potential case that a later non-maximum overwrites a better sequence
number to ja->seq[bucket_index]. This patch adds a check to make sure
that ja->seq[bucket_index] will be only set a new value if it is bigger
then current value.

Signed-off-by: Coly Li <colyli@suse.de>
---
 drivers/md/bcache/journal.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 76d3770ce484..69586ded980c 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -156,7 +156,8 @@ reread:		left = ca->sb.bucket_size - offset;
 			list_add(&i->list, where);
 			ret = 1;
 
-			ja->seq[bucket_index] = j->seq;
+			if (j->seq > ja->seq[bucket_index])
+				ja->seq[bucket_index] = j->seq;
 next_set:
 			offset	+= blocks * ca->sb.block_size;
 			len	-= blocks * ca->sb.block_size;
-- 
2.16.4


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 4/6] bcache: remove retry_flush_write from struct cache_set
  2019-06-05  7:27 [PATCH 0/6] fix for a race in btree_flush_write() Coly Li
                   ` (2 preceding siblings ...)
  2019-06-05  7:27 ` [PATCH 3/6] bcache: set largest seq to ja->seq[bucket_index] in journal_read_bucket() Coly Li
@ 2019-06-05  7:27 ` Coly Li
  2019-06-05  7:27 ` [PATCH 5/6] bcache: fix race in btree_flush_write() Coly Li
  2019-06-05  7:27 ` [PATCH 6/6] bcache: add reclaimed_journal_buckets to struct cache_set Coly Li
  5 siblings, 0 replies; 7+ messages in thread
From: Coly Li @ 2019-06-05  7:27 UTC (permalink / raw)
  To: linux-bcache; +Cc: linux-block, Coly Li

In struct cache_set, retry_flush_write is added for commit c4dc2497d50d
("bcache: fix high CPU occupancy during journal") which is reverted in
previous patch.

Now it is useless anymore, and this patch removes it from bcache code.

Signed-off-by: Coly Li <colyli@suse.de>
---
 drivers/md/bcache/bcache.h | 1 -
 drivers/md/bcache/sysfs.c  | 5 -----
 2 files changed, 6 deletions(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index cb268d7c6cea..35396248a7d5 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -706,7 +706,6 @@ struct cache_set {
 
 	atomic_long_t		reclaim;
 	atomic_long_t		flush_write;
-	atomic_long_t		retry_flush_write;
 
 	enum			{
 		ON_ERROR_UNREGISTER,
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index eb678e43ac00..80ead1a205b9 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -81,7 +81,6 @@ read_attribute(state);
 read_attribute(cache_read_races);
 read_attribute(reclaim);
 read_attribute(flush_write);
-read_attribute(retry_flush_write);
 read_attribute(writeback_keys_done);
 read_attribute(writeback_keys_failed);
 read_attribute(io_errors);
@@ -695,9 +694,6 @@ SHOW(__bch_cache_set)
 	sysfs_print(flush_write,
 		    atomic_long_read(&c->flush_write));
 
-	sysfs_print(retry_flush_write,
-		    atomic_long_read(&c->retry_flush_write));
-
 	sysfs_print(writeback_keys_done,
 		    atomic_long_read(&c->writeback_keys_done));
 	sysfs_print(writeback_keys_failed,
@@ -914,7 +910,6 @@ static struct attribute *bch_cache_set_internal_files[] = {
 	&sysfs_cache_read_races,
 	&sysfs_reclaim,
 	&sysfs_flush_write,
-	&sysfs_retry_flush_write,
 	&sysfs_writeback_keys_done,
 	&sysfs_writeback_keys_failed,
 
-- 
2.16.4


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 5/6] bcache: fix race in btree_flush_write()
  2019-06-05  7:27 [PATCH 0/6] fix for a race in btree_flush_write() Coly Li
                   ` (3 preceding siblings ...)
  2019-06-05  7:27 ` [PATCH 4/6] bcache: remove retry_flush_write from struct cache_set Coly Li
@ 2019-06-05  7:27 ` Coly Li
  2019-06-05  7:27 ` [PATCH 6/6] bcache: add reclaimed_journal_buckets to struct cache_set Coly Li
  5 siblings, 0 replies; 7+ messages in thread
From: Coly Li @ 2019-06-05  7:27 UTC (permalink / raw)
  To: linux-bcache; +Cc: linux-block, Coly Li

There is a race between mca_reap(), btree_node_free() and journal code
btree_flush_write(), which results very rare and strange deadlock or
panic and are very hard to reproduce.

Let me explain how the race happens. In btree_flush_write() one btree
node with oldest journal pin is selected, then it is flushed to cache
device, the select-and-flush is a two steps operation. Between these two
steps, there are something may happen inside the race window,
- The selected btree node was reaped by mca_reap() and allocated to
  other requesters for other btree node.
- The slected btree node was selected, flushed and released by mca
  shrink callback bch_mca_scan().
When btree_flush_write() tries to flush the selected btree node, firstly
b->write_lock is held by mutex_lock(). If the race happens and the
memory of selected btree node is allocated to other btree node, if that
btree node's write_lock is held already, a deadlock very probably
happens here. A worse case is the memory of the selected btree node is
released, then all references to this btree node (e.g. b->write_lock)
will trigger NULL pointer deference panic.

This race was introduced in commit cafe56359144 ("bcache: A block layer
cache"), and enlarged by commit c4dc2497d50d ("bcache: fix high CPU
occupancy during journal"), which selected 128 btree nodes and flushed
them one-by-one in a quite long time period.

Such race is not easy to reproduce before. On a Lenovo SR650 server with
48 Xeon cores, and configure 1 NVMe SSD as cache device, a MD raid0
device assembled by 3 NVMe SSDs as backing device, this race can be
observed around every 10,000 times btree_flush_write() gets called. Both
deadlock and kernel panic all happened as aftermath of the race.

The idea of the fix is to add a btree flag BTREE_NODE_journal_flush. It
is set when selecting btree nodes, and cleared after btree nodes
flushed. Then when mca_reap() selects a btree node with this bit set,
this btree node will be skipped. Since mca_reap() only reaps btree node
without BTREE_NODE_journal_flush flag, such race is avoided.

Once corner case should be noticed, that is btree_node_free(). It might
be called in some error handling code path. For example the following
code piece from btree_split(),
	2149 err_free2:
	2150         bkey_put(b->c, &n2->key);
	2151         btree_node_free(n2);
	2152         rw_unlock(true, n2);
	2153 err_free1:
	2154         bkey_put(b->c, &n1->key);
	2155         btree_node_free(n1);
	2156         rw_unlock(true, n1);
At line 2151 and 2155, the btree node n2 and n1 are released without
mac_reap(), so BTREE_NODE_journal_flush also needs to be checked here.
If btree_node_free() is called directly in such error handling path,
and the selected btree node has BTREE_NODE_journal_flush bit set, just
wait for 1 jiffy and retry again. In this case this btree node won't
be skipped, just retry until the BTREE_NODE_journal_flush bit cleared,
and free the btree node memory.

Wait for 1 jiffy inside btree_node_free() does not hurt too much
performance here, the reasons are,
- Error handling code path is not frequently executed, and the race
  inside this cold path should be very rare. If the very rare race
  happens in the cold code path, waiting 1 jiffy should be acceptible.
- If bree_node_free() is called inside mca_reap(), it means the bit
  BTREE_NODE_journal_flush is checked already, no wait will happen here.

Beside the above fix, the way to select flushing btree nodes is also
changed in this patch. Let me explain what changes in this patch.

- Use another spinlock journal.flush_write_lock to replace the very
  hot journal.lock. We don't have to use journal.lock here, selecting
  candidate btree nodes takes a lot of time, hold journal.lock here will
  block other jouranling threads and drop the overall I/O performance.
- Only select flushing btree node from c->btree_cache list. When the
  machine has a large system memory, mca cache may have a huge number of
  cached btree nodes. Iterating all the cached nodes will take a lot
  of CPU time, and most of the nodes on c->btree_cache_freeable and
  c->btree_cache_freed lists are cleared and have need to flush. So only
  travel mca list c->btree_cache to select flushing btree node should be
  enough for most of the cases.
- Don't iterate whole c->btree_cache list, only reversely select first
  BTREE_FLUSH_NR (32) btree nodes to flush. Iterate all btree nodes from
  c->btree_cache and select the oldest journal pin btree nodes consumes
  huge number of CPU cycles if the list is huge (push and pop a node
  into/out of a heap is expensive). The last several dirty btree nodes
  on the tail of c->btree_cache list are earlest allocated and cached
  btree nodes, they are relative to the oldest journal pin btree nodes.
  Therefore only flushing BTREE_FLUSH_NR btree nodes from tail of
  c->btree_cache probably includes the oldest journal pin btree nodes.

In my testing, the above change decreases 50%+ CPU consumption when
journal space is full. Some times IOPS drops to 0 for 5-8 seconds,
comparing blocking I/O for 120+ seconds in previous code, this is much
better. Maybe there is room to improve in future, but at this momment
the fix looks fine and performs well in my testing.

Signed-off-by: Coly Li <colyli@suse.de>
---
 drivers/md/bcache/btree.c   | 15 +++++++-
 drivers/md/bcache/btree.h   |  2 +
 drivers/md/bcache/journal.c | 93 ++++++++++++++++++++++++++++++++++-----------
 drivers/md/bcache/journal.h |  4 ++
 4 files changed, 90 insertions(+), 24 deletions(-)

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index cf38a1b031fa..c0dd8fde37af 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -660,6 +660,13 @@ static int mca_reap(struct btree *b, unsigned int min_order, bool flush)
 	}
 
 	mutex_lock(&b->write_lock);
+	/* don't reap btree node handling in btree_flush_write() */
+	if (btree_node_journal_flush(b)) {
+		pr_debug("bnode %p is flushing by journal, ignore", b);
+		mutex_unlock(&b->write_lock);
+		goto out_unlock;
+	}
+
 	if (btree_node_dirty(b))
 		__bch_btree_node_write(b, &cl);
 	mutex_unlock(&b->write_lock);
@@ -1071,8 +1078,14 @@ static void btree_node_free(struct btree *b)
 
 	BUG_ON(b == b->c->root);
 
+retry:
 	mutex_lock(&b->write_lock);
-
+	if (btree_node_journal_flush(b)) {
+		mutex_unlock(&b->write_lock);
+		pr_debug("bnode %p journal_flush set, retry", b);
+		schedule_timeout_interruptible(1);
+		goto retry;
+	}
 	if (btree_node_dirty(b))
 		btree_complete_write(b, btree_current_write(b));
 	clear_bit(BTREE_NODE_dirty, &b->flags);
diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h
index d1c72ef64edf..76cfd121a486 100644
--- a/drivers/md/bcache/btree.h
+++ b/drivers/md/bcache/btree.h
@@ -158,11 +158,13 @@ enum btree_flags {
 	BTREE_NODE_io_error,
 	BTREE_NODE_dirty,
 	BTREE_NODE_write_idx,
+	BTREE_NODE_journal_flush,
 };
 
 BTREE_FLAG(io_error);
 BTREE_FLAG(dirty);
 BTREE_FLAG(write_idx);
+BTREE_FLAG(journal_flush);
 
 static inline struct btree_write *btree_current_write(struct btree *b)
 {
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 69586ded980c..4a7cc1bf8e3a 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -419,41 +419,87 @@ int bch_journal_replay(struct cache_set *s, struct list_head *list)
 
 static void btree_flush_write(struct cache_set *c)
 {
-	/*
-	 * Try to find the btree node with that references the oldest journal
-	 * entry, best is our current candidate and is locked if non NULL:
-	 */
-	struct btree *b, *best;
-	unsigned int i;
+	struct btree *b, *btree_nodes[BTREE_FLUSH_NR];
+	unsigned int i, n;
+
+	if (c->journal.btree_flushing)
+		return;
+
+	spin_lock(&c->journal.flush_write_lock);
+	if (c->journal.btree_flushing) {
+		spin_unlock(&c->journal.flush_write_lock);
+		return;
+	}
+	c->journal.btree_flushing = true;
+	spin_unlock(&c->journal.flush_write_lock);
 
 	atomic_long_inc(&c->flush_write);
-retry:
-	best = NULL;
-
-	for_each_cached_btree(b, c, i)
-		if (btree_current_write(b)->journal) {
-			if (!best)
-				best = b;
-			else if (journal_pin_cmp(c,
-					btree_current_write(best)->journal,
-					btree_current_write(b)->journal)) {
-				best = b;
-			}
+	memset(btree_nodes, 0, sizeof(btree_nodes));
+	n = 0;
+
+	mutex_lock(&c->bucket_lock);
+	list_for_each_entry_reverse(b, &c->btree_cache, list) {
+		if (btree_node_journal_flush(b))
+			pr_err("BUG: flush_write bit should not be set here!");
+
+		mutex_lock(&b->write_lock);
+
+		if (!btree_node_dirty(b)) {
+			mutex_unlock(&b->write_lock);
+			continue;
+		}
+
+		if (!btree_current_write(b)->journal) {
+			mutex_unlock(&b->write_lock);
+			continue;
+		}
+
+		set_btree_node_journal_flush(b);
+
+		mutex_unlock(&b->write_lock);
+
+		btree_nodes[n++] = b;
+		if (n == BTREE_FLUSH_NR)
+			break;
+	}
+	mutex_unlock(&c->bucket_lock);
+
+	for (i = 0; i < n; i++) {
+		b = btree_nodes[i];
+		if (!b) {
+			pr_err("BUG: btree_nodes[%d] is NULL", i);
+			continue;
+		}
+
+		/* safe to check without holding b->write_lock */
+		if (!btree_node_journal_flush(b)) {
+			pr_err("BUG: bnode %p: journal_flush bit cleaned", b);
+			continue;
 		}
 
-	b = best;
-	if (b) {
 		mutex_lock(&b->write_lock);
 		if (!btree_current_write(b)->journal) {
 			mutex_unlock(&b->write_lock);
-			/* We raced */
-			atomic_long_inc(&c->retry_flush_write);
-			goto retry;
+			pr_debug("bnode %p: written by others", b);
+			clear_bit(BTREE_NODE_journal_flush, &b->flags);
+			continue;
+		}
+
+		if (!btree_node_dirty(b)) {
+			pr_debug("bnode %p: dirty bit cleaned by others", b);
+			clear_bit(BTREE_NODE_journal_flush, &b->flags);
+			mutex_unlock(&b->write_lock);
+			continue;
 		}
 
 		__bch_btree_node_write(b, NULL);
+		clear_bit(BTREE_NODE_journal_flush, &b->flags);
 		mutex_unlock(&b->write_lock);
 	}
+
+	spin_lock(&c->journal.flush_write_lock);
+	c->journal.btree_flushing = false;
+	spin_unlock(&c->journal.flush_write_lock);
 }
 
 #define last_seq(j)	((j)->seq - fifo_used(&(j)->pin) + 1)
@@ -875,6 +921,7 @@ int bch_journal_alloc(struct cache_set *c)
 	struct journal *j = &c->journal;
 
 	spin_lock_init(&j->lock);
+	spin_lock_init(&j->flush_write_lock);
 	INIT_DELAYED_WORK(&j->work, journal_write_work);
 
 	c->journal_delay_ms = 100;
diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h
index 66f0facff84b..aeed791f05e7 100644
--- a/drivers/md/bcache/journal.h
+++ b/drivers/md/bcache/journal.h
@@ -103,6 +103,8 @@ struct journal_write {
 /* Embedded in struct cache_set */
 struct journal {
 	spinlock_t		lock;
+	spinlock_t		flush_write_lock;
+	bool			btree_flushing;
 	/* used when waiting because the journal was full */
 	struct closure_waitlist	wait;
 	struct closure		io;
@@ -154,6 +156,8 @@ struct journal_device {
 	struct bio_vec		bv[8];
 };
 
+#define BTREE_FLUSH_NR	32
+
 #define journal_pin_cmp(c, l, r)				\
 	(fifo_idx(&(c)->journal.pin, (l)) > fifo_idx(&(c)->journal.pin, (r)))
 
-- 
2.16.4


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH 6/6] bcache: add reclaimed_journal_buckets to struct cache_set
  2019-06-05  7:27 [PATCH 0/6] fix for a race in btree_flush_write() Coly Li
                   ` (4 preceding siblings ...)
  2019-06-05  7:27 ` [PATCH 5/6] bcache: fix race in btree_flush_write() Coly Li
@ 2019-06-05  7:27 ` Coly Li
  5 siblings, 0 replies; 7+ messages in thread
From: Coly Li @ 2019-06-05  7:27 UTC (permalink / raw)
  To: linux-bcache; +Cc: linux-block, Coly Li

Now we have counters for how many times jouranl is reclaimed, how many
times cached dirty btree nodes are flushed, but we don't know how many
jouranl buckets are really reclaimed.

This patch adds reclaimed_journal_buckets into struct cache_set, this
is an increasing only counter, to tell how many journal buckets are
reclaimed since cache set runs. From all these three counters (reclaim,
reclaimed_journal_buckets, flush_write), we can have idea how well
current journal space reclaim code works.

Signed-off-by: Coly Li <colyli@suse.de>
---
 drivers/md/bcache/bcache.h  | 1 +
 drivers/md/bcache/journal.c | 1 +
 drivers/md/bcache/sysfs.c   | 5 +++++
 3 files changed, 7 insertions(+)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 35396248a7d5..013e35a9e317 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -705,6 +705,7 @@ struct cache_set {
 	atomic_long_t		writeback_keys_failed;
 
 	atomic_long_t		reclaim;
+	atomic_long_t		reclaimed_journal_buckets;
 	atomic_long_t		flush_write;
 
 	enum			{
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index 4a7cc1bf8e3a..39580692339c 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -614,6 +614,7 @@ static void journal_reclaim(struct cache_set *c)
 		k->ptr[n++] = MAKE_PTR(0,
 				  bucket_to_sector(c, ca->sb.d[ja->cur_idx]),
 				  ca->sb.nr_this_dev);
+		atomic_long_inc(&c->reclaimed_journal_buckets);
 	}
 
 	if (n) {
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 80ead1a205b9..2ec1f2a90f4a 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -80,6 +80,7 @@ read_attribute(bset_tree_stats);
 read_attribute(state);
 read_attribute(cache_read_races);
 read_attribute(reclaim);
+read_attribute(reclaimed_journal_buckets);
 read_attribute(flush_write);
 read_attribute(writeback_keys_done);
 read_attribute(writeback_keys_failed);
@@ -691,6 +692,9 @@ SHOW(__bch_cache_set)
 	sysfs_print(reclaim,
 		    atomic_long_read(&c->reclaim));
 
+	sysfs_print(reclaimed_journal_buckets,
+		    atomic_long_read(&c->reclaimed_journal_buckets));
+
 	sysfs_print(flush_write,
 		    atomic_long_read(&c->flush_write));
 
@@ -909,6 +913,7 @@ static struct attribute *bch_cache_set_internal_files[] = {
 	&sysfs_bset_tree_stats,
 	&sysfs_cache_read_races,
 	&sysfs_reclaim,
+	&sysfs_reclaimed_journal_buckets,
 	&sysfs_flush_write,
 	&sysfs_writeback_keys_done,
 	&sysfs_writeback_keys_failed,
-- 
2.16.4


^ permalink raw reply related	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2019-06-05  7:27 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-06-05  7:27 [PATCH 0/6] fix for a race in btree_flush_write() Coly Li
2019-06-05  7:27 ` [PATCH 1/6] bcache: Revert "bcache: fix high CPU occupancy during journal" Coly Li
2019-06-05  7:27 ` [PATCH 2/6] bcache: Revert "bcache: free heap cache_set->flush_btree in bch_journal_free" Coly Li
2019-06-05  7:27 ` [PATCH 3/6] bcache: set largest seq to ja->seq[bucket_index] in journal_read_bucket() Coly Li
2019-06-05  7:27 ` [PATCH 4/6] bcache: remove retry_flush_write from struct cache_set Coly Li
2019-06-05  7:27 ` [PATCH 5/6] bcache: fix race in btree_flush_write() Coly Li
2019-06-05  7:27 ` [PATCH 6/6] bcache: add reclaimed_journal_buckets to struct cache_set Coly Li

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.