All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 07/19] bcache: introduce bcache sysfs entries for ioprio-based bypass/writeback hints
  2017-06-29 13:45   ` Christoph Hellwig
@ 2016-10-11 19:04     ` Eric Wheeler
  2016-10-11 19:04     ` Eric Wheeler
                       ` (29 subsequent siblings)
  30 siblings, 0 replies; 120+ messages in thread
From: Eric Wheeler @ 2016-10-11 19:04 UTC (permalink / raw)
  To: linux-block

Add sysfs entries to support hinting for bypass/writeback by the ioprio
assigned to the bio.  If the bio is unassigned, use current's io-context
ioprio for cache writeback or bypass (configured per-process with
`ionice`).

Having idle IOs bypass the cache can increase performance elsewhere
since you probably don't care about their performance.  In addition,
this prevents idle IOs from promoting into (polluting) your cache and
evicting blocks that are more important elsewhere.

If you really need the performance at the expense of SSD wearout,
then configure ioprio_writeback and set your `ionice` appropriately.

For example:
	echo 2,7 > /sys/block/bcache0/bcache/ioprio_bypass
	echo 2,0 > /sys/block/bcache0/bcache/ioprio_writeback

See the documentation commit for details.

Signed-off-by: Eric Wheeler <bcache@linux.ewheeler.net>
Acked-by: Kent Overstreet <kent.overstreet@gmail.com>
Tested-by: Kai Krakow <kai@kaishome.de>
Cc: nix@esperi.org.uk
---
 drivers/md/bcache/bcache.h    |  3 ++
 drivers/md/bcache/request.c   | 24 +++++++++++++++
 drivers/md/bcache/sysfs.c     | 71 +++++++++++++++++++++++++++++++++++++++++++
 drivers/md/bcache/writeback.c |  8 +++++
 drivers/md/bcache/writeback.h | 24 +++++++++++++++
 5 files changed, 130 insertions(+)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index dee542f..44123e4 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -367,6 +367,9 @@ struct cached_dev {
 	unsigned		writeback_rate_update_seconds;
 	unsigned		writeback_rate_d_term;
 	unsigned		writeback_rate_p_term_inverse;
+
+	unsigned short		ioprio_writeback;
+	unsigned short		ioprio_bypass;
 };
 
 enum alloc_reserve {
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index d27707d..a95609f 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -373,6 +373,8 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
 	unsigned sectors, congested = bch_get_congested(c);
 	struct task_struct *task = current;
 	struct io *i;
+	struct io_context *ioc;
+	unsigned short ioprio;
 
 	if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
 	    c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
@@ -384,6 +386,28 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
 	     op_is_write(bio_op(bio))))
 		goto skip;
 
+	/* If the ioprio already exists on the bio, use that.  We assume that
+	 * the upper layer properly assigned the calling process's ioprio to
+	 * the bio being passed to bcache. Otherwise, use current's ioc. */
+	ioprio = bio_prio(bio);
+	if (!ioprio_valid(ioprio)) {
+		ioc = get_task_io_context(current, GFP_NOIO, NUMA_NO_NODE);
+		if (ioc) {
+			if (ioprio_valid(ioc->ioprio))
+				ioprio = ioc->ioprio;
+			put_io_context(ioc);
+			ioc = NULL;
+		}
+	}
+
+	/* If process ioprio is lower-or-equal to dc->ioprio_bypass, then
+	 * hint for bypass. Note that a lower-priority IO class+value
+	 * has a greater numeric value. */
+	if (ioprio_valid(ioprio) && ioprio_valid(dc->ioprio_writeback)
+		&& ioprio >= dc->ioprio_bypass) {
+		return true;
+	}
+
 	if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) ||
 	    bio_sectors(bio) & (c->sb.block_size - 1)) {
 		pr_debug("skipping unaligned io");
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index f90f136..cc0076d 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -107,6 +107,9 @@ rw_attribute(btree_shrinker_disabled);
 rw_attribute(copy_gc_enabled);
 rw_attribute(size);
 
+rw_attribute(ioprio_writeback);
+rw_attribute(ioprio_bypass);
+
 SHOW(__bch_cached_dev)
 {
 	struct cached_dev *dc = container_of(kobj, struct cached_dev,
@@ -183,6 +186,17 @@ SHOW(__bch_cached_dev)
 		return strlen(buf);
 	}
 
+	if (attr == &sysfs_ioprio_bypass)
+		return snprintf(buf, PAGE_SIZE-1, "%d,%ld\n",
+			IOPRIO_PRIO_CLASS(dc->ioprio_bypass),
+			IOPRIO_PRIO_DATA(dc->ioprio_bypass));
+
+	if (attr == &sysfs_ioprio_writeback)
+		return snprintf(buf, PAGE_SIZE-1, "%d,%ld\n",
+			IOPRIO_PRIO_CLASS(dc->ioprio_writeback),
+			IOPRIO_PRIO_DATA(dc->ioprio_writeback));
+
+
 #undef var
 	return 0;
 }
@@ -195,6 +209,10 @@ STORE(__cached_dev)
 	unsigned v = size;
 	struct cache_set *c;
 	struct kobj_uevent_env *env;
+	unsigned ioprio_class = 0; /* invalid initial ioprio values */
+	unsigned ioprio_level = IOPRIO_BE_NR;
+	unsigned short *ioprio_hint = NULL;
+	char *ioprio_type = NULL;
 
 #define d_strtoul(var)		sysfs_strtoul(var, dc->var)
 #define d_strtoul_nonzero(var)	sysfs_strtoul_clamp(var, dc->var, 1, INT_MAX)
@@ -283,6 +301,57 @@ STORE(__cached_dev)
 	if (attr == &sysfs_stop)
 		bcache_device_stop(&dc->disk);
 
+	/* ioprio hinting: we use ioprio_hint to reduce duplicate printk verbiage */
+	if (attr == &sysfs_ioprio_writeback) {
+		ioprio_hint = &dc->ioprio_writeback;
+		ioprio_type = "writeback";
+	}
+
+	if (attr == &sysfs_ioprio_bypass) {
+		ioprio_hint = &dc->ioprio_bypass;
+		ioprio_type = "bypass";
+	}
+
+	if (ioprio_hint != NULL)
+	{
+		if (sscanf(buf, "%u,%u", &ioprio_class, &ioprio_level) != 2
+			|| ioprio_class > IOPRIO_CLASS_IDLE
+			|| ioprio_level >= IOPRIO_BE_NR) {
+			pr_err("ioprio_%s invalid, expecting: (class,level) but parsed (%u,%u); ignored.",
+				ioprio_type,
+				ioprio_class, ioprio_level);
+			return size;
+		}
+
+		/* Use the maximum(/minimum) value in the class shift space to make integer
+		  comparison correct for ioprio_writeback(/ioprio_bypass) for IOPRIO_CLASS_IDLE.
+		  This is necessary because there are no ioprio levels for the idle class. */
+		if (ioprio_class == IOPRIO_CLASS_IDLE) {
+			if (ioprio_hint == &dc->ioprio_writeback)
+				ioprio_level = IOPRIO_PRIO_MASK;
+			else
+				/* Same, but 0 for bypass (inverted vs. writeback) */
+				ioprio_level = 0;
+		}
+
+		*ioprio_hint = IOPRIO_PRIO_VALUE(ioprio_class, ioprio_level);
+
+		if (!ioprio_valid(*ioprio_hint))
+			pr_info("disabled ioprio_%s hints.", ioprio_type);
+		else
+			pr_info("set hint for cache %s with priority %s: (class,level) = (%u,%u)",
+				ioprio_type,
+				( ioprio_hint == &dc->ioprio_writeback ? "at-or-above" : "at-or-below" ),
+				ioprio_class, ioprio_level);
+
+		if (ioprio_valid(dc->ioprio_writeback)
+			&& ioprio_valid(dc->ioprio_bypass)
+			&& dc->ioprio_writeback >= dc->ioprio_bypass)
+			pr_warning(
+				"warning: ioprio_writeback hint is neither disabled nor higher priority than the bypass hint; "
+				"will always writeback!");
+	}
+
 	return size;
 }
 
@@ -335,6 +404,8 @@ static struct attribute *bch_cached_dev_files[] = {
 	&sysfs_verify,
 	&sysfs_bypass_torture_test,
 #endif
+	&sysfs_ioprio_bypass,
+	&sysfs_ioprio_writeback,
 	NULL
 };
 KTYPE(bch_cached_dev);
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 42c66e7..3d463f0 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -511,6 +511,14 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
 	dc->writeback_rate_d_term	= 30;
 	dc->writeback_rate_p_term_inverse = 6000;
 
+	/* These defaults provide the best SSD life by enabling bypass
+	 for priorities at-or-below BE-7. This also provides better
+	 performance (cache hits) by preventing (near-)idle processes from
+	 polluting the cache working set.  Only set ioprio_writeback if
+	 you really need it: it will wear out your SSD sooner. */
+	dc->ioprio_writeback = IOPRIO_PRIO_VALUE(0, 0);
+	dc->ioprio_bypass    = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, (IOPRIO_BE_NR-1));
+
 	INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
 }
 
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 629bd1a..cd82fe8 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -43,6 +43,8 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
 				    unsigned cache_mode, bool would_skip)
 {
 	unsigned in_use = dc->disk.c->gc_stats.in_use;
+	struct io_context *ioc;
+	unsigned short ioprio;
 
 	if (cache_mode != CACHE_MODE_WRITEBACK ||
 	    test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
@@ -57,6 +59,28 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
 	if (would_skip)
 		return false;
 
+	/* If the ioprio already exists on the bio, use that.  We assume that
+	 * the upper layer properly assigned the calling process's ioprio to
+	 * the bio being passed to bcache. Otherwise, use current's ioc. */
+	ioprio = bio_prio(bio);
+	if (!ioprio_valid(ioprio)) {
+		ioc = get_task_io_context(current, GFP_NOIO, NUMA_NO_NODE);
+		if (ioc) {
+			if (ioprio_valid(ioc->ioprio))
+				ioprio = ioc->ioprio;
+			put_io_context(ioc);
+			ioc = NULL;
+		}
+	}
+
+	/* If process ioprio is higher-or-equal to dc->ioprio_writeback, then
+	 * hint for writeback. Note that a higher-priority IO class+value
+	 * has a lesser numeric value. */
+	if (ioprio_valid(ioprio) && ioprio_valid(dc->ioprio_writeback)
+		&& ioprio <= dc->ioprio_writeback) {
+		return true;
+	}
+
 	return op_is_sync(bio->bi_opf) || in_use <= CUTOFF_WRITEBACK;
 }
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 07/19] bcache: introduce bcache sysfs entries for ioprio-based bypass/writeback hints
  2017-06-29 13:45   ` Christoph Hellwig
  2016-10-11 19:04     ` [PATCH 07/19] bcache: introduce bcache sysfs entries for ioprio-based bypass/writeback hints Eric Wheeler
@ 2016-10-11 19:04     ` Eric Wheeler
  2016-10-11 19:08     ` [PATCH 08/19] bcache: documentation for sysfs entries describing bcache cache hinting Eric Wheeler
                       ` (28 subsequent siblings)
  30 siblings, 0 replies; 120+ messages in thread
From: Eric Wheeler @ 2016-10-11 19:04 UTC (permalink / raw)
  To: linux-block

Add sysfs entries to support hinting for bypass/writeback by the ioprio
assigned to the bio.  If the bio is unassigned, use current's io-context
ioprio for cache writeback or bypass (configured per-process with
`ionice`).

Having idle IOs bypass the cache can increase performance elsewhere
since you probably don't care about their performance.  In addition,
this prevents idle IOs from promoting into (polluting) your cache and
evicting blocks that are more important elsewhere.

If you really need the performance at the expense of SSD wearout,
then configure ioprio_writeback and set your `ionice` appropriately.

For example:
	echo 2,7 > /sys/block/bcache0/bcache/ioprio_bypass
	echo 2,0 > /sys/block/bcache0/bcache/ioprio_writeback

See the documentation commit for details.

Signed-off-by: Eric Wheeler <bcache@linux.ewheeler.net>
Acked-by: Kent Overstreet <kent.overstreet@gmail.com>
Tested-by: Kai Krakow <kai@kaishome.de>
Cc: nix@esperi.org.uk
---
 drivers/md/bcache/bcache.h    |  3 ++
 drivers/md/bcache/request.c   | 24 +++++++++++++++
 drivers/md/bcache/sysfs.c     | 71 +++++++++++++++++++++++++++++++++++++++++++
 drivers/md/bcache/writeback.c |  8 +++++
 drivers/md/bcache/writeback.h | 24 +++++++++++++++
 5 files changed, 130 insertions(+)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index dee542f..44123e4 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -367,6 +367,9 @@ struct cached_dev {
 	unsigned		writeback_rate_update_seconds;
 	unsigned		writeback_rate_d_term;
 	unsigned		writeback_rate_p_term_inverse;
+
+	unsigned short		ioprio_writeback;
+	unsigned short		ioprio_bypass;
 };
 
 enum alloc_reserve {
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index d27707d..a95609f 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -373,6 +373,8 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
 	unsigned sectors, congested = bch_get_congested(c);
 	struct task_struct *task = current;
 	struct io *i;
+	struct io_context *ioc;
+	unsigned short ioprio;
 
 	if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
 	    c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
@@ -384,6 +386,28 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
 	     op_is_write(bio_op(bio))))
 		goto skip;
 
+	/* If the ioprio already exists on the bio, use that.  We assume that
+	 * the upper layer properly assigned the calling process's ioprio to
+	 * the bio being passed to bcache. Otherwise, use current's ioc. */
+	ioprio = bio_prio(bio);
+	if (!ioprio_valid(ioprio)) {
+		ioc = get_task_io_context(current, GFP_NOIO, NUMA_NO_NODE);
+		if (ioc) {
+			if (ioprio_valid(ioc->ioprio))
+				ioprio = ioc->ioprio;
+			put_io_context(ioc);
+			ioc = NULL;
+		}
+	}
+
+	/* If process ioprio is lower-or-equal to dc->ioprio_bypass, then
+	 * hint for bypass. Note that a lower-priority IO class+value
+	 * has a greater numeric value. */
+	if (ioprio_valid(ioprio) && ioprio_valid(dc->ioprio_writeback)
+		&& ioprio >= dc->ioprio_bypass) {
+		return true;
+	}
+
 	if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) ||
 	    bio_sectors(bio) & (c->sb.block_size - 1)) {
 		pr_debug("skipping unaligned io");
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index f90f136..cc0076d 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -107,6 +107,9 @@ rw_attribute(btree_shrinker_disabled);
 rw_attribute(copy_gc_enabled);
 rw_attribute(size);
 
+rw_attribute(ioprio_writeback);
+rw_attribute(ioprio_bypass);
+
 SHOW(__bch_cached_dev)
 {
 	struct cached_dev *dc = container_of(kobj, struct cached_dev,
@@ -183,6 +186,17 @@ SHOW(__bch_cached_dev)
 		return strlen(buf);
 	}
 
+	if (attr == &sysfs_ioprio_bypass)
+		return snprintf(buf, PAGE_SIZE-1, "%d,%ld\n",
+			IOPRIO_PRIO_CLASS(dc->ioprio_bypass),
+			IOPRIO_PRIO_DATA(dc->ioprio_bypass));
+
+	if (attr == &sysfs_ioprio_writeback)
+		return snprintf(buf, PAGE_SIZE-1, "%d,%ld\n",
+			IOPRIO_PRIO_CLASS(dc->ioprio_writeback),
+			IOPRIO_PRIO_DATA(dc->ioprio_writeback));
+
+
 #undef var
 	return 0;
 }
@@ -195,6 +209,10 @@ STORE(__cached_dev)
 	unsigned v = size;
 	struct cache_set *c;
 	struct kobj_uevent_env *env;
+	unsigned ioprio_class = 0; /* invalid initial ioprio values */
+	unsigned ioprio_level = IOPRIO_BE_NR;
+	unsigned short *ioprio_hint = NULL;
+	char *ioprio_type = NULL;
 
 #define d_strtoul(var)		sysfs_strtoul(var, dc->var)
 #define d_strtoul_nonzero(var)	sysfs_strtoul_clamp(var, dc->var, 1, INT_MAX)
@@ -283,6 +301,57 @@ STORE(__cached_dev)
 	if (attr == &sysfs_stop)
 		bcache_device_stop(&dc->disk);
 
+	/* ioprio hinting: we use ioprio_hint to reduce duplicate printk verbiage */
+	if (attr == &sysfs_ioprio_writeback) {
+		ioprio_hint = &dc->ioprio_writeback;
+		ioprio_type = "writeback";
+	}
+
+	if (attr == &sysfs_ioprio_bypass) {
+		ioprio_hint = &dc->ioprio_bypass;
+		ioprio_type = "bypass";
+	}
+
+	if (ioprio_hint != NULL)
+	{
+		if (sscanf(buf, "%u,%u", &ioprio_class, &ioprio_level) != 2
+			|| ioprio_class > IOPRIO_CLASS_IDLE
+			|| ioprio_level >= IOPRIO_BE_NR) {
+			pr_err("ioprio_%s invalid, expecting: (class,level) but parsed (%u,%u); ignored.",
+				ioprio_type,
+				ioprio_class, ioprio_level);
+			return size;
+		}
+
+		/* Use the maximum(/minimum) value in the class shift space to make integer
+		  comparison correct for ioprio_writeback(/ioprio_bypass) for IOPRIO_CLASS_IDLE.
+		  This is necessary because there are no ioprio levels for the idle class. */
+		if (ioprio_class == IOPRIO_CLASS_IDLE) {
+			if (ioprio_hint == &dc->ioprio_writeback)
+				ioprio_level = IOPRIO_PRIO_MASK;
+			else
+				/* Same, but 0 for bypass (inverted vs. writeback) */
+				ioprio_level = 0;
+		}
+
+		*ioprio_hint = IOPRIO_PRIO_VALUE(ioprio_class, ioprio_level);
+
+		if (!ioprio_valid(*ioprio_hint))
+			pr_info("disabled ioprio_%s hints.", ioprio_type);
+		else
+			pr_info("set hint for cache %s with priority %s: (class,level) = (%u,%u)",
+				ioprio_type,
+				( ioprio_hint == &dc->ioprio_writeback ? "at-or-above" : "at-or-below" ),
+				ioprio_class, ioprio_level);
+
+		if (ioprio_valid(dc->ioprio_writeback)
+			&& ioprio_valid(dc->ioprio_bypass)
+			&& dc->ioprio_writeback >= dc->ioprio_bypass)
+			pr_warning(
+				"warning: ioprio_writeback hint is neither disabled nor higher priority than the bypass hint; "
+				"will always writeback!");
+	}
+
 	return size;
 }
 
@@ -335,6 +404,8 @@ static struct attribute *bch_cached_dev_files[] = {
 	&sysfs_verify,
 	&sysfs_bypass_torture_test,
 #endif
+	&sysfs_ioprio_bypass,
+	&sysfs_ioprio_writeback,
 	NULL
 };
 KTYPE(bch_cached_dev);
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 42c66e7..3d463f0 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -511,6 +511,14 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
 	dc->writeback_rate_d_term	= 30;
 	dc->writeback_rate_p_term_inverse = 6000;
 
+	/* These defaults provide the best SSD life by enabling bypass
+	 for priorities at-or-below BE-7. This also provides better
+	 performance (cache hits) by preventing (near-)idle processes from
+	 polluting the cache working set.  Only set ioprio_writeback if
+	 you really need it: it will wear out your SSD sooner. */
+	dc->ioprio_writeback = IOPRIO_PRIO_VALUE(0, 0);
+	dc->ioprio_bypass    = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, (IOPRIO_BE_NR-1));
+
 	INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
 }
 
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 629bd1a..cd82fe8 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -43,6 +43,8 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
 				    unsigned cache_mode, bool would_skip)
 {
 	unsigned in_use = dc->disk.c->gc_stats.in_use;
+	struct io_context *ioc;
+	unsigned short ioprio;
 
 	if (cache_mode != CACHE_MODE_WRITEBACK ||
 	    test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
@@ -57,6 +59,28 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
 	if (would_skip)
 		return false;
 
+	/* If the ioprio already exists on the bio, use that.  We assume that
+	 * the upper layer properly assigned the calling process's ioprio to
+	 * the bio being passed to bcache. Otherwise, use current's ioc. */
+	ioprio = bio_prio(bio);
+	if (!ioprio_valid(ioprio)) {
+		ioc = get_task_io_context(current, GFP_NOIO, NUMA_NO_NODE);
+		if (ioc) {
+			if (ioprio_valid(ioc->ioprio))
+				ioprio = ioc->ioprio;
+			put_io_context(ioc);
+			ioc = NULL;
+		}
+	}
+
+	/* If process ioprio is higher-or-equal to dc->ioprio_writeback, then
+	 * hint for writeback. Note that a higher-priority IO class+value
+	 * has a lesser numeric value. */
+	if (ioprio_valid(ioprio) && ioprio_valid(dc->ioprio_writeback)
+		&& ioprio <= dc->ioprio_writeback) {
+		return true;
+	}
+
 	return op_is_sync(bio->bi_opf) || in_use <= CUTOFF_WRITEBACK;
 }
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 08/19] bcache: documentation for sysfs entries describing bcache cache hinting
  2017-06-29 13:45   ` Christoph Hellwig
  2016-10-11 19:04     ` [PATCH 07/19] bcache: introduce bcache sysfs entries for ioprio-based bypass/writeback hints Eric Wheeler
  2016-10-11 19:04     ` Eric Wheeler
@ 2016-10-11 19:08     ` Eric Wheeler
  2016-10-11 19:08     ` Eric Wheeler
                       ` (27 subsequent siblings)
  30 siblings, 0 replies; 120+ messages in thread
From: Eric Wheeler @ 2016-10-11 19:08 UTC (permalink / raw)
  To: linux-block

Signed-off-by: Eric Wheeler <bcache@linux.ewheeler.net>
---
 Documentation/bcache.txt | 80 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)

diff --git a/Documentation/bcache.txt b/Documentation/bcache.txt
index a9259b5..c78c012 100644
--- a/Documentation/bcache.txt
+++ b/Documentation/bcache.txt
@@ -133,6 +133,86 @@ the backing devices to passthrough mode.
    writeback mode). It currently doesn't do anything intelligent if it fails to
    read some of the dirty data, though.
 
+SSD LONGEVITY: PER-PROCESS CACHE HINTING WITH IO PRIORITY
+---------------------------------------------------------
+
+Processes can be assigned an IO priority using `ionice` and bcache will
+either try to writeback or bypass the cache based on the IO priority
+level assigned to the process and the configuration of the sysfs ioprio
+hints.  If configured properly for your workload, this can both increase
+performance and reduce SSD wear (erase/write cycles).
+
+Having idle IOs bypass the cache can increase performance elsewhere
+since you probably don't care about their performance.  In addition,
+this prevents idle IOs from promoting into (polluting) your cache and
+evicting blocks that are more important elsewhere.
+
+Default sysfs values:
+	2,7: ioprio_bypass is hinted for process IOs at-or-below best-effort-7.
+	0,0: ioprio_writeback hinting is disabled by default.
+
+Cache hinting is configured by writing 'class,level' pairs to sysfs.
+In this example, we write the following:
+
+    echo 2,7 > /sys/block/bcache0/bcache/ioprio_bypass
+    echo 2,0 > /sys/block/bcache0/bcache/ioprio_writeback
+
+Thus, processes with the following IO class (ionice -c) and level (-n)
+will behave as shown in this table:
+
+	(-c) IO Class    (-n) Class level       Action
+	-----------------------------------------------------
+	(1) Realtime      0-7                   Writeback
+	(2) Best-effort     0                   Writeback
+	(2) Best-effort   1-6                   Normal, as if hinting were disabled
+	(2) Best-effort     7                   Bypass cache
+	(3) Idle          n/a                   Bypass cache
+
+For processes at-or-below best-effort-7 (ionice -c2 -n7), the
+ioprio_bypass behavior is as follows:
+
+* Reads will come from the backing device and will not promote into
+  (pollute) your cache.  If the block being read was already in the cache,
+  then it will be read from the cache (and remain cached).
+
+* If you are using writeback mode, then low-priority bypass-hinted writes
+  will go directly to the backing device.  If the write was dirty in
+  cache, it will cache-invalidate and write directly to the backing
+  device.  If a high-priority task later writes the same block then it
+  will writeback so no performance is lost for write-after-write.
+
+  For read-after-bypassed-write, the block will be read from the backing
+  device (not cached) so there may be a miss penalty when a low-priority
+  process write bypasses the cache followed by a high-priority read that
+  would otherwise have hit.  In practice, this is not an issue; to date,
+  none have wanted low-priority writes and high-priority reads of the
+  same block.
+
+For processes in our example at-or-above best-effort-0 (ionice -c2 -n0),
+the ioprio_writeback behavior is as follows:
+
+* The writeback hint has no effect unless your 'cache_mode' is writeback.
+  Assuming writeback mode, all writes at this priority will writeback.
+  Of course this will increase SSD wear, so only use writeback hinting
+  if you need it.
+
+* Reads are unaffected by ioprio_writeback, except that read-after-write
+  will of course read from the cache.
+
+Linux assigns processes the best-effort class with a level of 4 if
+no priority is assigned.  Thus, without `ionice` your processes will
+follow normal bcache should_writeback/should_bypass semantics as if the
+ioprio_writeback/ioprio_bypass sysfs flags were disabled.
+
+Also note that in order to be hinted by ioprio_writeback/ioprio_bypass,
+the process must have a valid ioprio setting as returned by
+get_task_io_context()->ioprio. Thus, a process without an IO context
+will be ignored by the ioprio_writeback/ioprio_bypass hints even if your
+sysfs hints specify that best-effort-4 should be flagged for bypass
+or writeback.  If in doubt, explicitly set the process IO priority with
+`ionice`.
+
+See `man ionice` for more detail about per-process IO priority in Linux.
 
 HOWTO/COOKBOOK
 --------------
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 08/19] bcache: documentation for sysfs entries describing bcache cache hinting
  2017-06-29 13:45   ` Christoph Hellwig
                       ` (2 preceding siblings ...)
  2016-10-11 19:08     ` [PATCH 08/19] bcache: documentation for sysfs entries describing bcache cache hinting Eric Wheeler
@ 2016-10-11 19:08     ` Eric Wheeler
  2016-10-20  0:36     ` [PATCH 09/19] bcache: update bio->bi_opf bypass/writeback REQ_ flag hints Eric Wheeler
                       ` (26 subsequent siblings)
  30 siblings, 0 replies; 120+ messages in thread
From: Eric Wheeler @ 2016-10-11 19:08 UTC (permalink / raw)
  To: linux-block

Signed-off-by: Eric Wheeler <bcache@linux.ewheeler.net>
---
 Documentation/bcache.txt | 80 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)

diff --git a/Documentation/bcache.txt b/Documentation/bcache.txt
index a9259b5..c78c012 100644
--- a/Documentation/bcache.txt
+++ b/Documentation/bcache.txt
@@ -133,6 +133,86 @@ the backing devices to passthrough mode.
    writeback mode). It currently doesn't do anything intelligent if it fails to
    read some of the dirty data, though.
 
+SSD LONGEVITY: PER-PROCESS CACHE HINTING WITH IO PRIORITY
+---------------------------------------------------------
+
+Processes can be assigned an IO priority using `ionice` and bcache will
+either try to writeback or bypass the cache based on the IO priority
+level assigned to the process and the configuration of the sysfs ioprio
+hints.  If configured properly for your workload, this can both increase
+performance and reduce SSD wear (erase/write cycles).
+
+Having idle IOs bypass the cache can increase performance elsewhere
+since you probably don't care about their performance.  In addition,
+this prevents idle IOs from promoting into (polluting) your cache and
+evicting blocks that are more important elsewhere.
+
+Default sysfs values:
+	2,7: ioprio_bypass is hinted for process IOs at-or-below best-effort-7.
+	0,0: ioprio_writeback hinting is disabled by default.
+
+Cache hinting is configured by writing 'class,level' pairs to sysfs.
+In this example, we write the following:
+
+    echo 2,7 > /sys/block/bcache0/bcache/ioprio_bypass
+    echo 2,0 > /sys/block/bcache0/bcache/ioprio_writeback
+
+Thus, processes with the following IO class (ionice -c) and level (-n)
+will behave as shown in this table:
+
+	(-c) IO Class    (-n) Class level       Action
+	-----------------------------------------------------
+	(1) Realtime      0-7                   Writeback
+	(2) Best-effort     0                   Writeback
+	(2) Best-effort   1-6                   Normal, as if hinting were disabled
+	(2) Best-effort     7                   Bypass cache
+	(3) Idle          n/a                   Bypass cache
+
+For processes at-or-below best-effort-7 (ionice -c2 -n7), the
+ioprio_bypass behavior is as follows:
+
+* Reads will come from the backing device and will not promote into
+  (pollute) your cache.  If the block being read was already in the cache,
+  then it will be read from the cache (and remain cached).
+
+* If you are using writeback mode, then low-priority bypass-hinted writes
+  will go directly to the backing device.  If the write was dirty in
+  cache, it will cache-invalidate and write directly to the backing
+  device.  If a high-priority task later writes the same block then it
+  will writeback so no performance is lost for write-after-write.
+
+  For read-after-bypassed-write, the block will be read from the backing
+  device (not cached) so there may be a miss penalty when a low-priority
+  process write bypasses the cache followed by a high-priority read that
+  would otherwise have hit.  In practice, this is not an issue; to date,
+  none have wanted low-priority writes and high-priority reads of the
+  same block.
+
+For processes in our example at-or-above best-effort-0 (ionice -c2 -n0),
+the ioprio_writeback behavior is as follows:
+
+* The writeback hint has no effect unless your 'cache_mode' is writeback.
+  Assuming writeback mode, all writes at this priority will writeback.
+  Of course this will increase SSD wear, so only use writeback hinting
+  if you need it.
+
+* Reads are unaffected by ioprio_writeback, except that read-after-write
+  will of course read from the cache.
+
+Linux assigns processes the best-effort class with a level of 4 if
+no priority is assigned.  Thus, without `ionice` your processes will
+follow normal bcache should_writeback/should_bypass semantics as if the
+ioprio_writeback/ioprio_bypass sysfs flags were disabled.
+
+Also note that in order to be hinted by ioprio_writeback/ioprio_bypass,
+the process must have a valid ioprio setting as returned by
+get_task_io_context()->ioprio. Thus, a process without an IO context
+will be ignored by the ioprio_writeback/ioprio_bypass hints even if your
+sysfs hints specify that best-effort-4 should be flagged for bypass
+or writeback.  If in doubt, explicitly set the process IO priority with
+`ionice`.
+
+See `man ionice` for more detail about per-process IO priority in Linux.
 
 HOWTO/COOKBOOK
 --------------
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 09/19] bcache: update bio->bi_opf bypass/writeback REQ_ flag hints
  2017-06-29 13:45   ` Christoph Hellwig
                       ` (3 preceding siblings ...)
  2016-10-11 19:08     ` Eric Wheeler
@ 2016-10-20  0:36     ` Eric Wheeler
  2016-10-20  0:36     ` Eric Wheeler
                       ` (25 subsequent siblings)
  30 siblings, 0 replies; 120+ messages in thread
From: Eric Wheeler @ 2016-10-20  0:36 UTC (permalink / raw)
  To: linux-block

Bypass if:     bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND)

Writeback if:  op_is_sync(bio->bi_opf) || bio->bi_opf & (REQ_META|REQ_PRIO)

Signed-off-by: Eric Wheeler <bcache@linux.ewheeler.net>
---
 drivers/md/bcache/request.c   | 3 +++
 drivers/md/bcache/writeback.h | 3 ++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index a95609f..4629b0c 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -386,6 +386,9 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
 	     op_is_write(bio_op(bio))))
 		goto skip;
 
+	if (bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND))
+		goto skip;
+
 	/* If the ioprio already exists on the bio, use that.  We assume that
 	 * the upper layer properly assigned the calling process's ioprio to
 	 * the bio being passed to bcache. Otherwise, use current's ioc. */
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index cd82fe8..ea2f92e 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -81,7 +81,8 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
 		return true;
 	}
 
-	return op_is_sync(bio->bi_opf) || in_use <= CUTOFF_WRITEBACK;
+	return op_is_sync(bio->bi_opf) || bio->bi_opf & (REQ_META|REQ_PRIO)
+		|| in_use <= CUTOFF_WRITEBACK;
 }
 
 static inline void bch_writeback_queue(struct cached_dev *dc)
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 09/19] bcache: update bio->bi_opf bypass/writeback REQ_ flag hints
  2017-06-29 13:45   ` Christoph Hellwig
                       ` (4 preceding siblings ...)
  2016-10-20  0:36     ` [PATCH 09/19] bcache: update bio->bi_opf bypass/writeback REQ_ flag hints Eric Wheeler
@ 2016-10-20  0:36     ` Eric Wheeler
  2017-05-09 19:03     ` [PATCH 01/19] bcache: Fix leak of bdev reference Jan Kara
                       ` (24 subsequent siblings)
  30 siblings, 0 replies; 120+ messages in thread
From: Eric Wheeler @ 2016-10-20  0:36 UTC (permalink / raw)
  To: linux-block

Bypass if:     bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND)

Writeback if:  op_is_sync(bio->bi_opf) || bio->bi_opf & (REQ_META|REQ_PRIO)

Signed-off-by: Eric Wheeler <bcache@linux.ewheeler.net>
---
 drivers/md/bcache/request.c   | 3 +++
 drivers/md/bcache/writeback.h | 3 ++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index a95609f..4629b0c 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -386,6 +386,9 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
 	     op_is_write(bio_op(bio))))
 		goto skip;
 
+	if (bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND))
+		goto skip;
+
 	/* If the ioprio already exists on the bio, use that.  We assume that
 	 * the upper layer properly assigned the calling process's ioprio to
 	 * the bio being passed to bcache. Otherwise, use current's ioc. */
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index cd82fe8..ea2f92e 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -81,7 +81,8 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
 		return true;
 	}
 
-	return op_is_sync(bio->bi_opf) || in_use <= CUTOFF_WRITEBACK;
+	return op_is_sync(bio->bi_opf) || bio->bi_opf & (REQ_META|REQ_PRIO)
+		|| in_use <= CUTOFF_WRITEBACK;
 }
 
 static inline void bch_writeback_queue(struct cached_dev *dc)
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 01/19] bcache: Fix leak of bdev reference
  2017-06-29 13:45   ` Christoph Hellwig
                       ` (5 preceding siblings ...)
  2016-10-20  0:36     ` Eric Wheeler
@ 2017-05-09 19:03     ` Jan Kara
  2017-05-09 19:03     ` Jan Kara
                       ` (23 subsequent siblings)
  30 siblings, 0 replies; 120+ messages in thread
From: Jan Kara @ 2017-05-09 19:03 UTC (permalink / raw)
  To: linux-block

If blkdev_get_by_path() in register_bcache() fails, we try to lookup the
block device using lookup_bdev() to detect which situation we are in to
properly report error. However we never drop the reference returned to
us from lookup_bdev(). Fix that.

Signed-off-by: Jan Kara <jack@suse.cz>
Cc: stable@vger.kernel.org
---
 drivers/md/bcache/super.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 8352fad..9a2c190 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1964,6 +1964,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 			else
 				err = "device busy";
 			mutex_unlock(&bch_register_lock);
+			if (!IS_ERR(bdev))
+				bdput(bdev);
 			if (attr == &ksysfs_register_quiet)
 				goto out;
 		}
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 01/19] bcache: Fix leak of bdev reference
  2017-06-29 13:45   ` Christoph Hellwig
                       ` (6 preceding siblings ...)
  2017-05-09 19:03     ` [PATCH 01/19] bcache: Fix leak of bdev reference Jan Kara
@ 2017-05-09 19:03     ` Jan Kara
  2017-05-09 19:05     ` [PATCH 02/19] bcache: fix sequential large write IO bypass Tang Junhui
                       ` (22 subsequent siblings)
  30 siblings, 0 replies; 120+ messages in thread
From: Jan Kara @ 2017-05-09 19:03 UTC (permalink / raw)
  To: linux-block

If blkdev_get_by_path() in register_bcache() fails, we try to lookup the
block device using lookup_bdev() to detect which situation we are in to
properly report error. However we never drop the reference returned to
us from lookup_bdev(). Fix that.

Signed-off-by: Jan Kara <jack@suse.cz>
Cc: stable@vger.kernel.org
---
 drivers/md/bcache/super.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 8352fad..9a2c190 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1964,6 +1964,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 			else
 				err = "device busy";
 			mutex_unlock(&bch_register_lock);
+			if (!IS_ERR(bdev))
+				bdput(bdev);
 			if (attr == &ksysfs_register_quiet)
 				goto out;
 		}
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 02/19] bcache: fix sequential large write IO bypass
  2017-06-29 13:45   ` Christoph Hellwig
                       ` (7 preceding siblings ...)
  2017-05-09 19:03     ` Jan Kara
@ 2017-05-09 19:05     ` Tang Junhui
  2017-05-09 19:05     ` Tang Junhui
                       ` (21 subsequent siblings)
  30 siblings, 0 replies; 120+ messages in thread
From: Tang Junhui @ 2017-05-09 19:05 UTC (permalink / raw)
  To: linux-block

Sequential write IOs were tested with bs=1M by FIO in writeback cache
mode, these IOs were expected to be bypassed, but actually they did not.
We debug the code, and find in check_should_bypass():
    if (!congested &&
        mode == CACHE_MODE_WRITEBACK &&
        op_is_write(bio_op(bio)) &&
        (bio->bi_opf & REQ_SYNC))
        goto rescale
This means that in writeback mode, a write IO with the REQ_SYNC flag will
not be bypassed even though it is a sequential large IO. That is not the
correct behavior, so this patch removes that code.

Signed-off-by: tang.junhui <tang.junhui@zte.com.cn>
Reviewed-by: Kent Overstreet <kent.overstreet@gmail.com>
Reviewed-by: Eric Wheeler <bcache@linux.ewheeler.net>
Cc: stable@vger.kernel.org
---
 drivers/md/bcache/request.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 019b3df..958072a 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -400,12 +400,6 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
 	if (!congested && !dc->sequential_cutoff)
 		goto rescale;
 
-	if (!congested &&
-	    mode == CACHE_MODE_WRITEBACK &&
-	    op_is_write(bio->bi_opf) &&
-	    op_is_sync(bio->bi_opf))
-		goto rescale;
-
 	spin_lock(&dc->io_lock);
 
 	hlist_for_each_entry(i, iohash(dc, bio->bi_iter.bi_sector), hash)
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 02/19] bcache: fix sequential large write IO bypass
  2017-06-29 13:45   ` Christoph Hellwig
                       ` (8 preceding siblings ...)
  2017-05-09 19:05     ` [PATCH 02/19] bcache: fix sequential large write IO bypass Tang Junhui
@ 2017-05-09 19:05     ` Tang Junhui
  2017-05-09 19:07     ` [PATCH 03/19] bcache: do not subtract sectors_to_gc for bypassed IO Tang Junhui
                       ` (20 subsequent siblings)
  30 siblings, 0 replies; 120+ messages in thread
From: Tang Junhui @ 2017-05-09 19:05 UTC (permalink / raw)
  To: linux-block

Sequential write IOs were tested with bs=1M by FIO in writeback cache
mode, these IOs were expected to be bypassed, but actually they did not.
We debug the code, and find in check_should_bypass():
    if (!congested &&
        mode == CACHE_MODE_WRITEBACK &&
        op_is_write(bio_op(bio)) &&
        (bio->bi_opf & REQ_SYNC))
        goto rescale
This means that in writeback mode, a write IO with the REQ_SYNC flag will
not be bypassed even though it is a sequential large IO. That is not the
correct behavior, so this patch removes that code.

Signed-off-by: tang.junhui <tang.junhui@zte.com.cn>
Reviewed-by: Kent Overstreet <kent.overstreet@gmail.com>
Reviewed-by: Eric Wheeler <bcache@linux.ewheeler.net>
Cc: stable@vger.kernel.org
---
 drivers/md/bcache/request.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 019b3df..958072a 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -400,12 +400,6 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
 	if (!congested && !dc->sequential_cutoff)
 		goto rescale;
 
-	if (!congested &&
-	    mode == CACHE_MODE_WRITEBACK &&
-	    op_is_write(bio->bi_opf) &&
-	    op_is_sync(bio->bi_opf))
-		goto rescale;
-
 	spin_lock(&dc->io_lock);
 
 	hlist_for_each_entry(i, iohash(dc, bio->bi_iter.bi_sector), hash)
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 03/19] bcache: do not subtract sectors_to_gc for bypassed IO
  2017-06-29 13:45   ` Christoph Hellwig
                       ` (9 preceding siblings ...)
  2017-05-09 19:05     ` Tang Junhui
@ 2017-05-09 19:07     ` Tang Junhui
  2017-05-09 19:07     ` Tang Junhui
                       ` (19 subsequent siblings)
  30 siblings, 0 replies; 120+ messages in thread
From: Tang Junhui @ 2017-05-09 19:07 UTC (permalink / raw)
  To: linux-block

Since bypassed IOs use no bucket, do not subtract their sectors from
sectors_to_gc when deciding whether to trigger the gc thread.

Signed-off-by: tang.junhui <tang.junhui@zte.com.cn>
Reviewed-by: Eric Wheeler <bcache@linux.ewheeler.net>
Cc: stable@vger.kernel.org
---
 drivers/md/bcache/request.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 958072a..4b413db 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -196,12 +196,12 @@ static void bch_data_insert_start(struct closure *cl)
 	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
 	struct bio *bio = op->bio, *n;
 
-	if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0)
-		wake_up_gc(op->c);
-
 	if (op->bypass)
 		return bch_data_invalidate(cl);
 
+	if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0)
+		wake_up_gc(op->c);
+
 	/*
 	 * Journal writes are marked REQ_PREFLUSH; if the original write was a
 	 * flush, it'll wait on the journal write.
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 03/19] bcache: do not subtract sectors_to_gc for bypassed IO
  2017-06-29 13:45   ` Christoph Hellwig
                       ` (10 preceding siblings ...)
  2017-05-09 19:07     ` [PATCH 03/19] bcache: do not subtract sectors_to_gc for bypassed IO Tang Junhui
@ 2017-05-09 19:07     ` Tang Junhui
  2017-05-09 19:14     ` [PATCH 05/19] bcache: fix calling ida_simple_remove() with incorrect minor Tang Junhui
                       ` (18 subsequent siblings)
  30 siblings, 0 replies; 120+ messages in thread
From: Tang Junhui @ 2017-05-09 19:07 UTC (permalink / raw)
  To: linux-block

Since bypassed IOs use no bucket, do not subtract their sectors from
sectors_to_gc when deciding whether to trigger the gc thread.

Signed-off-by: tang.junhui <tang.junhui@zte.com.cn>
Reviewed-by: Eric Wheeler <bcache@linux.ewheeler.net>
Cc: stable@vger.kernel.org
---
 drivers/md/bcache/request.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 958072a..4b413db 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -196,12 +196,12 @@ static void bch_data_insert_start(struct closure *cl)
 	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
 	struct bio *bio = op->bio, *n;
 
-	if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0)
-		wake_up_gc(op->c);
-
 	if (op->bypass)
 		return bch_data_invalidate(cl);
 
+	if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0)
+		wake_up_gc(op->c);
+
 	/*
 	 * Journal writes are marked REQ_PREFLUSH; if the original write was a
 	 * flush, it'll wait on the journal write.
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 05/19] bcache: fix calling ida_simple_remove() with incorrect minor
  2017-06-29 13:45   ` Christoph Hellwig
                       ` (12 preceding siblings ...)
  2017-05-09 19:14     ` [PATCH 05/19] bcache: fix calling ida_simple_remove() with incorrect minor Tang Junhui
@ 2017-05-09 19:14     ` Tang Junhui
  2017-06-01  8:48     ` [PATCH 10/19] bcache: initialize stripe_sectors_dirty correctly for thin flash device Tang Junhui
                       ` (16 subsequent siblings)
  30 siblings, 0 replies; 120+ messages in thread
From: Tang Junhui @ 2017-05-09 19:14 UTC (permalink / raw)
  To: linux-block

bcache called ida_simple_remove() with a minor that had already been
multiplied by BCACHE_MINORS, causing the wrong minor to be released and
the original one to be leaked.

In addition, when adding partition support to bcache, the name assignment
was not updated, resulting in numbers jumping (bcache0, bcache16,
bcache32...). This has been fixed implicitly by the rework.

Signed-off-by: tang.junhui <tang.junhui@zte.com.cn>
Reviewed-by: Coly Li <colyli@suse.de>
Reviewed-by: Eric Wheeler <bcache@linux.ewheeler.net>
Cc: stable@vger.kernel.org # 4.10
Cc: Stefan Bader <stefan.bader@canonical.com>
Fixes: b8c0d91 (bcache: partition support: add 16 minors per bcacheN device)
BugLink: https://bugs.launchpad.net/bugs/1667078
---
 drivers/md/bcache/super.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 9a2c190..48b8c20 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -58,7 +58,10 @@ static wait_queue_head_t unregister_wait;
 struct workqueue_struct *bcache_wq;
 
 #define BTREE_MAX_PAGES		(256 * 1024 / PAGE_SIZE)
-#define BCACHE_MINORS		16 /* partition support */
+#define BCACHE_MINORS_BITS                4 /* bcache partition support */
+#define BCACHE_MINORS                     (1 << BCACHE_MINORS_BITS)
+#define BCACHE_TO_IDA_MINORS(first_minor) ((first_minor) >> BCACHE_MINORS_BITS)
+#define IDA_TO_BCACHE_MINORS(minor)       ((minor) << BCACHE_MINORS_BITS)
 
 /* Superblock */
 
@@ -734,7 +737,8 @@ static void bcache_device_free(struct bcache_device *d)
 	if (d->disk && d->disk->queue)
 		blk_cleanup_queue(d->disk->queue);
 	if (d->disk) {
-		ida_simple_remove(&bcache_minor, d->disk->first_minor);
+		ida_simple_remove(&bcache_minor,
+			BCACHE_TO_IDA_MINORS(d->disk->first_minor));
 		put_disk(d->disk);
 	}
 
@@ -776,11 +780,11 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
 	if (!d->full_dirty_stripes)
 		return -ENOMEM;
 
-	minor = ida_simple_get(&bcache_minor, 0, MINORMASK + 1, GFP_KERNEL);
+	minor = ida_simple_get(&bcache_minor, 0,
+		BCACHE_TO_IDA_MINORS(MINORMASK) + 1, GFP_KERNEL);
 	if (minor < 0)
 		return minor;
 
-	minor *= BCACHE_MINORS;
 
 	if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio),
 					   BIOSET_NEED_BVECS |
@@ -794,7 +798,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
 	snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", minor);
 
 	d->disk->major		= bcache_major;
-	d->disk->first_minor	= minor;
+	d->disk->first_minor	= IDA_TO_BCACHE_MINORS(minor);
 	d->disk->fops		= &bcache_ops;
 	d->disk->private_data	= d;
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 05/19] bcache: fix calling ida_simple_remove() with incorrect minor
  2017-06-29 13:45   ` Christoph Hellwig
                       ` (11 preceding siblings ...)
  2017-05-09 19:07     ` Tang Junhui
@ 2017-05-09 19:14     ` Tang Junhui
  2017-05-09 19:14     ` Tang Junhui
                       ` (17 subsequent siblings)
  30 siblings, 0 replies; 120+ messages in thread
From: Tang Junhui @ 2017-05-09 19:14 UTC (permalink / raw)
  To: linux-block

bcache called ida_simple_remove() with a minor that had already been
multiplied by BCACHE_MINORS, causing the wrong minor to be released and
the original one to be leaked.

In addition, when adding partition support to bcache, the name assignment
was not updated, resulting in numbers jumping (bcache0, bcache16,
bcache32...). This has been fixed implicitly by the rework.

Signed-off-by: tang.junhui <tang.junhui@zte.com.cn>
Reviewed-by: Coly Li <colyli@suse.de>
Reviewed-by: Eric Wheeler <bcache@linux.ewheeler.net>
Cc: stable@vger.kernel.org # 4.10
Cc: Stefan Bader <stefan.bader@canonical.com>
Fixes: b8c0d91 (bcache: partition support: add 16 minors per bcacheN device)
BugLink: https://bugs.launchpad.net/bugs/1667078
---
 drivers/md/bcache/super.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 9a2c190..48b8c20 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -58,7 +58,10 @@ static wait_queue_head_t unregister_wait;
 struct workqueue_struct *bcache_wq;
 
 #define BTREE_MAX_PAGES		(256 * 1024 / PAGE_SIZE)
-#define BCACHE_MINORS		16 /* partition support */
+#define BCACHE_MINORS_BITS                4 /* bcache partition support */
+#define BCACHE_MINORS                     (1 << BCACHE_MINORS_BITS)
+#define BCACHE_TO_IDA_MINORS(first_minor) ((first_minor) >> BCACHE_MINORS_BITS)
+#define IDA_TO_BCACHE_MINORS(minor)       ((minor) << BCACHE_MINORS_BITS)
 
 /* Superblock */
 
@@ -734,7 +737,8 @@ static void bcache_device_free(struct bcache_device *d)
 	if (d->disk && d->disk->queue)
 		blk_cleanup_queue(d->disk->queue);
 	if (d->disk) {
-		ida_simple_remove(&bcache_minor, d->disk->first_minor);
+		ida_simple_remove(&bcache_minor,
+			BCACHE_TO_IDA_MINORS(d->disk->first_minor));
 		put_disk(d->disk);
 	}
 
@@ -776,11 +780,11 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
 	if (!d->full_dirty_stripes)
 		return -ENOMEM;
 
-	minor = ida_simple_get(&bcache_minor, 0, MINORMASK + 1, GFP_KERNEL);
+	minor = ida_simple_get(&bcache_minor, 0,
+		BCACHE_TO_IDA_MINORS(MINORMASK) + 1, GFP_KERNEL);
 	if (minor < 0)
 		return minor;
 
-	minor *= BCACHE_MINORS;
 
 	if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio),
 					   BIOSET_NEED_BVECS |
@@ -794,7 +798,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
 	snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", minor);
 
 	d->disk->major		= bcache_major;
-	d->disk->first_minor	= minor;
+	d->disk->first_minor	= IDA_TO_BCACHE_MINORS(minor);
 	d->disk->fops		= &bcache_ops;
 	d->disk->private_data	= d;
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PULL] bcache updates based on git.kernel.dk/linux-block:for-next
@ 2017-05-25 19:10 Eric Wheeler
  2017-06-28 23:06   ` Eric Wheeler
  2017-07-14 11:40 ` [PULL] bcache updates based on git.kernel.dk/linux-block:for-next Eddie Chapman
  0 siblings, 2 replies; 120+ messages in thread
From: Eric Wheeler @ 2017-05-25 19:10 UTC (permalink / raw)
  To: Jens Axboe
  Cc: linux-bcache, Jan Kara, stable, tang.junhui, Kent Overstreet,
	Coly Li, Stefan Bader, Liang Chen, nix

Hi Jens,

Please pull these updates and bugfixes from the bcache community when you 
have a minute.  If you need a rebase against something else then please 
let me know and I would be happy to update for you.

Thank you for your help!



The following changes since commit 0789bd7bdb5bd036fe3df96c19528f46127a0160:

  Merge branch 'for-linus' into for-next (2017-05-10 07:40:47 -0600)

are available in the git repository at:

  https://bitbucket.org/ewheelerinc/linux.git bcache-updates-linux-block-for-next

for you to fetch changes up to 8a02c3d571b895931db7a2700f05d8c70a7c6cb2:

  bcache: update bio->bi_opf bypass/writeback REQ_ flag hints (2017-05-11 12:29:08 -0700)

----------------------------------------------------------------
Jan Kara (1):
      bcache: Fix leak of bdev reference

Liang Chen (1):
      bcache: explicitly destory mutex while exiting

tang.junhui (4):
      bcache: fix sequential large write IO bypass
      bcache: do not subtract sectors_to_gc for bypassed IO
      bcache: fix wrong cache_misses statistics
      bcache: fix calling ida_simple_remove() with incorrect minor

Eric Wheeler (3):
      bcache: introduce bcache sysfs entries for ioprio-based bypass/writeback hints
      bcache: documentation for sysfs entries describing bcache cache hinting
      bcache: update bio->bi_opf bypass/writeback REQ_ flag hints


 Documentation/bcache.txt      | 80 ++++++++++++++++++++++++++++++++++++++++
 drivers/md/bcache/bcache.h    |  3 ++
 drivers/md/bcache/request.c   | 45 +++++++++++++++++-----
 drivers/md/bcache/super.c     | 18 ++++++---
 drivers/md/bcache/sysfs.c     | 71 +++++++++++++++++++++++++++++++++++
 drivers/md/bcache/writeback.c |  8 ++++
 drivers/md/bcache/writeback.h | 27 +++++++++++++-
 7 files changed, 236 insertions(+), 16 deletions(-)

^ permalink raw reply	[flat|nested] 120+ messages in thread

* [PATCH 10/19] bcache: initialize stripe_sectors_dirty correctly for thin flash device
  2017-06-29 13:45   ` Christoph Hellwig
                       ` (14 preceding siblings ...)
  2017-06-01  8:48     ` [PATCH 10/19] bcache: initialize stripe_sectors_dirty correctly for thin flash device Tang Junhui
@ 2017-06-01  8:48     ` Tang Junhui
  2017-06-12 21:18     ` [PATCH 14/19] bcache: Correct return value for sysfs attach errors Tony Asleson
                       ` (14 subsequent siblings)
  30 siblings, 0 replies; 120+ messages in thread
From: Tang Junhui @ 2017-06-01  8:48 UTC (permalink / raw)
  To: linux-block

The thin flash device does not initialize stripe_sectors_dirty correctly;
this patch fixes that issue.

Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
Cc: stable@vger.kernel.org
---
 drivers/md/bcache/super.c     | 3 ++-
 drivers/md/bcache/writeback.c | 8 ++++----
 drivers/md/bcache/writeback.h | 2 +-
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 1f84791..e06641e 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1030,7 +1030,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
 	}
 
 	if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
-		bch_sectors_dirty_init(dc);
+		bch_sectors_dirty_init(&dc->disk);
 		atomic_set(&dc->has_dirty, 1);
 		atomic_inc(&dc->count);
 		bch_writeback_queue(dc);
@@ -1232,6 +1232,7 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
 		goto err;
 
 	bcache_device_attach(d, c, u - c->uuids);
+	bch_sectors_dirty_init(d);
 	bch_flash_dev_request_init(d);
 	add_disk(d->disk);
 
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 3d463f0..4ac8b13 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -482,17 +482,17 @@ static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
 	return MAP_CONTINUE;
 }
 
-void bch_sectors_dirty_init(struct cached_dev *dc)
+void bch_sectors_dirty_init(struct bcache_device *d)
 {
 	struct sectors_dirty_init op;
 
 	bch_btree_op_init(&op.op, -1);
-	op.inode = dc->disk.id;
+	op.inode = d->id;
 
-	bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0),
+	bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0),
 			   sectors_dirty_init_fn, 0);
 
-	dc->disk.sectors_dirty_last = bcache_dev_sectors_dirty(&dc->disk);
+	d->sectors_dirty_last = bcache_dev_sectors_dirty(d);
 }
 
 void bch_cached_dev_writeback_init(struct cached_dev *dc)
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index ea2f92e..c2ab4b4 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -109,7 +109,7 @@ static inline void bch_writeback_add(struct cached_dev *dc)
 
 void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int);
 
-void bch_sectors_dirty_init(struct cached_dev *dc);
+void bch_sectors_dirty_init(struct bcache_device *);
 void bch_cached_dev_writeback_init(struct cached_dev *);
 int bch_cached_dev_writeback_start(struct cached_dev *);
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 10/19] bcache: initialize stripe_sectors_dirty correctly for thin flash device
  2017-06-29 13:45   ` Christoph Hellwig
                       ` (13 preceding siblings ...)
  2017-05-09 19:14     ` Tang Junhui
@ 2017-06-01  8:48     ` Tang Junhui
  2017-06-01  8:48     ` Tang Junhui
                       ` (15 subsequent siblings)
  30 siblings, 0 replies; 120+ messages in thread
From: Tang Junhui @ 2017-06-01  8:48 UTC (permalink / raw)
  To: linux-block

The thin flash device does not initialize stripe_sectors_dirty correctly;
this patch fixes that issue.

Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
Cc: stable@vger.kernel.org
---
 drivers/md/bcache/super.c     | 3 ++-
 drivers/md/bcache/writeback.c | 8 ++++----
 drivers/md/bcache/writeback.h | 2 +-
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 1f84791..e06641e 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1030,7 +1030,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
 	}
 
 	if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
-		bch_sectors_dirty_init(dc);
+		bch_sectors_dirty_init(&dc->disk);
 		atomic_set(&dc->has_dirty, 1);
 		atomic_inc(&dc->count);
 		bch_writeback_queue(dc);
@@ -1232,6 +1232,7 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
 		goto err;
 
 	bcache_device_attach(d, c, u - c->uuids);
+	bch_sectors_dirty_init(d);
 	bch_flash_dev_request_init(d);
 	add_disk(d->disk);
 
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 3d463f0..4ac8b13 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -482,17 +482,17 @@ static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
 	return MAP_CONTINUE;
 }
 
-void bch_sectors_dirty_init(struct cached_dev *dc)
+void bch_sectors_dirty_init(struct bcache_device *d)
 {
 	struct sectors_dirty_init op;
 
 	bch_btree_op_init(&op.op, -1);
-	op.inode = dc->disk.id;
+	op.inode = d->id;
 
-	bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0),
+	bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0),
 			   sectors_dirty_init_fn, 0);
 
-	dc->disk.sectors_dirty_last = bcache_dev_sectors_dirty(&dc->disk);
+	d->sectors_dirty_last = bcache_dev_sectors_dirty(d);
 }
 
 void bch_cached_dev_writeback_init(struct cached_dev *dc)
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index ea2f92e..c2ab4b4 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -109,7 +109,7 @@ static inline void bch_writeback_add(struct cached_dev *dc)
 
 void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int);
 
-void bch_sectors_dirty_init(struct cached_dev *dc);
+void bch_sectors_dirty_init(struct bcache_device *);
 void bch_cached_dev_writeback_init(struct cached_dev *);
 int bch_cached_dev_writeback_start(struct cached_dev *);
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 14/19] bcache: Correct return value for sysfs attach errors
  2017-06-29 13:45   ` Christoph Hellwig
                       ` (16 preceding siblings ...)
  2017-06-12 21:18     ` [PATCH 14/19] bcache: Correct return value for sysfs attach errors Tony Asleson
@ 2017-06-12 21:18     ` Tony Asleson
  2017-06-28  0:30     ` [PATCH 13/19] bcache: delete redundant calling set_gc_sectors() Tang Junhui
                       ` (12 subsequent siblings)
  30 siblings, 0 replies; 120+ messages in thread
From: Tony Asleson @ 2017-06-12 21:18 UTC (permalink / raw)
  To: linux-block

If bch_cached_dev_attach encounters an error, it returns a negative
error code.  However, the variable 'v' which stores the result is
unsigned, so user space sees a very large value returned for bytes
written, which can cause incorrect user space behavior.  Use a single
signed variable throughout the function to preserve the ability to
return errors.

Signed-off-by: Tony Asleson <tasleson@redhat.com>
Acked-by: Coly Li <colyli@suse.de>
Cc: stable@vger.kernel.org
---
 drivers/md/bcache/sysfs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index cc0076d..7579ca6 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -206,7 +206,7 @@ STORE(__cached_dev)
 {
 	struct cached_dev *dc = container_of(kobj, struct cached_dev,
 					     disk.kobj);
-	unsigned v = size;
+	ssize_t v = size;
 	struct cache_set *c;
 	struct kobj_uevent_env *env;
 	unsigned ioprio_class = 0; /* invalid initial ioprio values */
@@ -245,7 +245,7 @@ STORE(__cached_dev)
 		bch_cached_dev_run(dc);
 
 	if (attr == &sysfs_cache_mode) {
-		ssize_t v = bch_read_string_list(buf, bch_cache_modes + 1);
+		v = bch_read_string_list(buf, bch_cache_modes + 1);
 
 		if (v < 0)
 			return v;
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 14/19] bcache: Correct return value for sysfs attach errors
  2017-06-29 13:45   ` Christoph Hellwig
                       ` (15 preceding siblings ...)
  2017-06-01  8:48     ` Tang Junhui
@ 2017-06-12 21:18     ` Tony Asleson
  2017-06-12 21:18     ` Tony Asleson
                       ` (13 subsequent siblings)
  30 siblings, 0 replies; 120+ messages in thread
From: Tony Asleson @ 2017-06-12 21:18 UTC (permalink / raw)
  To: linux-block

If bch_cached_dev_attach encounters an error, it returns a negative
error code.  However, the variable 'v' which stores the result is
unsigned, so user space sees a very large value returned for bytes
written, which can cause incorrect user space behavior.  Use a single
signed variable throughout the function to preserve the ability to
return errors.

Signed-off-by: Tony Asleson <tasleson@redhat.com>
Acked-by: Coly Li <colyli@suse.de>
Cc: stable@vger.kernel.org
---
 drivers/md/bcache/sysfs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index cc0076d..7579ca6 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -206,7 +206,7 @@ STORE(__cached_dev)
 {
 	struct cached_dev *dc = container_of(kobj, struct cached_dev,
 					     disk.kobj);
-	unsigned v = size;
+	ssize_t v = size;
 	struct cache_set *c;
 	struct kobj_uevent_env *env;
 	unsigned ioprio_class = 0; /* invalid initial ioprio values */
@@ -245,7 +245,7 @@ STORE(__cached_dev)
 		bch_cached_dev_run(dc);
 
 	if (attr == &sysfs_cache_mode) {
-		ssize_t v = bch_read_string_list(buf, bch_cache_modes + 1);
+		v = bch_read_string_list(buf, bch_cache_modes + 1);
 
 		if (v < 0)
 			return v;
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 13/19] bcache: delete redundant calling set_gc_sectors()
  2017-06-29 13:45   ` Christoph Hellwig
                       ` (17 preceding siblings ...)
  2017-06-12 21:18     ` Tony Asleson
@ 2017-06-28  0:30     ` Tang Junhui
  2017-06-28  0:30     ` Tang Junhui
                       ` (11 subsequent siblings)
  30 siblings, 0 replies; 120+ messages in thread
From: Tang Junhui @ 2017-06-28  0:30 UTC (permalink / raw)
  To: linux-block

set_gc_sectors() is already called in bch_gc_thread(), and it is called
again in bch_btree_gc_finish(). The latter call is redundant, so
delete it.

Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
---
 drivers/md/bcache/btree.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 77aa20b..66d8036 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1662,7 +1662,6 @@ static size_t bch_btree_gc_finish(struct cache_set *c)
 
 	mutex_lock(&c->bucket_lock);
 
-	set_gc_sectors(c);
 	c->gc_mark_valid = 1;
 	c->need_gc	= 0;
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 13/19] bcache: delete redundant calling set_gc_sectors()
  2017-06-29 13:45   ` Christoph Hellwig
                       ` (18 preceding siblings ...)
  2017-06-28  0:30     ` [PATCH 13/19] bcache: delete redundant calling set_gc_sectors() Tang Junhui
@ 2017-06-28  0:30     ` Tang Junhui
  2017-06-28  0:37     ` [PATCH 16/19] bcache: increase the number of open buckets Tang Junhui
                       ` (10 subsequent siblings)
  30 siblings, 0 replies; 120+ messages in thread
From: Tang Junhui @ 2017-06-28  0:30 UTC (permalink / raw)
  To: linux-block

set_gc_sectors() has already been called in bch_gc_thread(), and it was
called again in bch_btree_gc_finish(). The latter call is unnecessary,
so delete it.

Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
---
 drivers/md/bcache/btree.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 77aa20b..66d8036 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1662,7 +1662,6 @@ static size_t bch_btree_gc_finish(struct cache_set *c)
 
 	mutex_lock(&c->bucket_lock);
 
-	set_gc_sectors(c);
 	c->gc_mark_valid = 1;
 	c->need_gc	= 0;
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 16/19] bcache: increase the number of open buckets
  2017-06-29 13:45   ` Christoph Hellwig
                       ` (19 preceding siblings ...)
  2017-06-28  0:30     ` Tang Junhui
@ 2017-06-28  0:37     ` Tang Junhui
  2017-06-28  0:37     ` Tang Junhui
                       ` (9 subsequent siblings)
  30 siblings, 0 replies; 120+ messages in thread
From: Tang Junhui @ 2017-06-28  0:37 UTC (permalink / raw)
  To: linux-block

Currently, we only allocate 6 open buckets for each cache set,
but we usually attach about 10 or so backend devices to each
cache set, and each bcache device is accessed by about 10 or so
threads in the top application layer. So 6 open buckets are too
few; it has led to threads writing data to different buckets,
which causes low write-back efficiency, makes bucket usage
inefficient, and makes it very easy to run out of open buckets.

I add debug message in bch_open_buckets_alloc() to print alloc bucket
info, and test with ten bcache devices with a cache set, and each
bcache device is accessed by ten threads.

From the debug messages, we can see that, after the modification, one
bucket is more likely to be assigned to the same thread, and the data
from the same thread are more likely to be written to the same bucket.
Usually the same thread always writes/reads the same backend device,
so this is good for write-back and also improves the usage efficiency
of buckets.

Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
---
 drivers/md/bcache/alloc.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index ca4abe1..cacbe2d 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -68,6 +68,8 @@
 #include <linux/random.h>
 #include <trace/events/bcache.h>
 
+#define MAX_OPEN_BUCKETS 128
+
 /* Bucket heap / gen */
 
 uint8_t bch_inc_gen(struct cache *ca, struct bucket *b)
@@ -671,7 +673,7 @@ int bch_open_buckets_alloc(struct cache_set *c)
 
 	spin_lock_init(&c->data_bucket_lock);
 
-	for (i = 0; i < 6; i++) {
+	for (i = 0; i < MAX_OPEN_BUCKETS; i++) {
 		struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL);
 		if (!b)
 			return -ENOMEM;
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 16/19] bcache: increase the number of open buckets
  2017-06-29 13:45   ` Christoph Hellwig
                       ` (20 preceding siblings ...)
  2017-06-28  0:37     ` [PATCH 16/19] bcache: increase the number of open buckets Tang Junhui
@ 2017-06-28  0:37     ` Tang Junhui
  2017-06-28  0:41     ` [PATCH 17/19] bcache: fix for gc and write-back race Tang Junhui
                       ` (8 subsequent siblings)
  30 siblings, 0 replies; 120+ messages in thread
From: Tang Junhui @ 2017-06-28  0:37 UTC (permalink / raw)
  To: linux-block

Currently, we only allocate 6 open buckets for each cache set,
but we usually attach about 10 or so backend devices to each
cache set, and each bcache device is accessed by about 10 or so
threads in the top application layer. So 6 open buckets are too
few; it has led to threads writing data to different buckets,
which causes low write-back efficiency, makes bucket usage
inefficient, and makes it very easy to run out of open buckets.

I add debug message in bch_open_buckets_alloc() to print alloc bucket
info, and test with ten bcache devices with a cache set, and each
bcache device is accessed by ten threads.

From the debug messages, we can see that, after the modification, one
bucket is more likely to be assigned to the same thread, and the data
from the same thread are more likely to be written to the same bucket.
Usually the same thread always writes/reads the same backend device,
so this is good for write-back and also improves the usage efficiency
of buckets.

Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
---
 drivers/md/bcache/alloc.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index ca4abe1..cacbe2d 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -68,6 +68,8 @@
 #include <linux/random.h>
 #include <trace/events/bcache.h>
 
+#define MAX_OPEN_BUCKETS 128
+
 /* Bucket heap / gen */
 
 uint8_t bch_inc_gen(struct cache *ca, struct bucket *b)
@@ -671,7 +673,7 @@ int bch_open_buckets_alloc(struct cache_set *c)
 
 	spin_lock_init(&c->data_bucket_lock);
 
-	for (i = 0; i < 6; i++) {
+	for (i = 0; i < MAX_OPEN_BUCKETS; i++) {
 		struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL);
 		if (!b)
 			return -ENOMEM;
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 17/19] bcache: fix for gc and write-back race
  2017-06-29 13:45   ` Christoph Hellwig
                       ` (22 preceding siblings ...)
  2017-06-28  0:41     ` [PATCH 17/19] bcache: fix for gc and write-back race Tang Junhui
@ 2017-06-28  0:41     ` Tang Junhui
  2017-06-28 11:47     ` [PATCH 18/19] bcache: silence static checker warning Dan Carpenter
                       ` (6 subsequent siblings)
  30 siblings, 0 replies; 120+ messages in thread
From: Tang Junhui @ 2017-06-28  0:41 UTC (permalink / raw)
  To: linux-block

gc and write-back can race (see the email "bcache get stucked" I sent
before):
gc thread						write-back thread
|							|bch_writeback_thread()
|bch_gc_thread()					|
|							|==>read_dirty()
|==>bch_btree_gc()					|
|==>btree_root() //get btree root			|
|			node write locker		|
|==>bch_btree_gc_root()					|
|							|==>read_dirty_submit()
|							|==>write_dirty()
|							|==>continue_at(cl, write_dirty_finish, system_wq);
|							|==>write_dirty_finish()//excute in system_wq
|							|==>bch_btree_insert()
|							|==>bch_btree_map_leaf_nodes()
|							|==>__bch_btree_map_nodes()
|							|==>btree_root //try to get btree root node read lock
|							|-----stuck here
|==>bch_btree_set_root()				|
|==>bch_journal_meta()					|
|==>bch_journal()					|
|==>journal_try_write()					|
|==>journal_write_unlocked() //journal_full(&c->journal) condition satisfied
|==>continue_at(cl, journal_write, system_wq); //try to excute journal_write in system_wq
|					//but work queue is excuting write_dirty_finish()
|==>closure_sync(); //wait journal_write execute over and wake up gc,
|			--stuck here
|==>release root node write locker

This patch alloc a separate work-queue for write-back thread to avoid such
race.

Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
Cc: stable@vger.kernel.org
---
 drivers/md/bcache/bcache.h    | 1 +
 drivers/md/bcache/super.c     | 2 ++
 drivers/md/bcache/writeback.c | 8 ++++++--
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 44123e4..deb0a6c 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -333,6 +333,7 @@ struct cached_dev {
 	/* Limit number of writeback bios in flight */
 	struct semaphore	in_flight;
 	struct task_struct	*writeback_thread;
+	struct workqueue_struct	*writeback_write_wq;
 
 	struct keybuf		writeback_keys;
 
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index e06641e..24cb9b7 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1063,6 +1063,8 @@ static void cached_dev_free(struct closure *cl)
 	cancel_delayed_work_sync(&dc->writeback_rate_update);
 	if (!IS_ERR_OR_NULL(dc->writeback_thread))
 		kthread_stop(dc->writeback_thread);
+	if (dc->writeback_write_wq)
+		destroy_workqueue(dc->writeback_write_wq);
 
 	mutex_lock(&bch_register_lock);
 
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 4104eaa..4bc5daa 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -189,7 +189,7 @@ static void write_dirty(struct closure *cl)
 
 	closure_bio_submit(&io->bio, cl);
 
-	continue_at(cl, write_dirty_finish, system_wq);
+	continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq);
 }
 
 static void read_dirty_endio(struct bio *bio)
@@ -209,7 +209,7 @@ static void read_dirty_submit(struct closure *cl)
 
 	closure_bio_submit(&io->bio, cl);
 
-	continue_at(cl, write_dirty, system_wq);
+	continue_at(cl, write_dirty, io->dc->writeback_write_wq);
 }
 
 static void read_dirty(struct cached_dev *dc)
@@ -527,6 +527,10 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
 
 int bch_cached_dev_writeback_start(struct cached_dev *dc)
 {
+	dc->writeback_write_wq = alloc_workqueue("bcache_writeback_wq", WQ_MEM_RECLAIM, 0);
+	if (!dc->writeback_write_wq)
+		return -ENOMEM;
+
 	dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
 					      "bcache_writeback");
 	if (IS_ERR(dc->writeback_thread))
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 17/19] bcache: fix for gc and write-back race
  2017-06-29 13:45   ` Christoph Hellwig
                       ` (21 preceding siblings ...)
  2017-06-28  0:37     ` Tang Junhui
@ 2017-06-28  0:41     ` Tang Junhui
  2017-06-28  0:41     ` Tang Junhui
                       ` (7 subsequent siblings)
  30 siblings, 0 replies; 120+ messages in thread
From: Tang Junhui @ 2017-06-28  0:41 UTC (permalink / raw)
  To: linux-block

gc and write-back can race (see the email "bcache get stucked" I sent
before):
gc thread						write-back thread
|							|bch_writeback_thread()
|bch_gc_thread()					|
|							|==>read_dirty()
|==>bch_btree_gc()					|
|==>btree_root() //get btree root			|
|			node write locker		|
|==>bch_btree_gc_root()					|
|							|==>read_dirty_submit()
|							|==>write_dirty()
|							|==>continue_at(cl, write_dirty_finish, system_wq);
|							|==>write_dirty_finish()//excute in system_wq
|							|==>bch_btree_insert()
|							|==>bch_btree_map_leaf_nodes()
|							|==>__bch_btree_map_nodes()
|							|==>btree_root //try to get btree root node read lock
|							|-----stuck here
|==>bch_btree_set_root()				|
|==>bch_journal_meta()					|
|==>bch_journal()					|
|==>journal_try_write()					|
|==>journal_write_unlocked() //journal_full(&c->journal) condition satisfied
|==>continue_at(cl, journal_write, system_wq); //try to excute journal_write in system_wq
|					//but work queue is excuting write_dirty_finish()
|==>closure_sync(); //wait journal_write execute over and wake up gc,
|			--stuck here
|==>release root node write locker

This patch alloc a separate work-queue for write-back thread to avoid such
race.

Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
Cc: stable@vger.kernel.org
---
 drivers/md/bcache/bcache.h    | 1 +
 drivers/md/bcache/super.c     | 2 ++
 drivers/md/bcache/writeback.c | 8 ++++++--
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 44123e4..deb0a6c 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -333,6 +333,7 @@ struct cached_dev {
 	/* Limit number of writeback bios in flight */
 	struct semaphore	in_flight;
 	struct task_struct	*writeback_thread;
+	struct workqueue_struct	*writeback_write_wq;
 
 	struct keybuf		writeback_keys;
 
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index e06641e..24cb9b7 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1063,6 +1063,8 @@ static void cached_dev_free(struct closure *cl)
 	cancel_delayed_work_sync(&dc->writeback_rate_update);
 	if (!IS_ERR_OR_NULL(dc->writeback_thread))
 		kthread_stop(dc->writeback_thread);
+	if (dc->writeback_write_wq)
+		destroy_workqueue(dc->writeback_write_wq);
 
 	mutex_lock(&bch_register_lock);
 
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 4104eaa..4bc5daa 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -189,7 +189,7 @@ static void write_dirty(struct closure *cl)
 
 	closure_bio_submit(&io->bio, cl);
 
-	continue_at(cl, write_dirty_finish, system_wq);
+	continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq);
 }
 
 static void read_dirty_endio(struct bio *bio)
@@ -209,7 +209,7 @@ static void read_dirty_submit(struct closure *cl)
 
 	closure_bio_submit(&io->bio, cl);
 
-	continue_at(cl, write_dirty, system_wq);
+	continue_at(cl, write_dirty, io->dc->writeback_write_wq);
 }
 
 static void read_dirty(struct cached_dev *dc)
@@ -527,6 +527,10 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
 
 int bch_cached_dev_writeback_start(struct cached_dev *dc)
 {
+	dc->writeback_write_wq = alloc_workqueue("bcache_writeback_wq", WQ_MEM_RECLAIM, 0);
+	if (!dc->writeback_write_wq)
+		return -ENOMEM;
+
 	dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
 					      "bcache_writeback");
 	if (IS_ERR(dc->writeback_thread))
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 18/19] bcache: silence static checker warning
  2017-06-29 13:45   ` Christoph Hellwig
                       ` (24 preceding siblings ...)
  2017-06-28 11:47     ` [PATCH 18/19] bcache: silence static checker warning Dan Carpenter
@ 2017-06-28 11:47     ` Dan Carpenter
  2017-06-28 11:48     ` [PATCH 19/19] bcache: Update continue_at() documentation Dan Carpenter
                       ` (4 subsequent siblings)
  30 siblings, 0 replies; 120+ messages in thread
From: Dan Carpenter @ 2017-06-28 11:47 UTC (permalink / raw)
  To: linux-block

In olden times, closure_return() used to have a hidden return built in.
We removed the hidden return but forgot to add a new return here.  If
"c" were NULL we would oops on the next line, but fortunately "c" is
never NULL.  Let's just remove the if statement.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
---
 drivers/md/bcache/super.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 24cb9b7..243391d 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1381,9 +1381,6 @@ static void cache_set_flush(struct closure *cl)
 	struct btree *b;
 	unsigned i;
 
-	if (!c)
-		closure_return(cl);
-
 	bch_cache_accounting_destroy(&c->accounting);
 
 	kobject_put(&c->internal);
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 18/19] bcache: silence static checker warning
  2017-06-29 13:45   ` Christoph Hellwig
                       ` (23 preceding siblings ...)
  2017-06-28  0:41     ` Tang Junhui
@ 2017-06-28 11:47     ` Dan Carpenter
  2017-06-28 11:47     ` Dan Carpenter
                       ` (5 subsequent siblings)
  30 siblings, 0 replies; 120+ messages in thread
From: Dan Carpenter @ 2017-06-28 11:47 UTC (permalink / raw)
  To: linux-block

In olden times, closure_return() used to have a hidden return built in.
We removed the hidden return but forgot to add a new return here.  If
"c" were NULL we would oops on the next line, but fortunately "c" is
never NULL.  Let's just remove the if statement.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
---
 drivers/md/bcache/super.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 24cb9b7..243391d 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1381,9 +1381,6 @@ static void cache_set_flush(struct closure *cl)
 	struct btree *b;
 	unsigned i;
 
-	if (!c)
-		closure_return(cl);
-
 	bch_cache_accounting_destroy(&c->accounting);
 
 	kobject_put(&c->internal);
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 19/19] bcache: Update continue_at() documentation
  2017-06-29 13:45   ` Christoph Hellwig
                       ` (26 preceding siblings ...)
  2017-06-28 11:48     ` [PATCH 19/19] bcache: Update continue_at() documentation Dan Carpenter
@ 2017-06-28 11:48     ` Dan Carpenter
  2017-06-29 16:19     ` [PULL] bcache fixes and updates for-4.13 Coly Li
                       ` (2 subsequent siblings)
  30 siblings, 0 replies; 120+ messages in thread
From: Dan Carpenter @ 2017-06-28 11:48 UTC (permalink / raw)
  To: linux-block

continue_at() doesn't have a return statement anymore.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
---
 drivers/md/bcache/closure.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index 1ec84ca..295b7e4 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -312,8 +312,6 @@ static inline void closure_wake_up(struct closure_waitlist *list)
  * been dropped with closure_put()), it will resume execution at @fn running out
  * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly).
  *
- * NOTE: This macro expands to a return in the calling function!
- *
  * This is because after calling continue_at() you no longer have a ref on @cl,
  * and whatever @cl owns may be freed out from under you - a running closure fn
  * has a ref on its own closure which continue_at() drops.
@@ -340,8 +338,6 @@ do {									\
  * Causes @fn to be executed out of @cl, in @wq context (or called directly if
  * @wq is NULL).
  *
- * NOTE: like continue_at(), this macro expands to a return in the caller!
- *
  * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn,
  * thus it's not safe to touch anything protected by @cl after a
  * continue_at_nobarrier().
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 19/19] bcache: Update continue_at() documentation
  2017-06-29 13:45   ` Christoph Hellwig
                       ` (25 preceding siblings ...)
  2017-06-28 11:47     ` Dan Carpenter
@ 2017-06-28 11:48     ` Dan Carpenter
  2017-06-28 11:48     ` Dan Carpenter
                       ` (3 subsequent siblings)
  30 siblings, 0 replies; 120+ messages in thread
From: Dan Carpenter @ 2017-06-28 11:48 UTC (permalink / raw)
  To: linux-block

continue_at() doesn't have a return statement anymore.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
---
 drivers/md/bcache/closure.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index 1ec84ca..295b7e4 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -312,8 +312,6 @@ static inline void closure_wake_up(struct closure_waitlist *list)
  * been dropped with closure_put()), it will resume execution at @fn running out
  * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly).
  *
- * NOTE: This macro expands to a return in the calling function!
- *
  * This is because after calling continue_at() you no longer have a ref on @cl,
  * and whatever @cl owns may be freed out from under you - a running closure fn
  * has a ref on its own closure which continue_at() drops.
@@ -340,8 +338,6 @@ do {									\
  * Causes @fn to be executed out of @cl, in @wq context (or called directly if
  * @wq is NULL).
  *
- * NOTE: like continue_at(), this macro expands to a return in the caller!
- *
  * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn,
  * thus it's not safe to touch anything protected by @cl after a
  * continue_at_nobarrier().
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PULL] bcache fixes and updates for-4.13
  2017-05-25 19:10 [PULL] bcache updates based on git.kernel.dk/linux-block:for-next Eric Wheeler
@ 2017-06-28 23:06   ` Eric Wheeler
  2017-07-14 11:40 ` [PULL] bcache updates based on git.kernel.dk/linux-block:for-next Eddie Chapman
  1 sibling, 0 replies; 120+ messages in thread
From: Eric Wheeler @ 2017-06-28 23:06 UTC (permalink / raw)
  To: Jens Axboe
  Cc: dan.carpenter, tasleson, linux-bcache, linux-block, Jan Kara,
	stable, tang.junhui, Kent Overstreet, Coly Li, Stefan Bader,
	Liang Chen, nix

Hi Jens,

It looks like you're getting ready for 4.13, so here are the bcache 
updates rebased and ready from git.kernel.dk/linux-block for-4.13/block 
This tree builds, boots, and works as expected.  Please pull the new fixes 
and features below.

Thank you for your help!


The following changes since commit f1d4ef7d88832444e8dfeb0e85e19d3b6ecb5011:

  nvmet-rdma: register ib_client to not deadlock in device removal (2017-06-28 08:14:13 -0600)

are available in the git repository at:

  https://bitbucket.org/ewheelerinc/linux.git bcache-updates-linux-block-for-4.13

for you to fetch changes up to 960ce81bdbd46bc7faaeb2f9fcddfea1e48ee388:

  bcache: Update continue_at() documentation (2017-06-28 15:18:02 -0700)

----------------------------------------------------------------
Dan Carpenter (2):
      bcache: silence static checker warning
      bcache: Update continue_at() documentation

Eric Wheeler (3):
      bcache: introduce bcache sysfs entries for ioprio-based bypass/writeback hints
      bcache: documentation for sysfs entries describing bcache cache hinting
      bcache: update bio->bi_opf bypass/writeback REQ_ flag hints

Jan Kara (1):
      bcache: Fix leak of bdev reference

Liang Chen (1):
      bcache: explicitly destory mutex while exiting

Tang Junhui (11):
      bcache: fix sequential large write IO bypass
      bcache: do not subtract sectors_to_gc for bypassed IO
      bcache: fix wrong cache_misses statistics
      bcache: fix calling ida_simple_remove() with incorrect minor
      bcache: initialize stripe_sectors_dirty correctly for thin flash device
      bcache: Subtract dirty sectors of thin flash from cache_sectors in calculating writeback rate
      bcache: update bucket_in_use periodically
      bcache: delete redundant calling set_gc_sectors()
      bcache: fix issue of writeback rate at minimum 1 key per second
      bcache: increase the number of open buckets
      bcache: fix for gc and write-back race

Tony Asleson (1):
      bcache: Correct return value for sysfs attach errors

 Documentation/bcache.txt      | 80 ++++++++++++++++++++++++++++++++++++++++++
 drivers/md/bcache/alloc.c     |  4 ++-
 drivers/md/bcache/bcache.h    |  4 +++
 drivers/md/bcache/btree.c     | 30 ++++++++++++++--
 drivers/md/bcache/closure.h   |  4 ---
 drivers/md/bcache/request.c   | 45 ++++++++++++++++++------
 drivers/md/bcache/super.c     | 26 +++++++++-----
 drivers/md/bcache/sysfs.c     | 75 +++++++++++++++++++++++++++++++++++++--
 drivers/md/bcache/util.c      |  9 ++++-
 drivers/md/bcache/writeback.c | 37 +++++++++++++------
 drivers/md/bcache/writeback.h | 48 +++++++++++++++++++++++--
 11 files changed, 319 insertions(+), 43 deletions(-)



--
Eric Wheeler

^ permalink raw reply	[flat|nested] 120+ messages in thread

* [PULL] bcache fixes and updates for-4.13
@ 2017-06-28 23:06   ` Eric Wheeler
  0 siblings, 0 replies; 120+ messages in thread
From: Eric Wheeler @ 2017-06-28 23:06 UTC (permalink / raw)
  To: Jens Axboe
  Cc: dan.carpenter, tasleson, linux-bcache, linux-block, Jan Kara,
	stable, tang.junhui, Kent Overstreet, Coly Li, Stefan Bader,
	Liang Chen, nix

Hi Jens,

It looks like you're getting ready for 4.13, so here are the bcache 
updates rebased and ready from git.kernel.dk/linux-block for-4.13/block 
This tree builds, boots, and works as expected.  Please pull the new fixes 
and features below.

Thank you for your help!


The following changes since commit f1d4ef7d88832444e8dfeb0e85e19d3b6ecb5011:

  nvmet-rdma: register ib_client to not deadlock in device removal (2017-06-28 08:14:13 -0600)

are available in the git repository at:

  https://bitbucket.org/ewheelerinc/linux.git bcache-updates-linux-block-for-4.13

for you to fetch changes up to 960ce81bdbd46bc7faaeb2f9fcddfea1e48ee388:

  bcache: Update continue_at() documentation (2017-06-28 15:18:02 -0700)

----------------------------------------------------------------
Dan Carpenter (2):
      bcache: silence static checker warning
      bcache: Update continue_at() documentation

Eric Wheeler (3):
      bcache: introduce bcache sysfs entries for ioprio-based bypass/writeback hints
      bcache: documentation for sysfs entries describing bcache cache hinting
      bcache: update bio->bi_opf bypass/writeback REQ_ flag hints

Jan Kara (1):
      bcache: Fix leak of bdev reference

Liang Chen (1):
      bcache: explicitly destory mutex while exiting

Tang Junhui (11):
      bcache: fix sequential large write IO bypass
      bcache: do not subtract sectors_to_gc for bypassed IO
      bcache: fix wrong cache_misses statistics
      bcache: fix calling ida_simple_remove() with incorrect minor
      bcache: initialize stripe_sectors_dirty correctly for thin flash device
      bcache: Subtract dirty sectors of thin flash from cache_sectors in calculating writeback rate
      bcache: update bucket_in_use periodically
      bcache: delete redundant calling set_gc_sectors()
      bcache: fix issue of writeback rate at minimum 1 key per second
      bcache: increase the number of open buckets
      bcache: fix for gc and write-back race

Tony Asleson (1):
      bcache: Correct return value for sysfs attach errors

 Documentation/bcache.txt      | 80 ++++++++++++++++++++++++++++++++++++++++++
 drivers/md/bcache/alloc.c     |  4 ++-
 drivers/md/bcache/bcache.h    |  4 +++
 drivers/md/bcache/btree.c     | 30 ++++++++++++++--
 drivers/md/bcache/closure.h   |  4 ---
 drivers/md/bcache/request.c   | 45 ++++++++++++++++++------
 drivers/md/bcache/super.c     | 26 +++++++++-----
 drivers/md/bcache/sysfs.c     | 75 +++++++++++++++++++++++++++++++++++++--
 drivers/md/bcache/util.c      |  9 ++++-
 drivers/md/bcache/writeback.c | 37 +++++++++++++------
 drivers/md/bcache/writeback.h | 48 +++++++++++++++++++++++--
 11 files changed, 319 insertions(+), 43 deletions(-)

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PULL] bcache fixes and updates for-4.13
  2017-06-28 23:06   ` Eric Wheeler
  (?)
@ 2017-06-29 13:45   ` Christoph Hellwig
  2016-10-11 19:04     ` [PATCH 07/19] bcache: introduce bcache sysfs entries for ioprio-based bypass/writeback hints Eric Wheeler
                       ` (30 more replies)
  -1 siblings, 31 replies; 120+ messages in thread
From: Christoph Hellwig @ 2017-06-29 13:45 UTC (permalink / raw)
  To: Eric Wheeler
  Cc: Jens Axboe, dan.carpenter, tasleson, linux-bcache, linux-block,
	Jan Kara, stable, tang.junhui, Kent Overstreet, Coly Li,
	Stefan Bader, Liang Chen, nix

On Wed, Jun 28, 2017 at 11:06:46PM +0000, Eric Wheeler wrote:
> Hi Jens,
> 
> It looks like you're getting ready for 4.13, so here are the bcache 
> updates rebased and ready from git.kernel.dk/linux-block for-4.13/block 
> This tree builds, boots, and works as expected.  Please pull the new fixes 
> and features below.
> 
> Thank you for your help!

Can you please send all the patches to linux-block for review as
a series?  Some of these subjects sound a little suspect.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PULL] bcache fixes and updates for-4.13
  2017-06-29 13:45   ` Christoph Hellwig
                       ` (27 preceding siblings ...)
  2017-06-28 11:48     ` Dan Carpenter
@ 2017-06-29 16:19     ` Coly Li
  2017-06-29 22:12       ` Eric Wheeler
  2017-06-30 20:42     ` [PATCH 01/19] bcache: Fix leak of bdev reference bcache
  30 siblings, 0 replies; 120+ messages in thread
From: Coly Li @ 2017-06-29 16:19 UTC (permalink / raw)
  To: Christoph Hellwig, Eric Wheeler
  Cc: Jens Axboe, dan.carpenter, tasleson, linux-bcache, linux-block,
	Jan Kara, stable, tang.junhui, Kent Overstreet, Coly Li,
	Stefan Bader, Liang Chen, nix

On 2017/6/29 下午9:45, Christoph Hellwig wrote:
> On Wed, Jun 28, 2017 at 11:06:46PM +0000, Eric Wheeler wrote:
>> Hi Jens,
>>
>> It looks like you're getting ready for 4.13, so here are the bcache 
>> updates rebased and ready from git.kernel.dk/linux-block for-4.13/block 
>> This tree builds, boots, and works as expected.  Please pull the new fixes 
>> and features below.
>>
>> Thank you for your help!
> 
> Can you please send all the patches to linux-block for review as
> a series?  Some of these subjects sound a little suspect.
> 

We need more eyes to look at bcache code, maybe we can always CC bcache
patches to linux-block for more review.

-- 
Coly Li

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PULL] bcache fixes and updates for-4.13
  2017-06-29 13:45   ` Christoph Hellwig
@ 2017-06-29 22:12       ` Eric Wheeler
  2016-10-11 19:04     ` Eric Wheeler
                         ` (29 subsequent siblings)
  30 siblings, 0 replies; 120+ messages in thread
From: Eric Wheeler @ 2017-06-29 22:12 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, dan.carpenter, tasleson, linux-bcache, linux-block,
	Jan Kara, stable, tang.junhui, Kent Overstreet, Coly Li,
	Stefan Bader, Liang Chen, nix


On Thu, 29 Jun 2017, Christoph Hellwig wrote:

> On Wed, Jun 28, 2017 at 11:06:46PM +0000, Eric Wheeler wrote:
> > Hi Jens,
> > 
> > It looks like you're getting ready for 4.13, so here are the bcache 
> > updates rebased and ready from git.kernel.dk/linux-block for-4.13/block 
> > This tree builds, boots, and works as expected.  Please pull the new fixes 
> > and features below.
> > 
> > Thank you for your help!
> 
> Can you please send all the patches to linux-block for review as
> a series?  Some of these subjects sound a little suspect.


On their way, IRT this thread.

--
Eric Wheeler



> --
> To unsubscribe from this list: send the line "unsubscribe linux-bcache" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PULL] bcache fixes and updates for-4.13
@ 2017-06-29 22:12       ` Eric Wheeler
  0 siblings, 0 replies; 120+ messages in thread
From: Eric Wheeler @ 2017-06-29 22:12 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, dan.carpenter, tasleson, linux-bcache, linux-block,
	Jan Kara, stable, tang.junhui, Kent Overstreet, Coly Li,
	Stefan Bader, Liang Chen, nix


On Thu, 29 Jun 2017, Christoph Hellwig wrote:

> On Wed, Jun 28, 2017 at 11:06:46PM +0000, Eric Wheeler wrote:
> > Hi Jens,
> > 
> > It looks like you're getting ready for 4.13, so here are the bcache 
> > updates rebased and ready from git.kernel.dk/linux-block for-4.13/block 
> > This tree builds, boots, and works as expected.  Please pull the new fixes 
> > and features below.
> > 
> > Thank you for your help!
> 
> Can you please send all the patches to linux-block for review as
> a series?  Some of these subjects sound a little suspect.


On their way, IRT this thread.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PULL] bcache fixes and updates for-4.13
  2017-06-29 22:12       ` Eric Wheeler
  (?)
@ 2017-06-29 22:25       ` Eric Wheeler
  2017-06-29 23:28         ` Nick Alcock
  -1 siblings, 1 reply; 120+ messages in thread
From: Eric Wheeler @ 2017-06-29 22:25 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, dan.carpenter, tasleson, linux-bcache, linux-block,
	Jan Kara, stable, tang.junhui, Kent Overstreet, Coly Li,
	Stefan Bader, Liang Chen, nix

On Thu, 29 Jun 2017, Eric Wheeler wrote:

> 
> On Thu, 29 Jun 2017, Christoph Hellwig wrote:
> 
> > On Wed, Jun 28, 2017 at 11:06:46PM +0000, Eric Wheeler wrote:
> > > Hi Jens,
> > > 
> > > It looks like you're getting ready for 4.13, so here are the bcache 
> > > updates rebased and ready from git.kernel.dk/linux-block for-4.13/block 
> > > This tree builds, boots, and works as expected.  Please pull the new fixes 
> > > and features below.
> > > 
> > > Thank you for your help!
> > 
> > Can you please send all the patches to linux-block for review as
> > a series?  Some of these subjects sound a little suspect.
> 
> 
> On their way, IRT this thread.

Hmm, I think vger might not be letting them in because the From: header 
differs from the original sender (ie, I am not @oracle.com) so 
DNS/SPF/DKIM and such are wrong.

What do you do in these cases?  Should I rewrite the From: header?  
Somehow the original author should remain.

--
Eric Wheeler



> 
> --
> Eric Wheeler
> 
> 
> 
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-bcache" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > 
> 

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PULL] bcache fixes and updates for-4.13
  2017-06-29 22:25       ` Eric Wheeler
@ 2017-06-29 23:28         ` Nick Alcock
  0 siblings, 0 replies; 120+ messages in thread
From: Nick Alcock @ 2017-06-29 23:28 UTC (permalink / raw)
  To: Eric Wheeler
  Cc: Christoph Hellwig, Jens Axboe, dan.carpenter, tasleson,
	linux-bcache, linux-block, Jan Kara, stable, tang.junhui,
	Kent Overstreet, Coly Li, Stefan Bader, Liang Chen

On 29 Jun 2017, Eric Wheeler verbalised:
> Hmm, I think vger might not be letting them in because the From: header 
> differs from the original sender (ie, I am not @oracle.com) so 
> DNS/SPF/DKIM and such are wrong.
>
> What do you do in these cases?  Should I rewrite the From: header?  
> Somehow the original author should remain.

If these are patches from 'git format-patch', you do it by putting your
name in the email From: and the author you want in the commit as a From
header in the very first line of the email. 'git am' will then ignore
the header line in favour of the first-line override. I believe 'git
send-email' should be doing this automatically if your email address is
not the same as the patch author, but I could be wrong...

See the DISCUSSION section in 'man git-am'.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* [PATCH 01/19] bcache: Fix leak of bdev reference
  2017-06-29 13:45   ` Christoph Hellwig
                       ` (29 preceding siblings ...)
  2017-06-29 22:12       ` Eric Wheeler
@ 2017-06-30 20:42     ` bcache
  2017-06-30 20:42       ` [PATCH 02/19] bcache: fix sequential large write IO bypass bcache
                         ` (19 more replies)
  30 siblings, 20 replies; 120+ messages in thread
From: bcache @ 2017-06-30 20:42 UTC (permalink / raw)
  To: linux-block; +Cc: linux-bcache, hch, axboe, Jan Kara, stable

From: Jan Kara <jack@suse.cz>

If blkdev_get_by_path() in register_bcache() fails, we try to lookup the
block device using lookup_bdev() to detect which situation we are in to
properly report error. However we never drop the reference returned to
us from lookup_bdev(). Fix that.

Signed-off-by: Jan Kara <jack@suse.cz>
Cc: stable@vger.kernel.org
---
 drivers/md/bcache/super.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 8352fad..9a2c190 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1964,6 +1964,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
 			else
 				err = "device busy";
 			mutex_unlock(&bch_register_lock);
+			if (!IS_ERR(bdev))
+				bdput(bdev);
 			if (attr == &ksysfs_register_quiet)
 				goto out;
 		}
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 02/19] bcache: fix sequential large write IO bypass
  2017-06-30 20:42     ` [PATCH 01/19] bcache: Fix leak of bdev reference bcache
@ 2017-06-30 20:42       ` bcache
  2017-07-05 18:25         ` Christoph Hellwig
  2017-06-30 20:42       ` [PATCH 03/19] bcache: do not subtract sectors_to_gc for bypassed IO bcache
                         ` (18 subsequent siblings)
  19 siblings, 1 reply; 120+ messages in thread
From: bcache @ 2017-06-30 20:42 UTC (permalink / raw)
  To: linux-block; +Cc: linux-bcache, hch, axboe, Tang Junhui, stable

From: Tang Junhui <tang.junhui@zte.com.cn>

Sequential write IOs were tested with bs=1M by FIO in writeback cache
mode, these IOs were expected to be bypassed, but actually they did not.
We debug the code, and find in check_should_bypass():
    if (!congested &&
        mode == CACHE_MODE_WRITEBACK &&
        op_is_write(bio_op(bio)) &&
        (bio->bi_opf & REQ_SYNC))
        goto rescale
that means that in writeback mode, a write IO with the REQ_SYNC flag will
not be bypassed even though it is a sequential large IO. That is not the
correct thing to do, so this patch removes that code.

Signed-off-by: tang.junhui <tang.junhui@zte.com.cn>
Reviewed-by: Kent Overstreet <kent.overstreet@gmail.com>
Reviewed-by: Eric Wheeler <bcache@linux.ewheeler.net>
Cc: stable@vger.kernel.org
---
 drivers/md/bcache/request.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 019b3df..958072a 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -400,12 +400,6 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
 	if (!congested && !dc->sequential_cutoff)
 		goto rescale;
 
-	if (!congested &&
-	    mode == CACHE_MODE_WRITEBACK &&
-	    op_is_write(bio->bi_opf) &&
-	    op_is_sync(bio->bi_opf))
-		goto rescale;
-
 	spin_lock(&dc->io_lock);
 
 	hlist_for_each_entry(i, iohash(dc, bio->bi_iter.bi_sector), hash)
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 03/19] bcache: do not subtract sectors_to_gc for bypassed IO
  2017-06-30 20:42     ` [PATCH 01/19] bcache: Fix leak of bdev reference bcache
  2017-06-30 20:42       ` [PATCH 02/19] bcache: fix sequential large write IO bypass bcache
@ 2017-06-30 20:42       ` bcache
  2017-07-01 17:26         ` Coly Li
  2017-07-05 18:25         ` Christoph Hellwig
  2017-06-30 20:42       ` [PATCH 04/19] bcache: fix wrong cache_misses statistics bcache
                         ` (17 subsequent siblings)
  19 siblings, 2 replies; 120+ messages in thread
From: bcache @ 2017-06-30 20:42 UTC (permalink / raw)
  To: linux-block; +Cc: linux-bcache, hch, axboe, Tang Junhui, stable

From: Tang Junhui <tang.junhui@zte.com.cn>

Since bypassed IOs use no bucket, do not subtract sectors_to_gc for them,
so that bypassed IO does not trigger the gc thread.

Signed-off-by: tang.junhui <tang.junhui@zte.com.cn>
Reviewed-by: Eric Wheeler <bcache@linux.ewheeler.net>
Cc: stable@vger.kernel.org
---
 drivers/md/bcache/request.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 958072a..4b413db 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -196,12 +196,12 @@ static void bch_data_insert_start(struct closure *cl)
 	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
 	struct bio *bio = op->bio, *n;
 
-	if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0)
-		wake_up_gc(op->c);
-
 	if (op->bypass)
 		return bch_data_invalidate(cl);
 
+	if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0)
+		wake_up_gc(op->c);
+
 	/*
 	 * Journal writes are marked REQ_PREFLUSH; if the original write was a
 	 * flush, it'll wait on the journal write.
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 04/19] bcache: fix wrong cache_misses statistics
  2017-06-30 20:42     ` [PATCH 01/19] bcache: Fix leak of bdev reference bcache
  2017-06-30 20:42       ` [PATCH 02/19] bcache: fix sequential large write IO bypass bcache
  2017-06-30 20:42       ` [PATCH 03/19] bcache: do not subtract sectors_to_gc for bypassed IO bcache
@ 2017-06-30 20:42       ` bcache
  2017-07-01 17:58         ` Coly Li
  2017-06-30 20:42       ` [PATCH 05/19] bcache: fix calling ida_simple_remove() with incorrect minor bcache
                         ` (16 subsequent siblings)
  19 siblings, 1 reply; 120+ messages in thread
From: bcache @ 2017-06-30 20:42 UTC (permalink / raw)
  To: linux-block; +Cc: linux-bcache, hch, axboe, Tang Junhui, stable

From: Tang Junhui <tang.junhui@zte.com.cn>

Some missed IOs are not counted into cache_misses; this patch fixes that
issue.

Signed-off-by: tang.junhui <tang.junhui@zte.com.cn>
Reviewed-by: Eric Wheeler <bcache@linux.ewheeler.net>
Cc: stable@vger.kernel.org
---
 drivers/md/bcache/request.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 4b413db..d27707d 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -462,6 +462,7 @@ struct search {
 	unsigned		recoverable:1;
 	unsigned		write:1;
 	unsigned		read_dirty_data:1;
+	unsigned		cache_missed:1;
 
 	unsigned long		start_time;
 
@@ -647,6 +648,7 @@ static inline struct search *search_alloc(struct bio *bio,
 
 	s->orig_bio		= bio;
 	s->cache_miss		= NULL;
+	s->cache_missed		= 0;
 	s->d			= d;
 	s->recoverable		= 1;
 	s->write		= op_is_write(bio_op(bio));
@@ -758,7 +760,7 @@ static void cached_dev_read_done_bh(struct closure *cl)
 	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 
 	bch_mark_cache_accounting(s->iop.c, s->d,
-				  !s->cache_miss, s->iop.bypass);
+				  !s->cache_missed, s->iop.bypass);
 	trace_bcache_read(s->orig_bio, !s->cache_miss, s->iop.bypass);
 
 	if (s->iop.status)
@@ -777,6 +779,8 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
 	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
 	struct bio *miss, *cache_bio;
 
+	s->cache_missed = 1; /* true */
+
 	if (s->cache_miss || s->iop.bypass) {
 		miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split);
 		ret = miss == bio ? MAP_DONE : MAP_CONTINUE;
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 05/19] bcache: fix calling ida_simple_remove() with incorrect minor
  2017-06-30 20:42     ` [PATCH 01/19] bcache: Fix leak of bdev reference bcache
                         ` (2 preceding siblings ...)
  2017-06-30 20:42       ` [PATCH 04/19] bcache: fix wrong cache_misses statistics bcache
@ 2017-06-30 20:42       ` bcache
  2017-07-05 18:26         ` Christoph Hellwig
  2017-06-30 20:42       ` [PATCH 06/19] bcache: explicitly destory mutex while exiting bcache
                         ` (15 subsequent siblings)
  19 siblings, 1 reply; 120+ messages in thread
From: bcache @ 2017-06-30 20:42 UTC (permalink / raw)
  To: linux-block; +Cc: linux-bcache, hch, axboe, Tang Junhui, stable

From: Tang Junhui <tang.junhui@zte.com.cn>

bcache called ida_simple_remove() with a minor number that had already been
multiplied by BCACHE_MINORS, causing the wrong minor to be released and the
original one to be leaked.

In addition, when adding partition support to bcache, the name assignment
was not updated, resulting in numbers jumping (bcache0, bcache16,
bcache32...). This has been fixed implicitly by the rework.

Signed-off-by: tang.junhui <tang.junhui@zte.com.cn>
Reviewed-by: Coly Li <colyli@suse.de>
Reviewed-by: Eric Wheeler <bcache@linux.ewheeler.net>
Cc: stable@vger.kernel.org # 4.10
Cc: Stefan Bader <stefan.bader@canonical.com>
Fixes: b8c0d91 (bcache: partition support: add 16 minors per bcacheN device)
BugLink: https://bugs.launchpad.net/bugs/1667078
---
 drivers/md/bcache/super.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 9a2c190..48b8c20 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -58,7 +58,10 @@ static wait_queue_head_t unregister_wait;
 struct workqueue_struct *bcache_wq;
 
 #define BTREE_MAX_PAGES		(256 * 1024 / PAGE_SIZE)
-#define BCACHE_MINORS		16 /* partition support */
+#define BCACHE_MINORS_BITS                4 /* bcache partition support */
+#define BCACHE_MINORS                     (1 << BCACHE_MINORS_BITS)
+#define BCACHE_TO_IDA_MINORS(first_minor) ((first_minor) >> BCACHE_MINORS_BITS)
+#define IDA_TO_BCACHE_MINORS(minor)       ((minor) << BCACHE_MINORS_BITS)
 
 /* Superblock */
 
@@ -734,7 +737,8 @@ static void bcache_device_free(struct bcache_device *d)
 	if (d->disk && d->disk->queue)
 		blk_cleanup_queue(d->disk->queue);
 	if (d->disk) {
-		ida_simple_remove(&bcache_minor, d->disk->first_minor);
+		ida_simple_remove(&bcache_minor,
+			BCACHE_TO_IDA_MINORS(d->disk->first_minor));
 		put_disk(d->disk);
 	}
 
@@ -776,11 +780,11 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
 	if (!d->full_dirty_stripes)
 		return -ENOMEM;
 
-	minor = ida_simple_get(&bcache_minor, 0, MINORMASK + 1, GFP_KERNEL);
+	minor = ida_simple_get(&bcache_minor, 0,
+		BCACHE_TO_IDA_MINORS(MINORMASK) + 1, GFP_KERNEL);
 	if (minor < 0)
 		return minor;
 
-	minor *= BCACHE_MINORS;
 
 	if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio),
 					   BIOSET_NEED_BVECS |
@@ -794,7 +798,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size,
 	snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", minor);
 
 	d->disk->major		= bcache_major;
-	d->disk->first_minor	= minor;
+	d->disk->first_minor	= IDA_TO_BCACHE_MINORS(minor);
 	d->disk->fops		= &bcache_ops;
 	d->disk->private_data	= d;
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 06/19] bcache: explicitly destory mutex while exiting
  2017-06-30 20:42     ` [PATCH 01/19] bcache: Fix leak of bdev reference bcache
                         ` (3 preceding siblings ...)
  2017-06-30 20:42       ` [PATCH 05/19] bcache: fix calling ida_simple_remove() with incorrect minor bcache
@ 2017-06-30 20:42       ` bcache
  2017-07-01 18:43         ` Coly Li
  2017-07-05 18:27         ` Christoph Hellwig
  2017-06-30 20:42       ` [PATCH 07/19] bcache: introduce bcache sysfs entries for ioprio-based bypass/writeback hints bcache
                         ` (14 subsequent siblings)
  19 siblings, 2 replies; 120+ messages in thread
From: bcache @ 2017-06-30 20:42 UTC (permalink / raw)
  To: linux-block; +Cc: linux-bcache, hch, axboe, Liang Chen, stable

From: Liang Chen <liangchen.linux@gmail.com>

mutex_destroy() does nothing most of the time, but it is better to call it
to make the code future-proof, and it is also meaningful for mutex
debugging.

Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
Reviewed-by: Eric Wheeler <bcache@linux.ewheeler.net>
Cc: stable@vger.kernel.org
---
 drivers/md/bcache/super.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 48b8c20..1f84791 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -2089,6 +2089,7 @@ static void bcache_exit(void)
 	if (bcache_major)
 		unregister_blkdev(bcache_major, "bcache");
 	unregister_reboot_notifier(&reboot);
+	mutex_destroy(&bch_register_lock);
 }
 
 static int __init bcache_init(void)
@@ -2106,6 +2107,7 @@ static int __init bcache_init(void)
 
 	bcache_major = register_blkdev(0, "bcache");
 	if (bcache_major < 0) {
+		mutex_destroy(&bch_register_lock);
 		unregister_reboot_notifier(&reboot);
 		return bcache_major;
 	}
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 07/19] bcache: introduce bcache sysfs entries for ioprio-based bypass/writeback hints
  2017-06-30 20:42     ` [PATCH 01/19] bcache: Fix leak of bdev reference bcache
                         ` (4 preceding siblings ...)
  2017-06-30 20:42       ` [PATCH 06/19] bcache: explicitly destory mutex while exiting bcache
@ 2017-06-30 20:42       ` bcache
  2017-07-05 18:47         ` Christoph Hellwig
  2017-06-30 20:42       ` [PATCH 08/19] bcache: documentation for sysfs entries describing bcache cache hinting bcache
                         ` (13 subsequent siblings)
  19 siblings, 1 reply; 120+ messages in thread
From: bcache @ 2017-06-30 20:42 UTC (permalink / raw)
  To: linux-block; +Cc: linux-bcache, hch, axboe, Eric Wheeler, Eric Wheeler, nix

From: Eric Wheeler <git@linux.ewheeler.net>

Add sysfs entries to support to hint for bypass/writeback by the ioprio
assigned to the bio.  If the bio is unassigned, use current's io-context
ioprio for cache writeback or bypass (configured per-process with
`ionice`).

Having idle IOs bypass the cache can increase performance elsewhere
since you probably don't care about their performance.  In addition,
this prevents idle IOs from promoting into (polluting) your cache and
evicting blocks that are more important elsewhere.

If you really need the performance at the expense of SSD wearout,
then configure ioprio_writeback and set your `ionice` appropriately.

For example:
	echo 2,7 > /sys/block/bcache0/bcache/ioprio_bypass
	echo 2,0 > /sys/block/bcache0/bcache/ioprio_writeback

See the documentation commit for details.

Signed-off-by: Eric Wheeler <bcache@linux.ewheeler.net>
Acked-by: Kent Overstreet <kent.overstreet@gmail.com>
Tested-by: Kai Krakow <kai@kaishome.de>
Cc: nix@esperi.org.uk
---
 drivers/md/bcache/bcache.h    |  3 ++
 drivers/md/bcache/request.c   | 24 +++++++++++++++
 drivers/md/bcache/sysfs.c     | 71 +++++++++++++++++++++++++++++++++++++++++++
 drivers/md/bcache/writeback.c |  8 +++++
 drivers/md/bcache/writeback.h | 24 +++++++++++++++
 5 files changed, 130 insertions(+)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index dee542f..44123e4 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -367,6 +367,9 @@ struct cached_dev {
 	unsigned		writeback_rate_update_seconds;
 	unsigned		writeback_rate_d_term;
 	unsigned		writeback_rate_p_term_inverse;
+
+	unsigned short		ioprio_writeback;
+	unsigned short		ioprio_bypass;
 };
 
 enum alloc_reserve {
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index d27707d..a95609f 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -373,6 +373,8 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
 	unsigned sectors, congested = bch_get_congested(c);
 	struct task_struct *task = current;
 	struct io *i;
+	struct io_context *ioc;
+	unsigned short ioprio;
 
 	if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
 	    c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
@@ -384,6 +386,28 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
 	     op_is_write(bio_op(bio))))
 		goto skip;
 
+	/* If the ioprio already exists on the bio, use that.  We assume that
+	 * the upper layer properly assigned the calling process's ioprio to
+	 * the bio being passed to bcache. Otherwise, use current's ioc. */
+	ioprio = bio_prio(bio);
+	if (!ioprio_valid(ioprio)) {
+		ioc = get_task_io_context(current, GFP_NOIO, NUMA_NO_NODE);
+		if (ioc) {
+			if (ioprio_valid(ioc->ioprio))
+				ioprio = ioc->ioprio;
+			put_io_context(ioc);
+			ioc = NULL;
+		}
+	}
+
+	/* If process ioprio is lower-or-equal to dc->ioprio_bypass, then
+	 * hint for bypass. Note that a lower-priority IO class+value
+	 * has a greater numeric value. */
+	if (ioprio_valid(ioprio) && ioprio_valid(dc->ioprio_writeback)
+		&& ioprio >= dc->ioprio_bypass) {
+		return true;
+	}
+
 	if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) ||
 	    bio_sectors(bio) & (c->sb.block_size - 1)) {
 		pr_debug("skipping unaligned io");
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index f90f136..cc0076d 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -107,6 +107,9 @@ rw_attribute(btree_shrinker_disabled);
 rw_attribute(copy_gc_enabled);
 rw_attribute(size);
 
+rw_attribute(ioprio_writeback);
+rw_attribute(ioprio_bypass);
+
 SHOW(__bch_cached_dev)
 {
 	struct cached_dev *dc = container_of(kobj, struct cached_dev,
@@ -183,6 +186,17 @@ SHOW(__bch_cached_dev)
 		return strlen(buf);
 	}
 
+	if (attr == &sysfs_ioprio_bypass)
+		return snprintf(buf, PAGE_SIZE-1, "%d,%ld\n",
+			IOPRIO_PRIO_CLASS(dc->ioprio_bypass),
+			IOPRIO_PRIO_DATA(dc->ioprio_bypass));
+
+	if (attr == &sysfs_ioprio_writeback)
+		return snprintf(buf, PAGE_SIZE-1, "%d,%ld\n",
+			IOPRIO_PRIO_CLASS(dc->ioprio_writeback),
+			IOPRIO_PRIO_DATA(dc->ioprio_writeback));
+
+
 #undef var
 	return 0;
 }
@@ -195,6 +209,10 @@ STORE(__cached_dev)
 	unsigned v = size;
 	struct cache_set *c;
 	struct kobj_uevent_env *env;
+	unsigned ioprio_class = 0; /* invalid initial ioprio values */
+	unsigned ioprio_level = IOPRIO_BE_NR;
+	unsigned short *ioprio_hint = NULL;
+	char *ioprio_type = NULL;
 
 #define d_strtoul(var)		sysfs_strtoul(var, dc->var)
 #define d_strtoul_nonzero(var)	sysfs_strtoul_clamp(var, dc->var, 1, INT_MAX)
@@ -283,6 +301,57 @@ STORE(__cached_dev)
 	if (attr == &sysfs_stop)
 		bcache_device_stop(&dc->disk);
 
+	/* ioprio hinting: we use ioprio_hint to reduce duplicate printk verbiage */
+	if (attr == &sysfs_ioprio_writeback) {
+		ioprio_hint = &dc->ioprio_writeback;
+		ioprio_type = "writeback";
+	}
+
+	if (attr == &sysfs_ioprio_bypass) {
+		ioprio_hint = &dc->ioprio_bypass;
+		ioprio_type = "bypass";
+	}
+
+	if (ioprio_hint != NULL)
+	{
+		if (sscanf(buf, "%u,%u", &ioprio_class, &ioprio_level) != 2
+			|| ioprio_class > IOPRIO_CLASS_IDLE
+			|| ioprio_level >= IOPRIO_BE_NR) {
+			pr_err("ioprio_%s invalid, expecting: (class,level) but parsed (%u,%u); ignored.",
+				ioprio_type,
+				ioprio_class, ioprio_level);
+			return size;
+		}
+
+		/* Use the maximum(/minimum) value in the class shift space to make integer
+		  comparison correct for ioprio_writeback(/ioprio_bypass) for IOPRIO_CLASS_IDLE.
+		  This is necessary because there are no ioprio levels for the idle class. */
+		if (ioprio_class == IOPRIO_CLASS_IDLE) {
+			if (ioprio_hint == &dc->ioprio_writeback)
+				ioprio_level = IOPRIO_PRIO_MASK;
+			else
+				/* Same, but 0 for bypass (inverted vs. writeback) */
+				ioprio_level = 0;
+		}
+
+		*ioprio_hint = IOPRIO_PRIO_VALUE(ioprio_class, ioprio_level);
+
+		if (!ioprio_valid(*ioprio_hint))
+			pr_info("disabled ioprio_%s hints.", ioprio_type);
+		else
+			pr_info("set hint for cache %s with priority %s: (class,level) = (%u,%u)",
+				ioprio_type,
+				( ioprio_hint == &dc->ioprio_writeback ? "at-or-above" : "at-or-below" ),
+				ioprio_class, ioprio_level);
+
+		if (ioprio_valid(dc->ioprio_writeback)
+			&& ioprio_valid(dc->ioprio_bypass)
+			&& dc->ioprio_writeback >= dc->ioprio_bypass)
+			pr_warning(
+				"warning: ioprio_writeback hint is neither disabled nor higher priority than the bypass hint; "
+				"will always writeback!");
+	}
+
 	return size;
 }
 
@@ -335,6 +404,8 @@ static struct attribute *bch_cached_dev_files[] = {
 	&sysfs_verify,
 	&sysfs_bypass_torture_test,
 #endif
+	&sysfs_ioprio_bypass,
+	&sysfs_ioprio_writeback,
 	NULL
 };
 KTYPE(bch_cached_dev);
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 42c66e7..3d463f0 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -511,6 +511,14 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
 	dc->writeback_rate_d_term	= 30;
 	dc->writeback_rate_p_term_inverse = 6000;
 
+	/* These defaults provide the best SSD life by enabling bypass
+	 for priorities at-or-below BE-7. This also provides better
+	 performance (cache hits) by preventing (near-)idle processes from
+	 polluting the cache working set.  Only set ioprio_writeback if
+	 you really need it: it will wear out your SSD sooner. */
+	dc->ioprio_writeback = IOPRIO_PRIO_VALUE(0, 0);
+	dc->ioprio_bypass    = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, (IOPRIO_BE_NR-1));
+
 	INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
 }
 
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 629bd1a..cd82fe8 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -43,6 +43,8 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
 				    unsigned cache_mode, bool would_skip)
 {
 	unsigned in_use = dc->disk.c->gc_stats.in_use;
+	struct io_context *ioc;
+	unsigned short ioprio;
 
 	if (cache_mode != CACHE_MODE_WRITEBACK ||
 	    test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
@@ -57,6 +59,28 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
 	if (would_skip)
 		return false;
 
+	/* If the ioprio already exists on the bio, use that.  We assume that
+	 * the upper layer properly assigned the calling process's ioprio to
+	 * the bio being passed to bcache. Otherwise, use current's ioc. */
+	ioprio = bio_prio(bio);
+	if (!ioprio_valid(ioprio)) {
+		ioc = get_task_io_context(current, GFP_NOIO, NUMA_NO_NODE);
+		if (ioc) {
+			if (ioprio_valid(ioc->ioprio))
+				ioprio = ioc->ioprio;
+			put_io_context(ioc);
+			ioc = NULL;
+		}
+	}
+
+	/* If process ioprio is higher-or-equal to dc->ioprio_writeback, then
+	 * hint for writeback. Note that a higher-priority IO class+value
+	 * has a lesser numeric value. */
+	if (ioprio_valid(ioprio) && ioprio_valid(dc->ioprio_writeback)
+		&& ioprio <= dc->ioprio_writeback) {
+		return true;
+	}
+
 	return op_is_sync(bio->bi_opf) || in_use <= CUTOFF_WRITEBACK;
 }
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 08/19] bcache: documentation for sysfs entries describing bcache cache hinting
  2017-06-30 20:42     ` [PATCH 01/19] bcache: Fix leak of bdev reference bcache
                         ` (5 preceding siblings ...)
  2017-06-30 20:42       ` [PATCH 07/19] bcache: introduce bcache sysfs entries for ioprio-based bypass/writeback hints bcache
@ 2017-06-30 20:42       ` bcache
  2017-07-05 18:27         ` Christoph Hellwig
  2017-06-30 20:42       ` [PATCH 09/19] bcache: update bio->bi_opf bypass/writeback REQ_ flag hints bcache
                         ` (12 subsequent siblings)
  19 siblings, 1 reply; 120+ messages in thread
From: bcache @ 2017-06-30 20:42 UTC (permalink / raw)
  To: linux-block; +Cc: linux-bcache, hch, axboe, Eric Wheeler, Eric Wheeler

From: Eric Wheeler <git@linux.ewheeler.net>

Signed-off-by: Eric Wheeler <bcache@linux.ewheeler.net>
---
 Documentation/bcache.txt | 80 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)

diff --git a/Documentation/bcache.txt b/Documentation/bcache.txt
index a9259b5..c78c012 100644
--- a/Documentation/bcache.txt
+++ b/Documentation/bcache.txt
@@ -133,6 +133,86 @@ the backing devices to passthrough mode.
    writeback mode). It currently doesn't do anything intelligent if it fails to
    read some of the dirty data, though.
 
+SSD LONGEVITY: PER-PROCESS CACHE HINTING WITH IO PRIORITY
+---------------------------------------------------------
+
+Processes can be assigned an IO priority using `ionice` and bcache will
+either try to writeback or bypass the cache based on the IO priority
+level assigned to the process and the configuration of the sysfs ioprio
+hints.  If configured properly for your workload, this can both increase
+performance and reduce SSD wear (erase/write cycles).
+
+Having idle IOs bypass the cache can increase performance elsewhere
+since you probably don't care about their performance.  In addition,
+this prevents idle IOs from promoting into (polluting) your cache and
+evicting blocks that are more important elsewhere.
+
+Default sysfs values:
+	2,7: ioprio_bypass is hinted for process IOs at-or-below best-effort-7.
+	0,0: ioprio_writeback hinting is disabled by default.
+
+Cache hinting is configured by writing 'class,level' pairs to sysfs.
+In this example, we write the following:
+
+    echo 2,7 > /sys/block/bcache0/bcache/ioprio_bypass
+    echo 2,0 > /sys/block/bcache0/bcache/ioprio_writeback
+
+Thus, processes with the following IO class (ionice -c) and level (-n)
+will behave as shown in this table:
+
+	(-c) IO Class    (-n) Class level       Action
+	-----------------------------------------------------
+	(1) Realtime      0-7                   Writeback
+	(2) Best-effort     0                   Writeback
+	(2) Best-effort   1-6                   Normal, as if hinting were disabled
+	(2) Best-effort     7                   Bypass cache
+	(3) Idle          n/a                   Bypass cache
+
+For processes at-or-below best-effort-7 (ionice -c2 -n7), the
+ioprio_bypass behavior is as follows:
+
+* Reads will come from the backing device and will not promote into
+  (pollute) your cache.  If the block being read was already in the cache,
+  then it will be read from the cache (and remain cached).
+
+* If you are using writeback mode, then low-priority bypass-hinted writes
+  will go directly to the backing device.  If the write was dirty in
+  cache, it will cache-invalidate and write directly to the backing
+  device.  If a high-priority task later writes the same block then it
+  will writeback so no performance is lost for write-after-write.
+
+  For read-after-bypassed-write, the block will be read from the backing
+  device (not cached) so there may be a miss penalty when a low-priority
+  process write bypasses the cache followed by a high-priority read that
+  would otherwise have hit.  In practice, this is not an issue; to date,
+  none have wanted low-priority writes and high-priority reads of the
+  same block.
+
+For processes in our example at-or-above best-effort-0 (ionice -c2 -n0),
+the ioprio_writeback behavior is as follows:
+
+* The writeback hint has no effect unless your 'cache_mode' is writeback.
+  Assuming writeback mode, all writes at this priority will writeback.
+  Of course this will increase SSD wear, so only use writeback hinting
+  if you need it.
+
+* Reads are unaffected by ioprio_writeback, except that read-after-write
+  will of course read from the cache.
+
+Linux assigns processes the best-effort class with a level of 4 if
+no priority is explicitly assigned.  Thus, without `ionice` your processes will
+follow normal bcache should_writeback/should_bypass semantics as if the
+ioprio_writeback/ioprio_bypass sysfs flags were disabled.
+
+Also note that in order to be hinted by ioprio_writeback/ioprio_bypass,
+the process must have a valid ioprio setting as returned by
+get_task_io_context()->ioprio. Thus, a process without an IO context
+will be ignored by the ioprio_writeback/ioprio_bypass hints even if your
+sysfs hints specify that best-effort-4 should be flagged for bypass
+or writeback.  If in doubt, explicitly set the process IO priority with
+`ionice`.
+
+See `man ionice` for more detail about per-process IO priority in Linux.
 
 HOWTO/COOKBOOK
 --------------
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 09/19] bcache: update bio->bi_opf bypass/writeback REQ_ flag hints
  2017-06-30 20:42     ` [PATCH 01/19] bcache: Fix leak of bdev reference bcache
                         ` (6 preceding siblings ...)
  2017-06-30 20:42       ` [PATCH 08/19] bcache: documentation for sysfs entries describing bcache cache hinting bcache
@ 2017-06-30 20:42       ` bcache
  2017-07-01 18:49         ` Coly Li
  2017-06-30 20:42       ` [PATCH 10/19] bcache: initialize stripe_sectors_dirty correctly for thin flash device bcache
                         ` (11 subsequent siblings)
  19 siblings, 1 reply; 120+ messages in thread
From: bcache @ 2017-06-30 20:42 UTC (permalink / raw)
  To: linux-block; +Cc: linux-bcache, hch, axboe, Eric Wheeler, Eric Wheeler

From: Eric Wheeler <git@linux.ewheeler.net>

Bypass if:     bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND)

Writeback if:  op_is_sync(bio->bi_opf) || bio->bi_opf & (REQ_META|REQ_PRIO)

Signed-off-by: Eric Wheeler <bcache@linux.ewheeler.net>
---
 drivers/md/bcache/request.c   | 3 +++
 drivers/md/bcache/writeback.h | 3 ++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index a95609f..4629b0c 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -386,6 +386,9 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
 	     op_is_write(bio_op(bio))))
 		goto skip;
 
+	if (bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND))
+		goto skip;
+
 	/* If the ioprio already exists on the bio, use that.  We assume that
 	 * the upper layer properly assigned the calling process's ioprio to
 	 * the bio being passed to bcache. Otherwise, use current's ioc. */
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index cd82fe8..ea2f92e 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -81,7 +81,8 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
 		return true;
 	}
 
-	return op_is_sync(bio->bi_opf) || in_use <= CUTOFF_WRITEBACK;
+	return op_is_sync(bio->bi_opf) || bio->bi_opf & (REQ_META|REQ_PRIO)
+		|| in_use <= CUTOFF_WRITEBACK;
 }
 
 static inline void bch_writeback_queue(struct cached_dev *dc)
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 10/19] bcache: initialize stripe_sectors_dirty correctly for thin flash device
  2017-06-30 20:42     ` [PATCH 01/19] bcache: Fix leak of bdev reference bcache
                         ` (7 preceding siblings ...)
  2017-06-30 20:42       ` [PATCH 09/19] bcache: update bio->bi_opf bypass/writeback REQ_ flag hints bcache
@ 2017-06-30 20:42       ` bcache
  2017-07-01 18:52         ` Coly Li
  2017-06-30 20:43       ` [PATCH 11/19] bcache: Subtract dirty sectors of thin flash from cache_sectors in calculating writeback rate bcache
                         ` (10 subsequent siblings)
  19 siblings, 1 reply; 120+ messages in thread
From: bcache @ 2017-06-30 20:42 UTC (permalink / raw)
  To: linux-block; +Cc: linux-bcache, hch, axboe, Tang Junhui, stable

From: Tang Junhui <tang.junhui@zte.com.cn>

The thin flash device does not initialize stripe_sectors_dirty correctly;
this patch fixes the issue.

Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
Cc: stable@vger.kernel.org
---
 drivers/md/bcache/super.c     | 3 ++-
 drivers/md/bcache/writeback.c | 8 ++++----
 drivers/md/bcache/writeback.h | 2 +-
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 1f84791..e06641e 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1030,7 +1030,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
 	}
 
 	if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
-		bch_sectors_dirty_init(dc);
+		bch_sectors_dirty_init(&dc->disk);
 		atomic_set(&dc->has_dirty, 1);
 		atomic_inc(&dc->count);
 		bch_writeback_queue(dc);
@@ -1232,6 +1232,7 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
 		goto err;
 
 	bcache_device_attach(d, c, u - c->uuids);
+	bch_sectors_dirty_init(d);
 	bch_flash_dev_request_init(d);
 	add_disk(d->disk);
 
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 3d463f0..4ac8b13 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -482,17 +482,17 @@ static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
 	return MAP_CONTINUE;
 }
 
-void bch_sectors_dirty_init(struct cached_dev *dc)
+void bch_sectors_dirty_init(struct bcache_device *d)
 {
 	struct sectors_dirty_init op;
 
 	bch_btree_op_init(&op.op, -1);
-	op.inode = dc->disk.id;
+	op.inode = d->id;
 
-	bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0),
+	bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0),
 			   sectors_dirty_init_fn, 0);
 
-	dc->disk.sectors_dirty_last = bcache_dev_sectors_dirty(&dc->disk);
+	d->sectors_dirty_last = bcache_dev_sectors_dirty(d);
 }
 
 void bch_cached_dev_writeback_init(struct cached_dev *dc)
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index ea2f92e..c2ab4b4 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -109,7 +109,7 @@ static inline void bch_writeback_add(struct cached_dev *dc)
 
 void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int);
 
-void bch_sectors_dirty_init(struct cached_dev *dc);
+void bch_sectors_dirty_init(struct bcache_device *);
 void bch_cached_dev_writeback_init(struct cached_dev *);
 int bch_cached_dev_writeback_start(struct cached_dev *);
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 11/19] bcache: Subtract dirty sectors of thin flash from cache_sectors in calculating writeback rate
  2017-06-30 20:42     ` [PATCH 01/19] bcache: Fix leak of bdev reference bcache
                         ` (8 preceding siblings ...)
  2017-06-30 20:42       ` [PATCH 10/19] bcache: initialize stripe_sectors_dirty correctly for thin flash device bcache
@ 2017-06-30 20:43       ` bcache
  2017-07-10 18:11         ` Coly Li
  2017-06-30 20:43       ` [PATCH 12/19] bcache: update bucket_in_use periodically bcache
                         ` (9 subsequent siblings)
  19 siblings, 1 reply; 120+ messages in thread
From: bcache @ 2017-06-30 20:43 UTC (permalink / raw)
  To: linux-block; +Cc: linux-bcache, hch, axboe, Tang Junhui, stable

From: Tang Junhui <tang.junhui@zte.com.cn>

Since dirty sectors of the thin flash device cannot be used to cache data
for the backend device, we should subtract them when calculating the
writeback rate.

Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
Cc: stable@vger.kernel.org
---
 drivers/md/bcache/writeback.c |  2 +-
 drivers/md/bcache/writeback.h | 19 +++++++++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 4ac8b13..25289e4 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -21,7 +21,7 @@
 static void __update_writeback_rate(struct cached_dev *dc)
 {
 	struct cache_set *c = dc->disk.c;
-	uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size;
+	uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size - bcache_flash_devs_sectors_dirty(c);
 	uint64_t cache_dirty_target =
 		div_u64(cache_sectors * dc->writeback_percent, 100);
 
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index c2ab4b4..24ff589 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -14,6 +14,25 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
 	return ret;
 }
 
+static inline uint64_t  bcache_flash_devs_sectors_dirty(struct cache_set *c)
+{
+	uint64_t i, ret = 0;
+
+	mutex_lock(&bch_register_lock);
+
+	for (i = 0; i < c->nr_uuids; i++) {
+		struct bcache_device *d = c->devices[i];
+
+		if (!d || !UUID_FLASH_ONLY(&c->uuids[i]))
+			continue;
+	   ret += bcache_dev_sectors_dirty(d);
+	}
+
+	mutex_unlock(&bch_register_lock);
+
+	return ret;
+}
+
 static inline unsigned offset_to_stripe(struct bcache_device *d,
 					uint64_t offset)
 {
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 12/19] bcache: update bucket_in_use periodically
  2017-06-30 20:42     ` [PATCH 01/19] bcache: Fix leak of bdev reference bcache
                         ` (9 preceding siblings ...)
  2017-06-30 20:43       ` [PATCH 11/19] bcache: Subtract dirty sectors of thin flash from cache_sectors in calculating writeback rate bcache
@ 2017-06-30 20:43       ` bcache
  2017-07-11  5:05         ` Coly Li
  2017-06-30 20:43       ` [PATCH 13/19] bcache: delete redundant calling set_gc_sectors() bcache
                         ` (8 subsequent siblings)
  19 siblings, 1 reply; 120+ messages in thread
From: bcache @ 2017-06-30 20:43 UTC (permalink / raw)
  To: linux-block; +Cc: linux-bcache, hch, axboe, Tang Junhui

From: Tang Junhui <tang.junhui@zte.com.cn>

bucket_in_use is updated in the gc thread, which is triggered by
invalidating or writing sectors_to_gc dirty data.  That interval can be
too long, so when we use it to compare with the threshold, the value is
often stale, which leads to inaccurate judgment and often results in
bucket depletion.

Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
---
 drivers/md/bcache/btree.c | 29 +++++++++++++++++++++++++++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 866dcf7..77aa20b 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -90,6 +90,8 @@
 #define MAX_NEED_GC		64
 #define MAX_SAVE_PRIO		72
 
+#define GC_THREAD_TIMEOUT_MS	(30 * 1000)
+
 #define PTR_DIRTY_BIT		(((uint64_t) 1 << 36))
 
 #define PTR_HASH(c, k)							\
@@ -1760,6 +1762,23 @@ static void bch_btree_gc(struct cache_set *c)
 	bch_moving_gc(c);
 }
 
+void bch_update_bucket_in_use(struct cache_set *c)
+{
+	struct cache *ca;
+	struct bucket *b;
+	unsigned i;
+	size_t available = 0;
+
+	for_each_cache(ca, c, i) {
+		for_each_bucket(b, ca) {
+			if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE)
+				available++;
+		}
+	}
+
+	c->gc_stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets;
+}
+
 static bool gc_should_run(struct cache_set *c)
 {
 	struct cache *ca;
@@ -1778,10 +1797,16 @@ static bool gc_should_run(struct cache_set *c)
 static int bch_gc_thread(void *arg)
 {
 	struct cache_set *c = arg;
+	long  ret;
+	unsigned long timeout = msecs_to_jiffies(GC_THREAD_TIMEOUT_MS);
 
 	while (1) {
-		wait_event_interruptible(c->gc_wait,
-			   kthread_should_stop() || gc_should_run(c));
+		ret = wait_event_interruptible_timeout(c->gc_wait,
+			   kthread_should_stop() || gc_should_run(c), timeout);
+		if (!ret) {
+			bch_update_bucket_in_use(c);
+			continue;
+		}
 
 		if (kthread_should_stop())
 			break;
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 13/19] bcache: delete redundant calling set_gc_sectors()
  2017-06-30 20:42     ` [PATCH 01/19] bcache: Fix leak of bdev reference bcache
                         ` (10 preceding siblings ...)
  2017-06-30 20:43       ` [PATCH 12/19] bcache: update bucket_in_use periodically bcache
@ 2017-06-30 20:43       ` bcache
  2017-07-13  3:41         ` Eric Wheeler
  2017-06-30 20:43       ` [PATCH 14/19] bcache: Correct return value for sysfs attach errors bcache
                         ` (7 subsequent siblings)
  19 siblings, 1 reply; 120+ messages in thread
From: bcache @ 2017-06-30 20:43 UTC (permalink / raw)
  To: linux-block; +Cc: linux-bcache, hch, axboe, Tang Junhui

From: Tang Junhui <tang.junhui@zte.com.cn>

set_gc_sectors() has already been called in bch_gc_thread(), and it was
called again in bch_btree_gc_finish().  The latter call is unnecessary,
so delete it.

Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
---
 drivers/md/bcache/btree.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 77aa20b..66d8036 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -1662,7 +1662,6 @@ static size_t bch_btree_gc_finish(struct cache_set *c)
 
 	mutex_lock(&c->bucket_lock);
 
-	set_gc_sectors(c);
 	c->gc_mark_valid = 1;
 	c->need_gc	= 0;
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 14/19] bcache: Correct return value for sysfs attach errors
  2017-06-30 20:42     ` [PATCH 01/19] bcache: Fix leak of bdev reference bcache
                         ` (11 preceding siblings ...)
  2017-06-30 20:43       ` [PATCH 13/19] bcache: delete redundant calling set_gc_sectors() bcache
@ 2017-06-30 20:43       ` bcache
  2017-06-30 20:43       ` [PATCH 15/19] bcache: fix issue of writeback rate at minimum 1 key per second bcache
                         ` (6 subsequent siblings)
  19 siblings, 0 replies; 120+ messages in thread
From: bcache @ 2017-06-30 20:43 UTC (permalink / raw)
  To: linux-block; +Cc: linux-bcache, hch, axboe, Tony Asleson, stable

From: Tony Asleson <tasleson@redhat.com>

If you encounter any errors in bch_cached_dev_attach it will return a negative
error code.  The variable 'v' which stores the result is unsigned, thus user
space sees a very large value returned for bytes written, which can cause
incorrect user-space behavior.  Use a single signed variable throughout
the function to preserve the error-return capability.

Signed-off-by: Tony Asleson <tasleson@redhat.com>
Acked-by: Coly Li <colyli@suse.de>
Cc: stable@vger.kernel.org
---
 drivers/md/bcache/sysfs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index cc0076d..7579ca6 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -206,7 +206,7 @@ STORE(__cached_dev)
 {
 	struct cached_dev *dc = container_of(kobj, struct cached_dev,
 					     disk.kobj);
-	unsigned v = size;
+	ssize_t v = size;
 	struct cache_set *c;
 	struct kobj_uevent_env *env;
 	unsigned ioprio_class = 0; /* invalid initial ioprio values */
@@ -245,7 +245,7 @@ STORE(__cached_dev)
 		bch_cached_dev_run(dc);
 
 	if (attr == &sysfs_cache_mode) {
-		ssize_t v = bch_read_string_list(buf, bch_cache_modes + 1);
+		v = bch_read_string_list(buf, bch_cache_modes + 1);
 
 		if (v < 0)
 			return v;
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 15/19] bcache: fix issue of writeback rate at minimum 1 key per second
  2017-06-30 20:42     ` [PATCH 01/19] bcache: Fix leak of bdev reference bcache
                         ` (12 preceding siblings ...)
  2017-06-30 20:43       ` [PATCH 14/19] bcache: Correct return value for sysfs attach errors bcache
@ 2017-06-30 20:43       ` bcache
  2017-07-16 10:04         ` Coly Li
  2017-06-30 20:43         ` bcache
                         ` (5 subsequent siblings)
  19 siblings, 1 reply; 120+ messages in thread
From: bcache @ 2017-06-30 20:43 UTC (permalink / raw)
  To: linux-block; +Cc: linux-bcache, hch, axboe, Tang Junhui

From: Tang Junhui <tang.junhui@zte.com.cn>

When there is not enough dirty data in the writeback cache, the
writeback rate stays at a minimum of 1 key per second
until all dirty data is cleaned.  This is inefficient
and also wastes energy;

in this patch, When there is not enough dirty data,
let the writeback rate to be 0, and writeback re-schedule
in bch_writeback_thread() periodically with schedule_timeout(),
the behaviors are as follows :

1) If no dirty data have been read into dc->writeback_keys,
goto step 2), otherwise keep writing these dirty data to
back-end device at 1 key per second, until all these dirty data
write over, then goto step 2).

2) Loop in bch_writeback_thread() to check if there is enough
dirty data for writeback.  If there is not enough dirty data for
writing, then sleep 10 seconds; otherwise, write dirty data to the
back-end device.

Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
---
 drivers/md/bcache/util.c      |  9 ++++++++-
 drivers/md/bcache/writeback.c | 11 +++++++----
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index 8c3a938..49dcf09 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -210,7 +210,14 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
 {
 	uint64_t now = local_clock();
 
-	d->next += div_u64(done * NSEC_PER_SEC, d->rate);
+	/*
+	  if d->rate is zero, write the left dirty data
+	  at the speed of one key per second
+	*/
+	if(!d->rate)
+		d->next = now + NSEC_PER_SEC;
+	else
+		d->next += div_u64(done * NSEC_PER_SEC, d->rate);
 
 	if (time_before64(now + NSEC_PER_SEC, d->next))
 		d->next = now + NSEC_PER_SEC;
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 25289e4..4104eaa 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -16,6 +16,8 @@
 #include <linux/sched/clock.h>
 #include <trace/events/bcache.h>
 
+#define WRITE_BACK_WAIT_CYCLE		10 * HZ
+
 /* Rate limiting */
 
 static void __update_writeback_rate(struct cached_dev *dc)
@@ -55,13 +57,14 @@ static void __update_writeback_rate(struct cached_dev *dc)
 
 	/* Don't increase writeback rate if the device isn't keeping up */
 	if (change > 0 &&
+	    dc->writeback_rate.rate >0 &&
 	    time_after64(local_clock(),
 			 dc->writeback_rate.next + NSEC_PER_MSEC))
 		change = 0;
 
 	dc->writeback_rate.rate =
 		clamp_t(int64_t, (int64_t) dc->writeback_rate.rate + change,
-			1, NSEC_PER_MSEC);
+			0, NSEC_PER_MSEC);
 
 	dc->writeback_rate_proportional = proportional;
 	dc->writeback_rate_derivative = derivative;
@@ -420,15 +423,15 @@ static int bch_writeback_thread(void *arg)
 	while (!kthread_should_stop()) {
 		down_write(&dc->writeback_lock);
 		if (!atomic_read(&dc->has_dirty) ||
-		    (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) &&
-		     !dc->writeback_running)) {
+		    ((!dc->writeback_rate.rate || !dc->writeback_running) &&
+		      !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))) {
 			up_write(&dc->writeback_lock);
 			set_current_state(TASK_INTERRUPTIBLE);
 
 			if (kthread_should_stop())
 				return 0;
 
-			schedule();
+			schedule_timeout(WRITE_BACK_WAIT_CYCLE);
 			continue;
 		}
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 16/19] bcache: increase the number of open buckets
  2017-06-30 20:42     ` [PATCH 01/19] bcache: Fix leak of bdev reference bcache
@ 2017-06-30 20:43         ` bcache
  2017-06-30 20:42       ` [PATCH 03/19] bcache: do not subtract sectors_to_gc for bypassed IO bcache
                           ` (18 subsequent siblings)
  19 siblings, 0 replies; 120+ messages in thread
From: bcache @ 2017-06-30 20:43 UTC (permalink / raw)
  To: linux-block; +Cc: linux-bcache, hch, axboe, Tang Junhui

From: Tang Junhui <tang.junhui@zte.com.cn>

Currently, we only allocate 6 open buckets for each cache set,
but we usually attach about 10 or so backend devices to each
cache set, and each bcache device is accessed by about 10 or so
threads in the top application layer.  So 6 open buckets are too
few: it leads to the same thread writing data to different buckets,
which causes inefficient write-back, makes bucket usage
inefficient, and makes it very easy to run out of buckets.

I add debug message in bch_open_buckets_alloc() to print alloc bucket
info, and test with ten bcache devices with a cache set, and each
bcache device is accessed by ten threads.

>From the debug message, we can see that, after the modification, one
bucket is more likely to be assigned to the same thread, and data from
the same thread is more likely to be written to the same bucket.
Usually the same thread always writes/reads the same backend device, so
this is good for write-back and also promotes the usage efficiency of
buckets.

Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
---
 drivers/md/bcache/alloc.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index ca4abe1..cacbe2d 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -68,6 +68,8 @@
 #include <linux/random.h>
 #include <trace/events/bcache.h>
 
+#define MAX_OPEN_BUCKETS 128
+
 /* Bucket heap / gen */
 
 uint8_t bch_inc_gen(struct cache *ca, struct bucket *b)
@@ -671,7 +673,7 @@ int bch_open_buckets_alloc(struct cache_set *c)
 
 	spin_lock_init(&c->data_bucket_lock);
 
-	for (i = 0; i < 6; i++) {
+	for (i = 0; i < MAX_OPEN_BUCKETS; i++) {
 		struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL);
 		if (!b)
 			return -ENOMEM;
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 16/19] bcache: increase the number of open buckets
@ 2017-06-30 20:43         ` bcache
  0 siblings, 0 replies; 120+ messages in thread
From: bcache @ 2017-06-30 20:43 UTC (permalink / raw)
  To: linux-block; +Cc: linux-bcache, hch, axboe, Tang Junhui

From: Tang Junhui <tang.junhui@zte.com.cn>

Currently, we only allocate 6 open buckets for each cache set,
but we usually attach about 10 or so backend devices to each
cache set, and each bcache device is accessed by about 10 or so
threads in the top application layer.  So 6 open buckets are too
few: it leads to the same thread writing data to different buckets,
which causes inefficient write-back, makes bucket usage
inefficient, and makes it very easy to run out of buckets.

I add debug message in bch_open_buckets_alloc() to print alloc bucket
info, and test with ten bcache devices with a cache set, and each
bcache device is accessed by ten threads.

From the debug message, we can see that, after the modification, one
bucket is more likely to be assigned to the same thread, and data from
the same thread is more likely to be written to the same bucket.
Usually the same thread always writes/reads the same backend device, so
this is good for write-back and also promotes the usage efficiency of
buckets.

Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
---
 drivers/md/bcache/alloc.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index ca4abe1..cacbe2d 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -68,6 +68,8 @@
 #include <linux/random.h>
 #include <trace/events/bcache.h>
 
+#define MAX_OPEN_BUCKETS 128
+
 /* Bucket heap / gen */
 
 uint8_t bch_inc_gen(struct cache *ca, struct bucket *b)
@@ -671,7 +673,7 @@ int bch_open_buckets_alloc(struct cache_set *c)
 
 	spin_lock_init(&c->data_bucket_lock);
 
-	for (i = 0; i < 6; i++) {
+	for (i = 0; i < MAX_OPEN_BUCKETS; i++) {
 		struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL);
 		if (!b)
 			return -ENOMEM;
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 17/19] bcache: fix for gc and write-back race
  2017-06-30 20:42     ` [PATCH 01/19] bcache: Fix leak of bdev reference bcache
                         ` (14 preceding siblings ...)
  2017-06-30 20:43         ` bcache
@ 2017-06-30 20:43       ` bcache
  2017-08-03 16:20         ` Coly Li
  2017-06-30 20:43       ` [PATCH 18/19] bcache: silence static checker warning bcache
                         ` (3 subsequent siblings)
  19 siblings, 1 reply; 120+ messages in thread
From: bcache @ 2017-06-30 20:43 UTC (permalink / raw)
  To: linux-block; +Cc: linux-bcache, hch, axboe, Tang Junhui, stable

From: Tang Junhui <tang.junhui@zte.com.cn>

gc and write-back race with each other (see the email "bcache get stucked"
I sent before):
gc thread						write-back thread
|							|bch_writeback_thread()
|bch_gc_thread()					|
|							|==>read_dirty()
|==>bch_btree_gc()					|
|==>btree_root() //get btree root			|
|			node write locker		|
|==>bch_btree_gc_root()					|
|							|==>read_dirty_submit()
|							|==>write_dirty()
|							|==>continue_at(cl, write_dirty_finish, system_wq);
|							|==>write_dirty_finish()//excute in system_wq
|							|==>bch_btree_insert()
|							|==>bch_btree_map_leaf_nodes()
|							|==>__bch_btree_map_nodes()
|							|==>btree_root //try to get btree root node read lock
|							|-----stuck here
|==>bch_btree_set_root()				|
|==>bch_journal_meta()					|
|==>bch_journal()					|
|==>journal_try_write()					|
|==>journal_write_unlocked() //journal_full(&c->journal) condition satisfied
|==>continue_at(cl, journal_write, system_wq); //try to excute journal_write in system_wq
|					//but work queue is excuting write_dirty_finish()
|==>closure_sync(); //wait journal_write execute over and wake up gc,
|			--stuck here
|==>release root node write locker

This patch allocates a separate workqueue for the write-back thread to
avoid such a race.

Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
Cc: stable@vger.kernel.org
---
 drivers/md/bcache/bcache.h    | 1 +
 drivers/md/bcache/super.c     | 2 ++
 drivers/md/bcache/writeback.c | 8 ++++++--
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 44123e4..deb0a6c 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -333,6 +333,7 @@ struct cached_dev {
 	/* Limit number of writeback bios in flight */
 	struct semaphore	in_flight;
 	struct task_struct	*writeback_thread;
+	struct workqueue_struct	*writeback_write_wq;
 
 	struct keybuf		writeback_keys;
 
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index e06641e..24cb9b7 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1063,6 +1063,8 @@ static void cached_dev_free(struct closure *cl)
 	cancel_delayed_work_sync(&dc->writeback_rate_update);
 	if (!IS_ERR_OR_NULL(dc->writeback_thread))
 		kthread_stop(dc->writeback_thread);
+	if (dc->writeback_write_wq)
+		destroy_workqueue(dc->writeback_write_wq);
 
 	mutex_lock(&bch_register_lock);
 
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 4104eaa..4bc5daa 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -189,7 +189,7 @@ static void write_dirty(struct closure *cl)
 
 	closure_bio_submit(&io->bio, cl);
 
-	continue_at(cl, write_dirty_finish, system_wq);
+	continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq);
 }
 
 static void read_dirty_endio(struct bio *bio)
@@ -209,7 +209,7 @@ static void read_dirty_submit(struct closure *cl)
 
 	closure_bio_submit(&io->bio, cl);
 
-	continue_at(cl, write_dirty, system_wq);
+	continue_at(cl, write_dirty, io->dc->writeback_write_wq);
 }
 
 static void read_dirty(struct cached_dev *dc)
@@ -527,6 +527,10 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
 
 int bch_cached_dev_writeback_start(struct cached_dev *dc)
 {
+	dc->writeback_write_wq = alloc_workqueue("bcache_writeback_wq", WQ_MEM_RECLAIM, 0);
+	if (!dc->writeback_write_wq)
+		return -ENOMEM;
+
 	dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
 					      "bcache_writeback");
 	if (IS_ERR(dc->writeback_thread))
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 18/19] bcache: silence static checker warning
  2017-06-30 20:42     ` [PATCH 01/19] bcache: Fix leak of bdev reference bcache
                         ` (15 preceding siblings ...)
  2017-06-30 20:43       ` [PATCH 17/19] bcache: fix for gc and write-back race bcache
@ 2017-06-30 20:43       ` bcache
  2017-07-13  9:44         ` Coly Li
  2017-06-30 20:43       ` [PATCH 19/19] bcache: Update continue_at() documentation bcache
                         ` (2 subsequent siblings)
  19 siblings, 1 reply; 120+ messages in thread
From: bcache @ 2017-06-30 20:43 UTC (permalink / raw)
  To: linux-block; +Cc: linux-bcache, hch, axboe, Dan Carpenter

From: Dan Carpenter <dan.carpenter@oracle.com>

In olden times, closure_return() used to have a hidden return built in.
We removed the hidden return but forgot to add a new return here.  If
"c" were NULL we would oops on the next line, but fortunately "c" is
never NULL.  Let's just remove the if statement.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
---
 drivers/md/bcache/super.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 24cb9b7..243391d 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1381,9 +1381,6 @@ static void cache_set_flush(struct closure *cl)
 	struct btree *b;
 	unsigned i;
 
-	if (!c)
-		closure_return(cl);
-
 	bch_cache_accounting_destroy(&c->accounting);
 
 	kobject_put(&c->internal);
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* [PATCH 19/19] bcache: Update continue_at() documentation
  2017-06-30 20:42     ` [PATCH 01/19] bcache: Fix leak of bdev reference bcache
                         ` (16 preceding siblings ...)
  2017-06-30 20:43       ` [PATCH 18/19] bcache: silence static checker warning bcache
@ 2017-06-30 20:43       ` bcache
  2017-07-05 18:48         ` Christoph Hellwig
  2017-07-08 18:12         ` Coly Li
  2017-07-01 16:55       ` [PATCH 01/19] bcache: Fix leak of bdev reference Coly Li
  2017-07-05 18:24       ` Christoph Hellwig
  19 siblings, 2 replies; 120+ messages in thread
From: bcache @ 2017-06-30 20:43 UTC (permalink / raw)
  To: linux-block; +Cc: linux-bcache, hch, axboe, Dan Carpenter

From: Dan Carpenter <dan.carpenter@oracle.com>

continue_at() doesn't have a return statement anymore.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
---
 drivers/md/bcache/closure.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index 1ec84ca..295b7e4 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -312,8 +312,6 @@ static inline void closure_wake_up(struct closure_waitlist *list)
  * been dropped with closure_put()), it will resume execution at @fn running out
  * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly).
  *
- * NOTE: This macro expands to a return in the calling function!
- *
  * This is because after calling continue_at() you no longer have a ref on @cl,
  * and whatever @cl owns may be freed out from under you - a running closure fn
  * has a ref on its own closure which continue_at() drops.
@@ -340,8 +338,6 @@ do {									\
  * Causes @fn to be executed out of @cl, in @wq context (or called directly if
  * @wq is NULL).
  *
- * NOTE: like continue_at(), this macro expands to a return in the caller!
- *
  * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn,
  * thus it's not safe to touch anything protected by @cl after a
  * continue_at_nobarrier().
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* Re: [PATCH 01/19] bcache: Fix leak of bdev reference
  2017-06-30 20:42     ` [PATCH 01/19] bcache: Fix leak of bdev reference bcache
                         ` (17 preceding siblings ...)
  2017-06-30 20:43       ` [PATCH 19/19] bcache: Update continue_at() documentation bcache
@ 2017-07-01 16:55       ` Coly Li
  2017-07-05 18:24       ` Christoph Hellwig
  19 siblings, 0 replies; 120+ messages in thread
From: Coly Li @ 2017-07-01 16:55 UTC (permalink / raw)
  To: bcache, linux-block; +Cc: linux-bcache, hch, axboe, Jan Kara, stable

On 2017/7/1 上午4:42, bcache@lists.ewheeler.net wrote:
> From: Jan Kara <jack@suse.cz>
> 
> If blkdev_get_by_path() in register_bcache() fails, we try to lookup the
> block device using lookup_bdev() to detect which situation we are in to
> properly report error. However we never drop the reference returned to
> us from lookup_bdev(). Fix that.
> 
> Signed-off-by: Jan Kara <jack@suse.cz>
> Cc: stable@vger.kernel.org

Acked-by: Coly Li <colyli@suse.de>

Thanks.

> ---
>  drivers/md/bcache/super.c | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
> index 8352fad..9a2c190 100644
> --- a/drivers/md/bcache/super.c
> +++ b/drivers/md/bcache/super.c
> @@ -1964,6 +1964,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
>  			else
>  				err = "device busy";
>  			mutex_unlock(&bch_register_lock);
> +			if (!IS_ERR(bdev))
> +				bdput(bdev);
>  			if (attr == &ksysfs_register_quiet)
>  				goto out;
>  		}
> 


-- 
Coly Li

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 03/19] bcache: do not subtract sectors_to_gc for bypassed IO
  2017-06-30 20:42       ` [PATCH 03/19] bcache: do not subtract sectors_to_gc for bypassed IO bcache
@ 2017-07-01 17:26         ` Coly Li
  2017-07-05 18:25         ` Christoph Hellwig
  1 sibling, 0 replies; 120+ messages in thread
From: Coly Li @ 2017-07-01 17:26 UTC (permalink / raw)
  To: bcache, linux-block; +Cc: linux-bcache, hch, axboe, Tang Junhui, stable

On 2017/7/1 上午4:42, bcache@lists.ewheeler.net wrote:
> From: Tang Junhui <tang.junhui@zte.com.cn>
> 
> Since bypassed IOs use no bucket, so do not subtract sectors_to_gc to
> trigger gc thread.
> 
> Signed-off-by: tang.junhui <tang.junhui@zte.com.cn>
> Reviewed-by: Eric Wheeler <bcache@linux.ewheeler.net>
> Cc: stable@vger.kernel.org

Acked-by: Coly Li <colyli@suse.de>

Thanks.

> ---
>  drivers/md/bcache/request.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
> index 958072a..4b413db 100644
> --- a/drivers/md/bcache/request.c
> +++ b/drivers/md/bcache/request.c
> @@ -196,12 +196,12 @@ static void bch_data_insert_start(struct closure *cl)
>  	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
>  	struct bio *bio = op->bio, *n;
>  
> -	if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0)
> -		wake_up_gc(op->c);
> -
>  	if (op->bypass)
>  		return bch_data_invalidate(cl);
>  
> +	if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0)
> +		wake_up_gc(op->c);
> +
>  	/*
>  	 * Journal writes are marked REQ_PREFLUSH; if the original write was a
>  	 * flush, it'll wait on the journal write.
> 


-- 
Coly Li

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 04/19] bcache: fix wrong cache_misses statistics
  2017-06-30 20:42       ` [PATCH 04/19] bcache: fix wrong cache_misses statistics bcache
@ 2017-07-01 17:58         ` Coly Li
  2017-07-13  4:09           ` Eric Wheeler
  0 siblings, 1 reply; 120+ messages in thread
From: Coly Li @ 2017-07-01 17:58 UTC (permalink / raw)
  To: Tang Junhui; +Cc: bcache, linux-block, linux-bcache, hch, axboe, stable

On 2017/7/1 上午4:42, bcache@lists.ewheeler.net wrote:
> From: Tang Junhui <tang.junhui@zte.com.cn>
> 
> Some missed IOs are not counted into cache_misses, this patch fix this
> issue.

Could you please explain more about,
- which kind of missed I/O are not counted
- where cache_missed is located

This will help the patch to be more understandable.

> 
> Signed-off-by: tang.junhui <tang.junhui@zte.com.cn>
> Reviewed-by: Eric Wheeler <bcache@linux.ewheeler.net>
> Cc: stable@vger.kernel.org

[snip]

> @@ -758,7 +760,7 @@ static void cached_dev_read_done_bh(struct closure *cl)
>  	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
>  
>  	bch_mark_cache_accounting(s->iop.c, s->d,
> -				  !s->cache_miss, s->iop.bypass);
> +				  !s->cache_missed, s->iop.bypass);
>  	trace_bcache_read(s->orig_bio, !s->cache_miss, s->iop.bypass);


Should the above line be changed to,
	trace_bcache_read(s->orig_bio, !s->cache_missed, s->iop.bypass);
as well ?


[snip]

Thanks.

-- 
Coly Li

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 06/19] bcache: explicitly destory mutex while exiting
  2017-06-30 20:42       ` [PATCH 06/19] bcache: explicitly destory mutex while exiting bcache
@ 2017-07-01 18:43         ` Coly Li
  2017-07-05 11:58             ` Liang Chen
  2017-07-05 18:27         ` Christoph Hellwig
  1 sibling, 1 reply; 120+ messages in thread
From: Coly Li @ 2017-07-01 18:43 UTC (permalink / raw)
  To: Liang Chen; +Cc: bcache, linux-block, linux-bcache, hch, axboe, stable

On 2017/7/1 上午4:42, bcache@lists.ewheeler.net wrote:
> From: Liang Chen <liangchen.linux@gmail.com>
> 
> mutex_destroy does nothing most of time, but it's better to call
> it to make the code future proof and it also has some meaning
> for like mutex debug.
> 
> Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
> Reviewed-by: Eric Wheeler <bcache@linux.ewheeler.net>
> Cc: stable@vger.kernel.org
> ---
>  drivers/md/bcache/super.c | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
> index 48b8c20..1f84791 100644
> --- a/drivers/md/bcache/super.c
> +++ b/drivers/md/bcache/super.c
> @@ -2089,6 +2089,7 @@ static void bcache_exit(void)
>  	if (bcache_major)
>  		unregister_blkdev(bcache_major, "bcache");
>  	unregister_reboot_notifier(&reboot);
> +	mutex_destroy(&bch_register_lock>  }
>  
>  static int __init bcache_init(void)
> @@ -2106,6 +2107,7 @@ static int __init bcache_init(void)
>  
>  	bcache_major = register_blkdev(0, "bcache");
>  	if (bcache_major < 0) {
> +		mutex_destroy(&bch_register_lock);
>  		unregister_reboot_notifier(&reboot);
>  		return bcache_major;
>  	}
> 

Hi Liang,

Current code might have a potential race in a very corner case, see,
2084 static int __init bcache_init(void)
2085 {
2086         static const struct attribute *files[] = {
2087                 &ksysfs_register.attr,
2088                 &ksysfs_register_quiet.attr,
2089                 NULL
2090         };
2091
2092         mutex_init(&bch_register_lock);
2093         init_waitqueue_head(&unregister_wait);
2094         register_reboot_notifier(&reboot);
2095         closure_debug_init();
2096
2097         bcache_major = register_blkdev(0, "bcache");
2098         if (bcache_major < 0) {
2099                 unregister_reboot_notifier(&reboot);
2100                 return bcache_major;
2101         }
2102
2103         if (!(bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM,
0)) ||
2104             !(bcache_kobj = kobject_create_and_add("bcache",
fs_kobj)) ||
2105             sysfs_create_files(bcache_kobj, files) ||
2106             bch_request_init() ||
2107             bch_debug_init(bcache_kobj))
2108                 goto err;
2109
2110         return 0;
2111 err:
2112         bcache_exit();
2113         return -ENOMEM;
2114 }

At line 2107, most of bache stuffs are ready to work, only a debugfs
entry not created yet. If in the time gap between line 2106 and line
2107, another user space tool just registers cache and backing devices.
Then bch_debug_init() failed, and bcache_exit() gets called. In this
case, I doubt bcache_exit() can handle all the references correctly.

The race is very rare, and almost won't happen in real life. So, if you
don't care about it, the patch can be simpler like this,
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index e57353e39168..fb5453a46a03 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -2070,6 +2070,7 @@ static struct notifier_block reboot = {

 static void bcache_exit(void)
 {
+       mutex_destroy(&bch_register_lock);
        bch_debug_exit();
        bch_request_exit();
        if (bcache_kobj)
@@ -2089,7 +2090,6 @@ static int __init bcache_init(void)
                NULL
        };

-       mutex_init(&bch_register_lock);
        init_waitqueue_head(&unregister_wait);
        register_reboot_notifier(&reboot);
        closure_debug_init();
@@ -2107,6 +2107,7 @@ static int __init bcache_init(void)
            bch_debug_init(bcache_kobj))
                goto err;

+       mutex_init(&bch_register_lock);
        return 0;
 err:
        bcache_exit();
---
And if you do care about the race, maybe you should do something like this,
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index e57353e39168..ca1d8b7a7815 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -2079,6 +2079,7 @@ static void bcache_exit(void)
        if (bcache_major)
                unregister_blkdev(bcache_major, "bcache");
        unregister_reboot_notifier(&reboot);
+       mutex_unlock(&bch_register_lock);
 }

 static int __init bcache_init(void)
@@ -2090,6 +2091,7 @@ static int __init bcache_init(void)
        };

        mutex_init(&bch_register_lock);
+       mutex_lock(&bch_register_lock);
        init_waitqueue_head(&unregister_wait);
        register_reboot_notifier(&reboot);
        closure_debug_init();
@@ -2097,6 +2099,8 @@ static int __init bcache_init(void)
        bcache_major = register_blkdev(0, "bcache");
        if (bcache_major < 0) {
                unregister_reboot_notifier(&reboot);
+               mutex_unlock(&bch_register_lock);
+               mutex_destroy(&bch_register_lock);
                return bcache_major;
        }

@@ -2104,9 +2108,12 @@ static int __init bcache_init(void)
            !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
            sysfs_create_files(bcache_kobj, files) ||
            bch_request_init() ||
-           bch_debug_init(bcache_kobj))
+           bch_debug_init(bcache_kobj)) {
+               mutex_unlock(&bch_register_lock);
                goto err;
+       }

+       mutex_unlock(&bch_register_lock);
        return 0;
 err:
        bcache_exit();
---

Personally I think the first approach with only one new line code added,
your original version will add two new lines of code.

Just FYI. Thanks.

-- 
Coly Li

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* Re: [PATCH 09/19] bcache: update bio->bi_opf bypass/writeback REQ_ flag hints
  2017-06-30 20:42       ` [PATCH 09/19] bcache: update bio->bi_opf bypass/writeback REQ_ flag hints bcache
@ 2017-07-01 18:49         ` Coly Li
  2017-07-01 19:39           ` Eric Wheeler
  0 siblings, 1 reply; 120+ messages in thread
From: Coly Li @ 2017-07-01 18:49 UTC (permalink / raw)
  To: bcache; +Cc: linux-block, linux-bcache, hch, axboe, Eric Wheeler, Eric Wheeler

On 2017/7/1 上午4:42, bcache@lists.ewheeler.net wrote:
> From: Eric Wheeler <git@linux.ewheeler.net>
> 
> Bypass if:     bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND)
> 
> Writeback if:  op_is_sync(bio->bi_opf) || bio->bi_opf & (REQ_META|REQ_PRIO)
> 

Hi Eric,

Could you please explain a bit how the above policy is designed ? I'd
like to understand it more.

Thanks.

Coly


> Signed-off-by: Eric Wheeler <bcache@linux.ewheeler.net>
> ---
>  drivers/md/bcache/request.c   | 3 +++
>  drivers/md/bcache/writeback.h | 3 ++-
>  2 files changed, 5 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
> index a95609f..4629b0c 100644
> --- a/drivers/md/bcache/request.c
> +++ b/drivers/md/bcache/request.c
> @@ -386,6 +386,9 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
>  	     op_is_write(bio_op(bio))))
>  		goto skip;
>  
> +	if (bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND))
> +		goto skip;
> +
>  	/* If the ioprio already exists on the bio, use that.  We assume that
>  	 * the upper layer properly assigned the calling process's ioprio to
>  	 * the bio being passed to bcache. Otherwise, use current's ioc. */
> diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
> index cd82fe8..ea2f92e 100644
> --- a/drivers/md/bcache/writeback.h
> +++ b/drivers/md/bcache/writeback.h
> @@ -81,7 +81,8 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
>  		return true;
>  	}
>  
> -	return op_is_sync(bio->bi_opf) || in_use <= CUTOFF_WRITEBACK;
> +	return op_is_sync(bio->bi_opf) || bio->bi_opf & (REQ_META|REQ_PRIO)
> +		|| in_use <= CUTOFF_WRITEBACK;
>  }
>  
>  static inline void bch_writeback_queue(struct cached_dev *dc)
> 

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 10/19] bcache: initialize stripe_sectors_dirty correctly for thin flash device
  2017-06-30 20:42       ` [PATCH 10/19] bcache: initialize stripe_sectors_dirty correctly for thin flash device bcache
@ 2017-07-01 18:52         ` Coly Li
  2017-07-13  4:10           ` Eric Wheeler
  0 siblings, 1 reply; 120+ messages in thread
From: Coly Li @ 2017-07-01 18:52 UTC (permalink / raw)
  To: Tang Junhui; +Cc: bcache, linux-block, linux-bcache, hch, axboe, stable

On 2017/7/1 上午4:42, bcache@lists.ewheeler.net wrote:
> From: Tang Junhui <tang.junhui@zte.com.cn>
> 
> Thin flash device does not initialize stripe_sectors_dirty correctly, this
> patch fixes this issue.

Hi Junhui,

Could you please explain why stripe_sectors_dirty is not correctly
initialized and how about its negative result ?

Thanks.

Coly

> 
> Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
> Cc: stable@vger.kernel.org
> ---
>  drivers/md/bcache/super.c     | 3 ++-
>  drivers/md/bcache/writeback.c | 8 ++++----
>  drivers/md/bcache/writeback.h | 2 +-
>  3 files changed, 7 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
> index 1f84791..e06641e 100644
> --- a/drivers/md/bcache/super.c
> +++ b/drivers/md/bcache/super.c
> @@ -1030,7 +1030,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
>  	}
>  
>  	if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
> -		bch_sectors_dirty_init(dc);
> +		bch_sectors_dirty_init(&dc->disk);
>  		atomic_set(&dc->has_dirty, 1);
>  		atomic_inc(&dc->count);
>  		bch_writeback_queue(dc);
> @@ -1232,6 +1232,7 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
>  		goto err;
>  
>  	bcache_device_attach(d, c, u - c->uuids);
> +	bch_sectors_dirty_init(d);
>  	bch_flash_dev_request_init(d);
>  	add_disk(d->disk);
>  
> diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
> index 3d463f0..4ac8b13 100644
> --- a/drivers/md/bcache/writeback.c
> +++ b/drivers/md/bcache/writeback.c
> @@ -482,17 +482,17 @@ static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
>  	return MAP_CONTINUE;
>  }
>  
> -void bch_sectors_dirty_init(struct cached_dev *dc)
> +void bch_sectors_dirty_init(struct bcache_device *d)
>  {
>  	struct sectors_dirty_init op;
>  
>  	bch_btree_op_init(&op.op, -1);
> -	op.inode = dc->disk.id;
> +	op.inode = d->id;
>  
> -	bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0),
> +	bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0),
>  			   sectors_dirty_init_fn, 0);
>  
> -	dc->disk.sectors_dirty_last = bcache_dev_sectors_dirty(&dc->disk);
> +	d->sectors_dirty_last = bcache_dev_sectors_dirty(d);
>  }
>  
>  void bch_cached_dev_writeback_init(struct cached_dev *dc)
> diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
> index ea2f92e..c2ab4b4 100644
> --- a/drivers/md/bcache/writeback.h
> +++ b/drivers/md/bcache/writeback.h
> @@ -109,7 +109,7 @@ static inline void bch_writeback_add(struct cached_dev *dc)
>  
>  void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int);
>  
> -void bch_sectors_dirty_init(struct cached_dev *dc);
> +void bch_sectors_dirty_init(struct bcache_device *);
>  void bch_cached_dev_writeback_init(struct cached_dev *);
>  int bch_cached_dev_writeback_start(struct cached_dev *);

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 09/19] bcache: update bio->bi_opf bypass/writeback REQ_ flag hints
  2017-07-01 18:49         ` Coly Li
@ 2017-07-01 19:39           ` Eric Wheeler
  2017-07-02  6:51             ` Coly Li
  0 siblings, 1 reply; 120+ messages in thread
From: Eric Wheeler @ 2017-07-01 19:39 UTC (permalink / raw)
  To: Coly Li; +Cc: bcache, linux-block, linux-bcache, hch, axboe

[-- Attachment #1: Type: TEXT/PLAIN, Size: 2909 bytes --]

On Sun, 2 Jul 2017, Coly Li wrote:

> On 2017/7/1 上午4:42, bcache@lists.ewheeler.net wrote:
> > From: Eric Wheeler <git@linux.ewheeler.net>
> > 
> > Bypass if:     bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND)
> > 
> > Writeback if:  op_is_sync(bio->bi_opf) || bio->bi_opf & (REQ_META|REQ_PRIO)
> > 
> 
> Hi Eric,
> 
> Could you please explain a bit how the above policy is designed ? I'd
> like to understand it more.

Hi Coly,

It is pretty trivial, all of the processing code exists already.  This 
patch only adds to the existing functions for the constraints noted in the 
patch.

What happens is this: cached_dev_make_request() in request.c takes the IO 
decides where the IO should go by using these two functions to decide 
whether a bio should writeback or bypass:
	check_should_bypass()
and
	should_writeback()

In check_should_bypass(), `if (bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND))`
means bypass the cache because the bio has been flagged as a readahead or 
background IO (no sense in polluting cache in either case).

Later, if the bio is a write, then cached_dev_write() checks 
should_writeback()

In bio->bi_opf & (REQ_META|REQ_PRIO), it means writeback to cache.  Some 
filesystems use REQ_META for metadata operations which are best to keep 
low-latency for high transactional performance.  REQ_PRIO is a CFQ hint 
that the priority of the IO has been raised, so writeback is faster here.  


--
Eric Wheeler



> 
> Thanks.
> 
> Coly
> 
> 
> > Signed-off-by: Eric Wheeler <bcache@linux.ewheeler.net>
> > ---
> >  drivers/md/bcache/request.c   | 3 +++
> >  drivers/md/bcache/writeback.h | 3 ++-
> >  2 files changed, 5 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
> > index a95609f..4629b0c 100644
> > --- a/drivers/md/bcache/request.c
> > +++ b/drivers/md/bcache/request.c
> > @@ -386,6 +386,9 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
> >  	     op_is_write(bio_op(bio))))
> >  		goto skip;
> >  
> > +	if (bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND))
> > +		goto skip;
> > +
> >  	/* If the ioprio already exists on the bio, use that.  We assume that
> >  	 * the upper layer properly assigned the calling process's ioprio to
> >  	 * the bio being passed to bcache. Otherwise, use current's ioc. */
> > diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
> > index cd82fe8..ea2f92e 100644
> > --- a/drivers/md/bcache/writeback.h
> > +++ b/drivers/md/bcache/writeback.h
> > @@ -81,7 +81,8 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
> >  		return true;
> >  	}
> >  
> > -	return op_is_sync(bio->bi_opf) || in_use <= CUTOFF_WRITEBACK;
> > +	return op_is_sync(bio->bi_opf) || bio->bi_opf & (REQ_META|REQ_PRIO)
> > +		|| in_use <= CUTOFF_WRITEBACK;
> >  }
> >  
> >  static inline void bch_writeback_queue(struct cached_dev *dc)
> > 
> 

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 09/19] bcache: update bio->bi_opf bypass/writeback REQ_ flag hints
  2017-07-01 19:39           ` Eric Wheeler
@ 2017-07-02  6:51             ` Coly Li
  2017-07-03 22:51               ` [PATCH 09/19 v2] " bcache
  0 siblings, 1 reply; 120+ messages in thread
From: Coly Li @ 2017-07-02  6:51 UTC (permalink / raw)
  To: Eric Wheeler; +Cc: linux-block, linux-bcache, hch, axboe

On 2017/7/2 上午3:39, Eric Wheeler wrote:
> On Sun, 2 Jul 2017, Coly Li wrote:
> 
>> On 2017/7/1 上午4:42, bcache@lists.ewheeler.net wrote:
>>> From: Eric Wheeler <git@linux.ewheeler.net>
>>>
>>> Bypass if:     bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND)
>>>
>>> Writeback if:  op_is_sync(bio->bi_opf) || bio->bi_opf & (REQ_META|REQ_PRIO)
>>>
>>
>> Hi Eric,
>>
>> Could you please explain a bit how the above policy is designed ? I'd
>> like to understand it more.
> 
> Hi Coly,
> 
> It is pretty trivial, all of the processing code exists already.  This 
> patch only adds to the existing functions for the constraints noted in the 
> patch.
> 
> What happens is this: cached_dev_make_request() in request.c takes the IO 
> decides where the IO should go by using these two functions to decide 
> whether a bio should writeback or bypass:
> 	check_should_bypass()
> and
> 	should_writeback()
> 
> In check_should_bypass(), `if (bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND))`
> means bypass the cache because the bio has been flagged as a readahead or 
> background IO (no sense in polluting cache in either case).
> 
> Later, if the bio is a write, then cached_dev_write() checks 
> should_writeback()
> 
> In bio->bi_opf & (REQ_META|REQ_PRIO), it means writeback to cache.  Some 
> filesystems use REQ_META for metadata operations which are best to keep 
> low-latency for high transactional performance.  REQ_PRIO is a CFQ hint 
> that the priority of the IO has been raised, so writeback is faster here.  
> 
> 

Thanks for the detailed information. I come to understand :-)

For writing case, I agree that metadata should be kept in cache device.
For reading case, there is a special case for gfs2. gfs2 sets
(REQ_RAHEAD | REQ_META) both for meta data read ahead code path, all
other file systems use (REQ_META | REQ_PRIO) when doing metadata read
ahead. I don't know whether this is something that should be fixed in gfs2,
but currently maybe we should also check REQ_META,

+ /* if the read ahead request is for metadata, don't skip it */
+ if ((bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND)) &&
+     !(bio->bi_opf & REQ_META))
+ 	goto skip;

At least we can avoid a potential performance regression here.

And could you please add the above information in patch comments.

Thank you in advance.

Coly

>>
>>> Signed-off-by: Eric Wheeler <bcache@linux.ewheeler.net>
>>> ---
>>>  drivers/md/bcache/request.c   | 3 +++
>>>  drivers/md/bcache/writeback.h | 3 ++-
>>>  2 files changed, 5 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
>>> index a95609f..4629b0c 100644
>>> --- a/drivers/md/bcache/request.c
>>> +++ b/drivers/md/bcache/request.c
>>> @@ -386,6 +386,9 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
>>>  	     op_is_write(bio_op(bio))))
>>>  		goto skip;
>>>  
>>> +	if (bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND))
>>> +		goto skip;
>>> +
>>>  	/* If the ioprio already exists on the bio, use that.  We assume that
>>>  	 * the upper layer properly assigned the calling process's ioprio to
>>>  	 * the bio being passed to bcache. Otherwise, use current's ioc. */
>>> diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
>>> index cd82fe8..ea2f92e 100644
>>> --- a/drivers/md/bcache/writeback.h
>>> +++ b/drivers/md/bcache/writeback.h
>>> @@ -81,7 +81,8 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
>>>  		return true;
>>>  	}
>>>  
>>> -	return op_is_sync(bio->bi_opf) || in_use <= CUTOFF_WRITEBACK;
>>> +	return op_is_sync(bio->bi_opf) || bio->bi_opf & (REQ_META|REQ_PRIO)
>>> +		|| in_use <= CUTOFF_WRITEBACK;
>>>  }
>>>  
>>>  static inline void bch_writeback_queue(struct cached_dev *dc)
>>>

^ permalink raw reply	[flat|nested] 120+ messages in thread

* [PATCH 09/19 v2] bcache: update bio->bi_opf bypass/writeback REQ_ flag hints
  2017-07-02  6:51             ` Coly Li
@ 2017-07-03 22:51               ` bcache
  2017-07-04  4:08                 ` Coly Li
  2017-07-05 18:48                 ` Christoph Hellwig
  0 siblings, 2 replies; 120+ messages in thread
From: bcache @ 2017-07-03 22:51 UTC (permalink / raw)
  To: linux-block; +Cc: linux-bcache, hch, axboe, i, Eric Wheeler, Eric Wheeler

From: Eric Wheeler <git@linux.ewheeler.net>

Flag for bypass if the IO is for read-ahead or background, unless the
read-ahead request is for metadata (eg, from gfs2).
	Bypass if:
		bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND) && !(bio->bi_opf & REQ_META))

	Writeback if:
		op_is_sync(bio->bi_opf) || bio->bi_opf & (REQ_META|REQ_PRIO)

Signed-off-by: Eric Wheeler <bcache@linux.ewheeler.net>
---
 drivers/md/bcache/request.c   | 7 +++++++
 drivers/md/bcache/writeback.h | 3 ++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index a95609f..859d08d 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -386,6 +386,13 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
 	     op_is_write(bio_op(bio))))
 		goto skip;
 
+	/* Flag for bypass if the IO is for read-ahead or background,
+	 * unless the read-ahead request is for metadata (eg, for gfs2).
+	 */
+	if (bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND) &&
+		!(bio->bi_opf & REQ_META))
+		goto skip;
+
 	/* If the ioprio already exists on the bio, use that.  We assume that
 	 * the upper layer properly assigned the calling process's ioprio to
 	 * the bio being passed to bcache. Otherwise, use current's ioc. */
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index cd82fe8..ea2f92e 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -81,7 +81,8 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
 		return true;
 	}
 
-	return op_is_sync(bio->bi_opf) || in_use <= CUTOFF_WRITEBACK;
+	return op_is_sync(bio->bi_opf) || bio->bi_opf & (REQ_META|REQ_PRIO)
+		|| in_use <= CUTOFF_WRITEBACK;
 }
 
 static inline void bch_writeback_queue(struct cached_dev *dc)
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 120+ messages in thread

* Re: [PATCH 09/19 v2] bcache: update bio->bi_opf bypass/writeback REQ_ flag hints
  2017-07-03 22:51               ` [PATCH 09/19 v2] " bcache
@ 2017-07-04  4:08                 ` Coly Li
  2017-07-05 18:48                 ` Christoph Hellwig
  1 sibling, 0 replies; 120+ messages in thread
From: Coly Li @ 2017-07-04  4:08 UTC (permalink / raw)
  To: bcache, linux-block; +Cc: linux-bcache, hch, axboe, Eric Wheeler, Eric Wheeler

On 2017/7/4 上午6:51, bcache@lists.ewheeler.net wrote:
> From: Eric Wheeler <git@linux.ewheeler.net>
> 
> Flag for bypass if the IO is for read-ahead or background, unless the
> read-ahead request is for metadata (eg, from gfs2).
> 	Bypass if:
> 		bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND) && !(bio->bi_opf & REQ_META))
> 
> 	Writeback if:
> 		op_is_sync(bio->bi_opf) || bio->bi_opf & (REQ_META|REQ_PRIO)
> 
> Signed-off-by: Eric Wheeler <bcache@linux.ewheeler.net>

Reviewed-by: Coly Li <colyli@suse.de>

I don't have any more suggestion, thanks.

Coly

> ---
>  drivers/md/bcache/request.c   | 7 +++++++
>  drivers/md/bcache/writeback.h | 3 ++-
>  2 files changed, 9 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
> index a95609f..859d08d 100644
> --- a/drivers/md/bcache/request.c
> +++ b/drivers/md/bcache/request.c
> @@ -386,6 +386,13 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
>  	     op_is_write(bio_op(bio))))
>  		goto skip;
>  
> +	/* Flag for bypass if the IO is for read-ahead or background,
> +	 * unless the read-ahead request is for metadata (eg, for gfs2).
> +	 */
> +	if (bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND) &&
> +		!(bio->bi_opf & REQ_META))
> +		goto skip;
> +
>  	/* If the ioprio already exists on the bio, use that.  We assume that
>  	 * the upper layer properly assigned the calling process's ioprio to
>  	 * the bio being passed to bcache. Otherwise, use current's ioc. */
> diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
> index cd82fe8..ea2f92e 100644
> --- a/drivers/md/bcache/writeback.h
> +++ b/drivers/md/bcache/writeback.h
> @@ -81,7 +81,8 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
>  		return true;
>  	}
>  
> -	return op_is_sync(bio->bi_opf) || in_use <= CUTOFF_WRITEBACK;
> +	return op_is_sync(bio->bi_opf) || bio->bi_opf & (REQ_META|REQ_PRIO)
> +		|| in_use <= CUTOFF_WRITEBACK;
>  }
>  
>  static inline void bch_writeback_queue(struct cached_dev *dc)

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 06/19] bcache: explicitly destory mutex while exiting
  2017-07-01 18:43         ` Coly Li
@ 2017-07-05 11:58             ` Liang Chen
  0 siblings, 0 replies; 120+ messages in thread
From: Liang Chen @ 2017-07-05 11:58 UTC (permalink / raw)
  To: Coly Li; +Cc: bcache, linux-block, linux-bcache, hch, axboe, stable

Hi Coly,
Thanks for reviewing the patch! You raised a good point about the race. I also
think it should be addressed. Even though the time window is small, it will
think it should be addressed. Even though the time window is small, it will
still happen sooner or later.

I would like to keep this "destory mutex" patch unchanged, and send another
patch to fix the issue based on your approach. Please take a look. Thanks!

Thanks,
Liang

On Sun, Jul 2, 2017 at 2:43 AM, Coly Li <i@coly.li> wrote:
> On 2017/7/1 =E4=B8=8A=E5=8D=884:42, bcache@lists.ewheeler.net wrote:
>> From: Liang Chen <liangchen.linux@gmail.com>
>>
>> mutex_destroy does nothing most of time, but it's better to call
>> it to make the code future proof and it also has some meaning
>> for like mutex debug.
>>
>> Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
>> Reviewed-by: Eric Wheeler <bcache@linux.ewheeler.net>
>> Cc: stable@vger.kernel.org
>> ---
>>  drivers/md/bcache/super.c | 2 ++
>>  1 file changed, 2 insertions(+)
>>
>> diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
>> index 48b8c20..1f84791 100644
>> --- a/drivers/md/bcache/super.c
>> +++ b/drivers/md/bcache/super.c
>> @@ -2089,6 +2089,7 @@ static void bcache_exit(void)
>>       if (bcache_major)
>>               unregister_blkdev(bcache_major, "bcache");
>>       unregister_reboot_notifier(&reboot);
>> +     mutex_destroy(&bch_register_lock>  }
>>
>>  static int __init bcache_init(void)
>> @@ -2106,6 +2107,7 @@ static int __init bcache_init(void)
>>
>>       bcache_major =3D register_blkdev(0, "bcache");
>>       if (bcache_major < 0) {
>> +             mutex_destroy(&bch_register_lock);
>>               unregister_reboot_notifier(&reboot);
>>               return bcache_major;
>>       }
>>
>
> Hi Liang,
>
> Current code might have a potential race in a very corner case, see,
> 2084 static int __init bcache_init(void)
> 2085 {
> 2086         static const struct attribute *files[] =3D {
> 2087                 &ksysfs_register.attr,
> 2088                 &ksysfs_register_quiet.attr,
> 2089                 NULL
> 2090         };
> 2091
> 2092         mutex_init(&bch_register_lock);
> 2093         init_waitqueue_head(&unregister_wait);
> 2094         register_reboot_notifier(&reboot);
> 2095         closure_debug_init();
> 2096
> 2097         bcache_major =3D register_blkdev(0, "bcache");
> 2098         if (bcache_major < 0) {
> 2099                 unregister_reboot_notifier(&reboot);
> 2100                 return bcache_major;
> 2101         }
> 2102
> 2103         if (!(bcache_wq =3D alloc_workqueue("bcache", WQ_MEM_RECLAIM=
,
> 0)) ||
> 2104             !(bcache_kobj =3D kobject_create_and_add("bcache",
> fs_kobj)) ||
> 2105             sysfs_create_files(bcache_kobj, files) ||
> 2106             bch_request_init() ||
> 2107             bch_debug_init(bcache_kobj))
> 2108                 goto err;
> 2109
> 2110         return 0;
> 2111 err:
> 2112         bcache_exit();
> 2113         return -ENOMEM;
> 2114 }
>
> At line 2107, most of bache stuffs are ready to work, only a debugfs
> entry not created yet. If in the time gap between line 2106 and line
> 2107, another user space tool just registers cache and backing devices.
> Then bch_debug_init() failed, and bcache_exit() gets called. In this
> case, I doubt bcache_exit() can handle all the references correctly.
>
> The race is very rare, and almost won't happen in real life. So, if you
> don't care about it, the patch can be simpler like this,
> diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
> index e57353e39168..fb5453a46a03 100644
> --- a/drivers/md/bcache/super.c
> +++ b/drivers/md/bcache/super.c
> @@ -2070,6 +2070,7 @@ static struct notifier_block reboot =3D {
>
>  static void bcache_exit(void)
>  {
> +       mutex_destroy(&bch_register_lock);
>         bch_debug_exit();
>         bch_request_exit();
>         if (bcache_kobj)
> @@ -2089,7 +2090,6 @@ static int __init bcache_init(void)
>                 NULL
>         };
>
> -       mutex_init(&bch_register_lock);
>         init_waitqueue_head(&unregister_wait);
>         register_reboot_notifier(&reboot);
>         closure_debug_init();
> @@ -2107,6 +2107,7 @@ static int __init bcache_init(void)
>             bch_debug_init(bcache_kobj))
>                 goto err;
>
> +       mutex_init(&bch_register_lock);
>         return 0;
>  err:
>         bcache_exit();
> ---
> And if you do care about the race, maybe you should do something like this,
> diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
> index e57353e39168..ca1d8b7a7815 100644
> --- a/drivers/md/bcache/super.c
> +++ b/drivers/md/bcache/super.c
> @@ -2079,6 +2079,7 @@ static void bcache_exit(void)
>         if (bcache_major)
>                 unregister_blkdev(bcache_major, "bcache");
>         unregister_reboot_notifier(&reboot);
> +       mutex_unlock(&bch_register_lock);
>  }
>
>  static int __init bcache_init(void)
> @@ -2090,6 +2091,7 @@ static int __init bcache_init(void)
>         };
>
>         mutex_init(&bch_register_lock);
> +       mutex_lock(&bch_register_lock);
>         init_waitqueue_head(&unregister_wait);
>         register_reboot_notifier(&reboot);
>         closure_debug_init();
> @@ -2097,6 +2099,8 @@ static int __init bcache_init(void)
>         bcache_major =3D register_blkdev(0, "bcache");
>         if (bcache_major < 0) {
>                 unregister_reboot_notifier(&reboot);
> +               mutex_unlock(&bch_register_lock);
> +               mutex_destroy(&bch_register_lock);
>                 return bcache_major;
>         }
>
> @@ -2104,9 +2108,12 @@ static int __init bcache_init(void)
>             !(bcache_kobj =3D kobject_create_and_add("bcache", fs_kobj)) =
||
>             sysfs_create_files(bcache_kobj, files) ||
>             bch_request_init() ||
> -           bch_debug_init(bcache_kobj))
> +           bch_debug_init(bcache_kobj)) {
> +               mutex_unlock(&bch_register_lock);
>                 goto err;
> +       }
>
> +       mutex_unlock(&bch_register_lock);
>         return 0;
>  err:
>         bcache_exit();
> ---
>
> Personally I think the first approach with only one new line code added,
> your original version will add two new lines of code.
>
> Just FYI. Thanks.
>
> --
> Coly Li

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 06/19] bcache: explicitly destory mutex while exiting
@ 2017-07-05 11:58             ` Liang Chen
  0 siblings, 0 replies; 120+ messages in thread
From: Liang Chen @ 2017-07-05 11:58 UTC (permalink / raw)
  To: Coly Li; +Cc: bcache, linux-block, linux-bcache, hch, axboe, stable

Hi Coly,
Thanks for reviewing the patch! You raised a good point about the race. I also
think it should be addressed. Even though the time window is small, it will
still happen sooner or later.

I would like to keep this "destory mutex" patch unchanged, and send another
patch to fix the issue based on your approach. Please take a look. Thanks!

Thanks,
Liang

On Sun, Jul 2, 2017 at 2:43 AM, Coly Li <i@coly.li> wrote:
> On 2017/7/1 上午4:42, bcache@lists.ewheeler.net wrote:
>> From: Liang Chen <liangchen.linux@gmail.com>
>>
>> mutex_destroy does nothing most of time, but it's better to call
>> it to make the code future proof and it also has some meaning
>> for like mutex debug.
>>
>> Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
>> Reviewed-by: Eric Wheeler <bcache@linux.ewheeler.net>
>> Cc: stable@vger.kernel.org
>> ---
>>  drivers/md/bcache/super.c | 2 ++
>>  1 file changed, 2 insertions(+)
>>
>> diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
>> index 48b8c20..1f84791 100644
>> --- a/drivers/md/bcache/super.c
>> +++ b/drivers/md/bcache/super.c
>> @@ -2089,6 +2089,7 @@ static void bcache_exit(void)
>>       if (bcache_major)
>>               unregister_blkdev(bcache_major, "bcache");
>>       unregister_reboot_notifier(&reboot);
>> +     mutex_destroy(&bch_register_lock);
>>  }
>>
>>  static int __init bcache_init(void)
>> @@ -2106,6 +2107,7 @@ static int __init bcache_init(void)
>>
>>       bcache_major = register_blkdev(0, "bcache");
>>       if (bcache_major < 0) {
>> +             mutex_destroy(&bch_register_lock);
>>               unregister_reboot_notifier(&reboot);
>>               return bcache_major;
>>       }
>>
>
> Hi Liang,
>
> Current code might have a potential race in a very corner case, see,
> 2084 static int __init bcache_init(void)
> 2085 {
> 2086         static const struct attribute *files[] = {
> 2087                 &ksysfs_register.attr,
> 2088                 &ksysfs_register_quiet.attr,
> 2089                 NULL
> 2090         };
> 2091
> 2092         mutex_init(&bch_register_lock);
> 2093         init_waitqueue_head(&unregister_wait);
> 2094         register_reboot_notifier(&reboot);
> 2095         closure_debug_init();
> 2096
> 2097         bcache_major = register_blkdev(0, "bcache");
> 2098         if (bcache_major < 0) {
> 2099                 unregister_reboot_notifier(&reboot);
> 2100                 return bcache_major;
> 2101         }
> 2102
> 2103         if (!(bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM,
> 0)) ||
> 2104             !(bcache_kobj = kobject_create_and_add("bcache",
> fs_kobj)) ||
> 2105             sysfs_create_files(bcache_kobj, files) ||
> 2106             bch_request_init() ||
> 2107             bch_debug_init(bcache_kobj))
> 2108                 goto err;
> 2109
> 2110         return 0;
> 2111 err:
> 2112         bcache_exit();
> 2113         return -ENOMEM;
> 2114 }
>
> At line 2107, most of bache stuffs are ready to work, only a debugfs
> entry not created yet. If in the time gap between line 2106 and line
> 2107, another user space tool just registers cache and backing devices.
> Then bch_debug_init() failed, and bcache_exit() gets called. In this
> case, I doubt bcache_exit() can handle all the references correctly.
>
> The race is very rare, and almost won't happen in real life. So, if you
> don't care about it, the patch can be simpler like this,
> diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
> index e57353e39168..fb5453a46a03 100644
> --- a/drivers/md/bcache/super.c
> +++ b/drivers/md/bcache/super.c
> @@ -2070,6 +2070,7 @@ static struct notifier_block reboot = {
>
>  static void bcache_exit(void)
>  {
> +       mutex_destroy(&bch_register_lock);
>         bch_debug_exit();
>         bch_request_exit();
>         if (bcache_kobj)
> @@ -2089,7 +2090,6 @@ static int __init bcache_init(void)
>                 NULL
>         };
>
> -       mutex_init(&bch_register_lock);
>         init_waitqueue_head(&unregister_wait);
>         register_reboot_notifier(&reboot);
>         closure_debug_init();
> @@ -2107,6 +2107,7 @@ static int __init bcache_init(void)
>             bch_debug_init(bcache_kobj))
>                 goto err;
>
> +       mutex_init(&bch_register_lock);
>         return 0;
>  err:
>         bcache_exit();
> ---
> And if you do care about the race, maybe you should do something like this,
> diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
> index e57353e39168..ca1d8b7a7815 100644
> --- a/drivers/md/bcache/super.c
> +++ b/drivers/md/bcache/super.c
> @@ -2079,6 +2079,7 @@ static void bcache_exit(void)
>         if (bcache_major)
>                 unregister_blkdev(bcache_major, "bcache");
>         unregister_reboot_notifier(&reboot);
> +       mutex_unlock(&bch_register_lock);
>  }
>
>  static int __init bcache_init(void)
> @@ -2090,6 +2091,7 @@ static int __init bcache_init(void)
>         };
>
>         mutex_init(&bch_register_lock);
> +       mutex_lock(&bch_register_lock);
>         init_waitqueue_head(&unregister_wait);
>         register_reboot_notifier(&reboot);
>         closure_debug_init();
> @@ -2097,6 +2099,8 @@ static int __init bcache_init(void)
>         bcache_major = register_blkdev(0, "bcache");
>         if (bcache_major < 0) {
>                 unregister_reboot_notifier(&reboot);
> +               mutex_unlock(&bch_register_lock);
> +               mutex_destroy(&bch_register_lock);
>                 return bcache_major;
>         }
>
> @@ -2104,9 +2108,12 @@ static int __init bcache_init(void)
>             !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
>             sysfs_create_files(bcache_kobj, files) ||
>             bch_request_init() ||
> -           bch_debug_init(bcache_kobj))
> +           bch_debug_init(bcache_kobj)) {
> +               mutex_unlock(&bch_register_lock);
>                 goto err;
> +       }
>
> +       mutex_unlock(&bch_register_lock);
>         return 0;
>  err:
>         bcache_exit();
> ---
>
> Personally I think the first approach with only one new line code added,
> your original version will add two new lines of code.
>
> Just FYI. Thanks.
>
> --
> Coly Li

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 01/19] bcache: Fix leak of bdev reference
  2017-06-30 20:42     ` [PATCH 01/19] bcache: Fix leak of bdev reference bcache
                         ` (18 preceding siblings ...)
  2017-07-01 16:55       ` [PATCH 01/19] bcache: Fix leak of bdev reference Coly Li
@ 2017-07-05 18:24       ` Christoph Hellwig
  2017-09-04 17:30         ` Coly Li
  19 siblings, 1 reply; 120+ messages in thread
From: Christoph Hellwig @ 2017-07-05 18:24 UTC (permalink / raw)
  To: bcache; +Cc: linux-block, linux-bcache, hch, axboe, Jan Kara, stable

On Fri, Jun 30, 2017 at 01:42:50PM -0700, bcache@lists.ewheeler.net wrote:
> From: Jan Kara <jack@suse.cz>
> 
> If blkdev_get_by_path() in register_bcache() fails, we try to lookup the
> block device using lookup_bdev() to detect which situation we are in to
> properly report error. However we never drop the reference returned to
> us from lookup_bdev(). Fix that.

This look ok, but I think that whole chunk of code should just go
away - adding a lookup_bdev and resulting mess just for a slightly
different error message is just insane.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 02/19] bcache: fix sequential large write IO bypass
  2017-06-30 20:42       ` [PATCH 02/19] bcache: fix sequential large write IO bypass bcache
@ 2017-07-05 18:25         ` Christoph Hellwig
  0 siblings, 0 replies; 120+ messages in thread
From: Christoph Hellwig @ 2017-07-05 18:25 UTC (permalink / raw)
  To: bcache; +Cc: linux-block, linux-bcache, hch, axboe, Tang Junhui, stable

Looks good,

Reviewed-by: Christoph Hellwig <hch@lst.de>

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 03/19] bcache: do not subtract sectors_to_gc for bypassed IO
  2017-06-30 20:42       ` [PATCH 03/19] bcache: do not subtract sectors_to_gc for bypassed IO bcache
  2017-07-01 17:26         ` Coly Li
@ 2017-07-05 18:25         ` Christoph Hellwig
  1 sibling, 0 replies; 120+ messages in thread
From: Christoph Hellwig @ 2017-07-05 18:25 UTC (permalink / raw)
  To: bcache; +Cc: linux-block, linux-bcache, hch, axboe, Tang Junhui, stable

Looks good,

Reviewed-by: Christoph Hellwig <hch@lst.de>

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 05/19] bcache: fix calling ida_simple_remove() with incorrect minor
  2017-06-30 20:42       ` [PATCH 05/19] bcache: fix calling ida_simple_remove() with incorrect minor bcache
@ 2017-07-05 18:26         ` Christoph Hellwig
  2017-07-06  6:21           ` tang.junhui
  0 siblings, 1 reply; 120+ messages in thread
From: Christoph Hellwig @ 2017-07-05 18:26 UTC (permalink / raw)
  To: bcache
  Cc: linux-block, linux-bcache, hch, axboe, Tang Junhui, stable, Stefan Bader

> +#define BCACHE_TO_IDA_MINORS(first_minor) ((first_minor) >> BCACHE_MINORS_BITS)
> +#define IDA_TO_BCACHE_MINORS(minor)       ((minor) << BCACHE_MINORS_BITS)

Please use inline functions and lower case for these.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 06/19] bcache: explicitly destory mutex while exiting
  2017-06-30 20:42       ` [PATCH 06/19] bcache: explicitly destory mutex while exiting bcache
  2017-07-01 18:43         ` Coly Li
@ 2017-07-05 18:27         ` Christoph Hellwig
  2017-07-06  1:56           ` Liang Chen
  1 sibling, 1 reply; 120+ messages in thread
From: Christoph Hellwig @ 2017-07-05 18:27 UTC (permalink / raw)
  To: bcache; +Cc: linux-block, linux-bcache, hch, axboe, Liang Chen, stable

On Fri, Jun 30, 2017 at 01:42:55PM -0700, bcache@lists.ewheeler.net wrote:
> From: Liang Chen <liangchen.linux@gmail.com>
> 
> mutex_destroy does nothing most of time, but it's better to call
> it to make the code future proof and it also has some meaning
> for like mutex debug.

It shouldn't really - we should get the destroy behavior for free
when doing a slab free of the area.

What issue are you trying to solve?

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 08/19] bcache: documentation for sysfs entries describing bcache cache hinting
  2017-06-30 20:42       ` [PATCH 08/19] bcache: documentation for sysfs entries describing bcache cache hinting bcache
@ 2017-07-05 18:27         ` Christoph Hellwig
  0 siblings, 0 replies; 120+ messages in thread
From: Christoph Hellwig @ 2017-07-05 18:27 UTC (permalink / raw)
  To: bcache; +Cc: linux-block, linux-bcache, hch, axboe, Eric Wheeler, Eric Wheeler

should go into the previous patch.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 07/19] bcache: introduce bcache sysfs entries for ioprio-based bypass/writeback hints
  2017-06-30 20:42       ` [PATCH 07/19] bcache: introduce bcache sysfs entries for ioprio-based bypass/writeback hints bcache
@ 2017-07-05 18:47         ` Christoph Hellwig
  2017-07-05 21:49           ` Eric Wheeler
  0 siblings, 1 reply; 120+ messages in thread
From: Christoph Hellwig @ 2017-07-05 18:47 UTC (permalink / raw)
  To: bcache
  Cc: linux-block, linux-bcache, hch, axboe, Eric Wheeler, Eric Wheeler, nix

On Fri, Jun 30, 2017 at 01:42:56PM -0700, bcache@lists.ewheeler.net wrote:
> From: Eric Wheeler <git@linux.ewheeler.net>
> 
> Add sysfs entries to support to hint for bypass/writeback by the ioprio
> assigned to the bio.  If the bio is unassigned, use current's io-context
> ioprio for cache writeback or bypass (configured per-process with
> `ionice`).
> 
> Having idle IOs bypass the cache can increase performance elsewhere
> since you probably don't care about their performance.  In addition,
> this prevents idle IOs from promoting into (polluting) your cache and
> evicting blocks that are more important elsewhere.
> 
> If you really nead the performance at the expense of SSD wearout,
> then configure ioprio_writeback and set your `ionice` appropriately.
> 
> For example:
> 	echo 2,7 > /sys/block/bcache0/bcache/ioprio_bypass
> 	echo 2,0 > /sys/block/bcache0/bcache/ioprio_writeback
> 
> See the documentation commit for details.

I'm really worried about this interface, as it basically uses the
ioprio field for side channel communication - your app must know
which value it wants, and you need to configure bcache to fit
exacltly that scheme.

> +	/* If the ioprio already exists on the bio, use that.  We assume that
> +	 * the upper layer properly assigned the calling process's ioprio to
> +	 * the bio being passed to bcache. Otherwise, use current's ioc. */

Please make this fit the normal kernel comment style.

> +	ioprio = bio_prio(bio);
> +	if (!ioprio_valid(ioprio)) {
> +		ioc = get_task_io_context(current, GFP_NOIO, NUMA_NO_NODE);
> +		if (ioc) {
> +			if (ioprio_valid(ioc->ioprio))
> +				ioprio = ioc->ioprio;
> +			put_io_context(ioc);
> +			ioc = NULL;
> +		}
> +	}

While get_task_io_context currently is exported it really should not
be - we should only allocate on when setting the io priority or when
forking.

What this code really wants is the ioprio related lines of code from
blk_init_request_from_bio, which should be factored into a new helper.

> +	if (ioprio_valid(ioprio) && ioprio_valid(dc->ioprio_writeback)
> +		&& ioprio >= dc->ioprio_bypass) {
> +		return true;
> +	}

Incorrect indentation, this shold be:

	if (ioprio_valid(ioprio) && ioprio_valid(dc->ioprio_writeback) &&
	    ioprio >= dc->ioprio_bypass)
		return true;

And there is some more of this in this and the following patches.
Please run them through something like checkpatch.pl

> +
>  SHOW(__bch_cached_dev)
>  {
>  	struct cached_dev *dc = container_of(kobj, struct cached_dev,
> @@ -183,6 +186,17 @@ SHOW(__bch_cached_dev)
>  		return strlen(buf);
>  	}
>  
> +	if (attr == &sysfs_ioprio_bypass)
> +		return snprintf(buf, PAGE_SIZE-1, "%d,%ld\n",
> +			IOPRIO_PRIO_CLASS(dc->ioprio_bypass),
> +			IOPRIO_PRIO_DATA(dc->ioprio_bypass));
> +
> +	if (attr == &sysfs_ioprio_writeback)
> +		return snprintf(buf, PAGE_SIZE-1, "%d,%ld\n",
> +			IOPRIO_PRIO_CLASS(dc->ioprio_writeback),
> +			IOPRIO_PRIO_DATA(dc->ioprio_writeback));
> +
> +

Please implement separate sysfs show and store function for your new
attributes instead of overloading all of them into a giant mess.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 09/19 v2] bcache: update bio->bi_opf bypass/writeback REQ_ flag hints
  2017-07-03 22:51               ` [PATCH 09/19 v2] " bcache
  2017-07-04  4:08                 ` Coly Li
@ 2017-07-05 18:48                 ` Christoph Hellwig
  2017-07-06  7:35                   ` Coly Li
  1 sibling, 1 reply; 120+ messages in thread
From: Christoph Hellwig @ 2017-07-05 18:48 UTC (permalink / raw)
  To: bcache
  Cc: linux-block, linux-bcache, hch, axboe, i, Eric Wheeler, Eric Wheeler

REQ_META should be purely a hint for blktrace, please don't use it
in the I/O path - that's what REQ_PRIO is for.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 19/19] bcache: Update continue_at() documentation
  2017-06-30 20:43       ` [PATCH 19/19] bcache: Update continue_at() documentation bcache
@ 2017-07-05 18:48         ` Christoph Hellwig
  2017-07-08 18:12         ` Coly Li
  1 sibling, 0 replies; 120+ messages in thread
From: Christoph Hellwig @ 2017-07-05 18:48 UTC (permalink / raw)
  To: bcache; +Cc: linux-block, linux-bcache, hch, axboe, Dan Carpenter

Looks good,

Reviewed-by: Christoph Hellwig <hch@lst.de>

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 07/19] bcache: introduce bcache sysfs entries for ioprio-based bypass/writeback hints
  2017-07-05 18:47         ` Christoph Hellwig
@ 2017-07-05 21:49           ` Eric Wheeler
  0 siblings, 0 replies; 120+ messages in thread
From: Eric Wheeler @ 2017-07-05 21:49 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-block, linux-bcache, axboe, nix

On Wed, 5 Jul 2017, Christoph Hellwig wrote:
> On Fri, Jun 30, 2017 at 01:42:56PM -0700, bcache@lists.ewheeler.net wrote:
> > From: Eric Wheeler <git@linux.ewheeler.net>
> > 
> > Add sysfs entries to support to hint for bypass/writeback by the ioprio
> > assigned to the bio.  If the bio is unassigned, use current's io-context
> > ioprio for cache writeback or bypass (configured per-process with
> > `ionice`).
> > 
> > Having idle IOs bypass the cache can increase performance elsewhere
> > since you probably don't care about their performance.  In addition,
> > this prevents idle IOs from promoting into (polluting) your cache and
> > evicting blocks that are more important elsewhere.
> > 
> > If you really nead the performance at the expense of SSD wearout,
> > then configure ioprio_writeback and set your `ionice` appropriately.
> > 
> > For example:
> > 	echo 2,7 > /sys/block/bcache0/bcache/ioprio_bypass
> > 	echo 2,0 > /sys/block/bcache0/bcache/ioprio_writeback
> > 
> > See the documentation commit for details.
> 
> I'm really worried about this interface, as it basically uses the
> ioprio field for side channel communication - your app must know
> which value it wants, and you need to configure bcache to fit
> exacltly that scheme.
>
> 
> > +	/* If the ioprio already exists on the bio, use that.  We assume that
> > +	 * the upper layer properly assigned the calling process's ioprio to
> > +	 * the bio being passed to bcache. Otherwise, use current's ioc. */
> 
> Please make this fit the normal kernel comment style.

ok
 
> > +	ioprio = bio_prio(bio);
> > +	if (!ioprio_valid(ioprio)) {
> > +		ioc = get_task_io_context(current, GFP_NOIO, NUMA_NO_NODE);
> > +		if (ioc) {
> > +			if (ioprio_valid(ioc->ioprio))
> > +				ioprio = ioc->ioprio;
> > +			put_io_context(ioc);
> > +			ioc = NULL;
> > +		}
> > +	}
> 
> While get_task_io_context currently is exported it really should not
> be - we should only allocate on when setting the io priority or when
> forking.
> 
> What this code really wants is the ioprio related lines of code from
> blk_init_request_from_bio, which should be factored into a new helper.
> 
> > +	if (ioprio_valid(ioprio) && ioprio_valid(dc->ioprio_writeback)
> > +		&& ioprio >= dc->ioprio_bypass) {
> > +		return true;
> > +	}
> 
> Incorrect indentation, this shold be:
> 
> 	if (ioprio_valid(ioprio) && ioprio_valid(dc->ioprio_writeback) &&
> 	    ioprio >= dc->ioprio_bypass)
> 		return true;
> 
> And there is some more of this in this and the following patches.
> Please run them through something like checkpatch.pl

Good idea, will do.

> 
> > +
> >  SHOW(__bch_cached_dev)
> >  {
> >  	struct cached_dev *dc = container_of(kobj, struct cached_dev,
> > @@ -183,6 +186,17 @@ SHOW(__bch_cached_dev)
> >  		return strlen(buf);
> >  	}
> >  
> > +	if (attr == &sysfs_ioprio_bypass)
> > +		return snprintf(buf, PAGE_SIZE-1, "%d,%ld\n",
> > +			IOPRIO_PRIO_CLASS(dc->ioprio_bypass),
> > +			IOPRIO_PRIO_DATA(dc->ioprio_bypass));
> > +
> > +	if (attr == &sysfs_ioprio_writeback)
> > +		return snprintf(buf, PAGE_SIZE-1, "%d,%ld\n",
> > +			IOPRIO_PRIO_CLASS(dc->ioprio_writeback),
> > +			IOPRIO_PRIO_DATA(dc->ioprio_writeback));
> > +
> > +
> 
> Please implement separate sysfs show and store function for your new
> attributes instead of overloading all of them into a giant mess.

ok.

>>

Christoph, thank you for your commentary and quick turn around on all of 
these patches!

--
Eric Wheeler




> --
> To unsubscribe from this list: send the line "unsubscribe linux-bcache" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 06/19] bcache: explicitly destory mutex while exiting
  2017-07-05 18:27         ` Christoph Hellwig
@ 2017-07-06  1:56           ` Liang Chen
  0 siblings, 0 replies; 120+ messages in thread
From: Liang Chen @ 2017-07-06  1:56 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: bcache, linux-block, linux-bcache, axboe, stable

mutex_destroy does nothing normally (may not be true in the future), but
when debug mutex is turned on it helps with debugging - mutex_destroy
in mutex-debug.c.
It's not about freeing of the memory. It's more about consistency of the
use of mutex and making the code future proof.

Thanks,
Liang

On Thu, Jul 6, 2017 at 2:27 AM, Christoph Hellwig <hch@infradead.org> wrote:
> On Fri, Jun 30, 2017 at 01:42:55PM -0700, bcache@lists.ewheeler.net wrote:
>> From: Liang Chen <liangchen.linux@gmail.com>
>>
>> mutex_destroy does nothing most of time, but it's better to call
>> it to make the code future proof and it also has some meaning
>> for like mutex debug.
>
> It shouldn't really - we should get the destroy behavior for free
> when doing a slab free of the area.
>
> What issue are you trying to solve?

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: Re: [PATCH 05/19] bcache: fix calling ida_simple_remove() with incorrect minor
  2017-07-05 18:26         ` Christoph Hellwig
@ 2017-07-06  6:21           ` tang.junhui
  0 siblings, 0 replies; 120+ messages in thread
From: tang.junhui @ 2017-07-06  6:21 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: axboe, bcache, linux-bcache, linux-block, stable, Stefan Bader

[-- Attachment #1: Type: text/plain, Size: 318 bytes --]

Hello Christoph

> > +#define BCACHE_TO_IDA_MINORS(first_minor) ((first_minor) >> 
BCACHE_MINORS_BITS)
> > +#define IDA_TO_BCACHE_MINORS(minor)       ((minor) << 
BCACHE_MINORS_BITS)
> 
> Please use inline functions and lower case for these.

I have send you a v2 patch, please have a review again.

Thanks
Tang Junhui

[-- Attachment #2: Type: text/html, Size: 582 bytes --]

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 09/19 v2] bcache: update bio->bi_opf bypass/writeback REQ_ flag hints
  2017-07-05 18:48                 ` Christoph Hellwig
@ 2017-07-06  7:35                   ` Coly Li
  2017-07-06 15:24                     ` Christoph Hellwig
  0 siblings, 1 reply; 120+ messages in thread
From: Coly Li @ 2017-07-06  7:35 UTC (permalink / raw)
  To: Christoph Hellwig, bcache
  Cc: linux-block, linux-bcache, axboe, Eric Wheeler, Eric Wheeler

On 2017/7/6 上午2:48, Christoph Hellwig wrote:
> REQ_META should be purely a hint for blktrace, please don't use it
> in the I/O path - that's what REQ_PRIO is for.
> --
> To unsubscribe from this list: send the line "unsubscribe linux-bcache" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

Hi Christoph,

Then gfs2 breaks the above rule ?  in gfs2_metapath_ra() and
gfs2_dir_readahead(), only REQ_META is used in submit_bh(). It seems an
extra REQ_PRIO should be there.

I see 'commit 70246286e94c3 ("block: get rid of bio_rw and READA")' is
from you, could you please provide your point on adding REQ_PRIO on the
above 2 locations ?

IMHO another patch to add REQ_PRIO to gfs2 meta data read ahead code
path should be necessary.

Thanks.


-- 
Coly Li

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 09/19 v2] bcache: update bio->bi_opf bypass/writeback REQ_ flag hints
  2017-07-06  7:35                   ` Coly Li
@ 2017-07-06 15:24                     ` Christoph Hellwig
  2017-07-11  3:48                       ` Coly Li
  0 siblings, 1 reply; 120+ messages in thread
From: Christoph Hellwig @ 2017-07-06 15:24 UTC (permalink / raw)
  To: Coly Li
  Cc: Christoph Hellwig, bcache, linux-block, linux-bcache, axboe,
	Eric Wheeler, Eric Wheeler

On Thu, Jul 06, 2017 at 03:35:48PM +0800, Coly Li wrote:
> Then gfs2 breaks the above rule ?  in gfs2_metapath_ra() and
> gfs2_dir_readahead(), only REQ_META is used in submit_bh(). It seems an
> extra REQ_PRIO should be there.

Or maybe not.  E.g. XFS absolutely avoids using REQ_PRIO for any of
the buffer writeback as it does not have priority.  You'll need to
ask gfs2 folks if they want this I/O to have priority or not.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 19/19] bcache: Update continue_at() documentation
  2017-06-30 20:43       ` [PATCH 19/19] bcache: Update continue_at() documentation bcache
  2017-07-05 18:48         ` Christoph Hellwig
@ 2017-07-08 18:12         ` Coly Li
  1 sibling, 0 replies; 120+ messages in thread
From: Coly Li @ 2017-07-08 18:12 UTC (permalink / raw)
  To: bcache, linux-block; +Cc: linux-bcache, hch, axboe, Dan Carpenter

On 2017/7/1 上午4:43, bcache@lists.ewheeler.net wrote:
> From: Dan Carpenter <dan.carpenter@oracle.com>
> 
> continue_at() doesn't have a return statement anymore.
> 
> Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>

Acked-by: Coly Li <colyli@suse.de>

Thanks.

Coly

> ---
>  drivers/md/bcache/closure.h | 4 ----
>  1 file changed, 4 deletions(-)
> 
> diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
> index 1ec84ca..295b7e4 100644
> --- a/drivers/md/bcache/closure.h
> +++ b/drivers/md/bcache/closure.h
> @@ -312,8 +312,6 @@ static inline void closure_wake_up(struct closure_waitlist *list)
>   * been dropped with closure_put()), it will resume execution at @fn running out
>   * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly).
>   *
> - * NOTE: This macro expands to a return in the calling function!
> - *
>   * This is because after calling continue_at() you no longer have a ref on @cl,
>   * and whatever @cl owns may be freed out from under you - a running closure fn
>   * has a ref on its own closure which continue_at() drops.
> @@ -340,8 +338,6 @@ do {									\
>   * Causes @fn to be executed out of @cl, in @wq context (or called directly if
>   * @wq is NULL).
>   *
> - * NOTE: like continue_at(), this macro expands to a return in the caller!
> - *
>   * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn,
>   * thus it's not safe to touch anything protected by @cl after a
>   * continue_at_nobarrier().
> 

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 11/19] bcache: Subtract dirty sectors of thin flash from cache_sectors in calculating writeback rate
  2017-06-30 20:43       ` [PATCH 11/19] bcache: Subtract dirty sectors of thin flash from cache_sectors in calculating writeback rate bcache
@ 2017-07-10 18:11         ` Coly Li
       [not found]           ` <OF92BDA950.86AA00FA-ON4825815A.001F33D9-4825815A.001F5C89@zte.com.cn>
  0 siblings, 1 reply; 120+ messages in thread
From: Coly Li @ 2017-07-10 18:11 UTC (permalink / raw)
  To: Tang Junhui; +Cc: bcache, linux-block, linux-bcache, hch, axboe, stable

On 2017/7/1 上午4:43, bcache@lists.ewheeler.net wrote:
> From: Tang Junhui <tang.junhui@zte.com.cn>
> 
> Since dirty sectors of thin flash cannot be used to cache data for backend
> device, so we should subtract it in calculating writeback rate.
> 

I see you want to get ride of the noise of flash only cache device for
writeback rate calculation. It makes sense, because flash only cache
device won't have write back happen at all.


> Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
> Cc: stable@vger.kernel.org
> ---
>  drivers/md/bcache/writeback.c |  2 +-
>  drivers/md/bcache/writeback.h | 19 +++++++++++++++++++
>  2 files changed, 20 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
> index 4ac8b13..25289e4 100644
> --- a/drivers/md/bcache/writeback.c
> +++ b/drivers/md/bcache/writeback.c
> @@ -21,7 +21,7 @@
>  static void __update_writeback_rate(struct cached_dev *dc)
>  {
>  	struct cache_set *c = dc->disk.c;
> -	uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size;
> +	uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size - bcache_flash_devs_sectors_dirty(c);

See flash_dev_run(), the flash volume is created per struct
bcache_device of a cache set. That means, all data allocated for the
flash volume will be from a flash only bcache device. Regular dirty data
won't mixed allocating with flash volume dirty data on identical struct
bcache device.

Based on the above implementation, non-dirty space from flash only
bcache device will mislead writeback rate calculation too. So I suggest
to subtract all buckets size from all flash only bcache devices. Then it
might be something like,

uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size -
			bcache_flash_devs_nbuckets(c);



Just FYI. Thanks.

Coly

>  	uint64_t cache_dirty_target =
>  		div_u64(cache_sectors * dc->writeback_percent, 100);
>  
> diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
> index c2ab4b4..24ff589 100644
> --- a/drivers/md/bcache/writeback.h
> +++ b/drivers/md/bcache/writeback.h
> @@ -14,6 +14,25 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
>  	return ret;
>  }
>  
> +static inline uint64_t  bcache_flash_devs_sectors_dirty(struct cache_set *c)
> +{
> +	uint64_t i, ret = 0;
> +
> +	mutex_lock(&bch_register_lock);
> +
> +	for (i = 0; i < c->nr_uuids; i++) {
> +		struct bcache_device *d = c->devices[i];
> +
> +		if (!d || !UUID_FLASH_ONLY(&c->uuids[i]))
> +			continue;
> +	   ret += bcache_dev_sectors_dirty(d);
> +	}
> +
> +	mutex_unlock(&bch_register_lock);
> +
> +	return ret;
> +}
> +
>  static inline unsigned offset_to_stripe(struct bcache_device *d,
>  					uint64_t offset)
>  {
> 

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 09/19 v2] bcache: update bio->bi_opf bypass/writeback REQ_ flag hints
  2017-07-06 15:24                     ` Christoph Hellwig
@ 2017-07-11  3:48                       ` Coly Li
  2017-07-12  9:18                         ` Coly Li
  0 siblings, 1 reply; 120+ messages in thread
From: Coly Li @ 2017-07-11  3:48 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: bcache, linux-block, linux-bcache, axboe, Eric Wheeler, Eric Wheeler

On 2017/7/6 下午11:24, Christoph Hellwig wrote:
> On Thu, Jul 06, 2017 at 03:35:48PM +0800, Coly Li wrote:
>> Then gfs2 breaks the above rule ?  in gfs2_metapath_ra() and
>> gfs2_dir_readahead(), only REQ_META is used in submit_bh(). It seems an
>> extra REQ_PRIO should be there.
> 
> Or maybe not.  E.g. XFS absolutely avoids using REQ_PRIO for any of
> the buffer writeback as it does not have priority.  You'll need to
> ask gfs2 folks if they want this I/O to have priority or not.
> 

I see. Just sent a RFC patch to gfs2 development list, and wait for
response.

Thanks for the hint.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 12/19] bcache: update bucket_in_use periodically
  2017-06-30 20:43       ` [PATCH 12/19] bcache: update bucket_in_use periodically bcache
@ 2017-07-11  5:05         ` Coly Li
       [not found]           ` <OF5C19A8FA.5FF48E0C-ON4825815A.001E6DB1-4825815A.001F14F2@zte.com.cn>
  0 siblings, 1 reply; 120+ messages in thread
From: Coly Li @ 2017-07-11  5:05 UTC (permalink / raw)
  To: linux-block, Tang Junhui; +Cc: bcache, linux-bcache, hch, axboe

On 2017/7/1 上午4:43, bcache@lists.ewheeler.net wrote:
> From: Tang Junhui <tang.junhui@zte.com.cn>
> 
> bucket_in_use is updated in gc thread which triggered by invalidating or
> writing sectors_to_gc dirty data, It's been too long, Therefore, when we
> use it to compare with the threshold, it is often not timely, which leads
> to inaccurate judgment and often results in bucket depletion.
> 
> Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
> ---
>  drivers/md/bcache/btree.c | 29 +++++++++++++++++++++++++++--
>  1 file changed, 27 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
> index 866dcf7..77aa20b 100644
> --- a/drivers/md/bcache/btree.c
> +++ b/drivers/md/bcache/btree.c
> @@ -90,6 +90,8 @@
>  #define MAX_NEED_GC		64
>  #define MAX_SAVE_PRIO		72
>  
> +#define GC_THREAD_TIMEOUT_MS	(30 * 1000)
> +
>  #define PTR_DIRTY_BIT		(((uint64_t) 1 << 36))
>  
>  #define PTR_HASH(c, k)							\
> @@ -1760,6 +1762,23 @@ static void bch_btree_gc(struct cache_set *c)
>  	bch_moving_gc(c);
>  }
>  
> +void bch_update_bucket_in_use(struct cache_set *c)
> +{
> +	struct cache *ca;
> +	struct bucket *b;
> +	unsigned i;
> +	size_t available = 0;
> +
> +	for_each_cache(ca, c, i) {
> +		for_each_bucket(b, ca) {
> +			if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE)
> +				available++;
> +		}
> +	}
> +

bucket_lock of cache set should be held before accessing buckets.


> +	c->gc_stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets;
> +}
> +
>  static bool gc_should_run(struct cache_set *c)
>  {
>  	struct cache *ca;
> @@ -1778,10 +1797,16 @@ static bool gc_should_run(struct cache_set *c)
>  static int bch_gc_thread(void *arg)
>  {
>  	struct cache_set *c = arg;
> +	long  ret;
> +	unsigned long timeout = msecs_to_jiffies(GC_THREAD_TIMEOUT_MS);
>  
>  	while (1) {
> -		wait_event_interruptible(c->gc_wait,
> -			   kthread_should_stop() || gc_should_run(c));
> +		ret = wait_event_interruptible_timeout(c->gc_wait,
> +			   kthread_should_stop() || gc_should_run(c), timeout);
> +		if (!ret) {
> +			bch_update_bucket_in_use(c);
> +			continue;

A continue here will ignore status returned from kthread_should_stop(),
which might not be expected behavior.


> +		}
>  
>  		if (kthread_should_stop())
>  			break;
> 

Iterating all buckets from the cache set requires bucket_lock to be
held. Waiting for bucket_lock may take quite a long time for either
bucket allocating code or bch_gc_thread(). What I concern is, this patch
may introduce bucket allocation delay in period of GC_THREAD_TIMEOUT_MS.

We need to find out a way to avoid such a performance regression.

-- 
Coly Li

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 12/19] bcache: update bucket_in_use periodically
       [not found]           ` <OF5C19A8FA.5FF48E0C-ON4825815A.001E6DB1-4825815A.001F14F2@zte.com.cn>
@ 2017-07-11  7:20             ` Coly Li
  2017-07-11 13:06             ` Coly Li
  1 sibling, 0 replies; 120+ messages in thread
From: Coly Li @ 2017-07-11  7:20 UTC (permalink / raw)
  To: tang.junhui
  Cc: axboe, bcache, hch, linux-bcache, linux-bcache-owner, linux-block

On 2017/7/11 下午1:39, tang.junhui@zte.com.cn wrote:
> Compared to bucket depletion, resulting in hanging dead,
> It is worthy to consumes a little time to update the bucket_in_use.
> If you have any better solution, please show to us,
> We should solve it as soon as possible, not wait for it forever.
> 
> 

Your response makes sense, but not all people share your opinion. The
major issue here is, we need to hold bucket_lock here,
otherwise we may access wild pointer to freed kobj and panic kernel.

And we need to measure how much time it takes to iterate over the dirty
buckets with bucket_lock held. If the latency is not that much,
we can ignore the cost, otherwise we do need to find a better solution.

Thanks.

Coly

> 
> 
> 发件人:         Coly Li <i@coly.li>
> 收件人:         linux-block@vger.kernel.org, Tang Junhui
> <tang.junhui@zte.com.cn>,
> 抄送:        bcache@lists.ewheeler.net, linux-bcache@vger.kernel.org,
> hch@infradead.org, axboe@kernel.dk
> 日期:         2017/07/11 13:06
> 主题:        Re: [PATCH 12/19] bcache: update bucket_in_use periodically
> 发件人:        linux-bcache-owner@vger.kernel.org
> ------------------------------------------------------------------------
> 
> 
> 
> On 2017/7/1 上午4:43, bcache@lists.ewheeler.net wrote:
>> From: Tang Junhui <tang.junhui@zte.com.cn>
>>
>> bucket_in_use is updated in gc thread which triggered by invalidating or
>> writing sectors_to_gc dirty data, It's been too long, Therefore, when we
>> use it to compare with the threshold, it is often not timely, which leads
>> to inaccurate judgment and often results in bucket depletion.
>>
>> Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
>> ---
>>  drivers/md/bcache/btree.c | 29 +++++++++++++++++++++++++++--
>>  1 file changed, 27 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
>> index 866dcf7..77aa20b 100644
>> --- a/drivers/md/bcache/btree.c
>> +++ b/drivers/md/bcache/btree.c
>> @@ -90,6 +90,8 @@
>>  #define MAX_NEED_GC                                  64
>>  #define MAX_SAVE_PRIO                                  72
>>  
>> +#define GC_THREAD_TIMEOUT_MS                 (30 * 1000)
>> +
>>  #define PTR_DIRTY_BIT                                  (((uint64_t) 1
> << 36))
>>  
>>  #define PTR_HASH(c, k)                                              
>                                                                         \
>> @@ -1760,6 +1762,23 @@ static void bch_btree_gc(struct cache_set *c)
>>                   bch_moving_gc(c);
>>  }
>>  
>> +void bch_update_bucket_in_use(struct cache_set *c)
>> +{
>> +                 struct cache *ca;
>> +                 struct bucket *b;
>> +                 unsigned i;
>> +                 size_t available = 0;
>> +
>> +                 for_each_cache(ca, c, i) {
>> +                                  for_each_bucket(b, ca) {
>> +                                                   if (!GC_MARK(b) ||
> GC_MARK(b) == GC_MARK_RECLAIMABLE)
>> +                                                                  
>  available++;
>> +                                  }
>> +                 }
>> +
> 
> bucket_lock of cache set should be held before accessing buckets.
> 
> 
>> +                 c->gc_stats.in_use = (c->nbuckets - available) * 100
> / c->nbuckets;
>> +}
>> +
>>  static bool gc_should_run(struct cache_set *c)
>>  {
>>                   struct cache *ca;
>> @@ -1778,10 +1797,16 @@ static bool gc_should_run(struct cache_set *c)
>>  static int bch_gc_thread(void *arg)
>>  {
>>                   struct cache_set *c = arg;
>> +                 long  ret;
>> +                 unsigned long timeout =
> msecs_to_jiffies(GC_THREAD_TIMEOUT_MS);
>>  
>>                   while (1) {
>> -                                  wait_event_interruptible(c->gc_wait,
>> -                                                    
>  kthread_should_stop() || gc_should_run(c));
>> +                                  ret =
> wait_event_interruptible_timeout(c->gc_wait,
>> +                                                    
>  kthread_should_stop() || gc_should_run(c), timeout);
>> +                                  if (!ret) {
>> +                                                  
> bch_update_bucket_in_use(c);
>> +                                                   continue;
> 
> A continue here will ignore status returned from kthread_should_stop(),
> which might not be expected behavior.
> 
> 
>> +                                  }
>>  
>>                                    if (kthread_should_stop())
>>                                                     break;
>>
> 
> Iterating all buckets from the cache set requires bucket_lock to be
> held. Waiting for bucket_lock may take quite a long time for either
> bucket allocating code or bch_gc_thread(). What I concern is, this patch
> may introduce bucket allocation delay in period of GC_THREAD_TIMEOUT_MS.
> 
> We need to find out a way to avoid such a performance regression.
> 
> -- 
> Coly Li
> --
> To unsubscribe from this list: send the line "unsubscribe linux-bcache" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 


-- 
Coly Li

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 06/19] bcache: explicitly destory mutex while exiting
  2017-07-05 11:58             ` Liang Chen
  (?)
@ 2017-07-11  7:22             ` Coly Li
  -1 siblings, 0 replies; 120+ messages in thread
From: Coly Li @ 2017-07-11  7:22 UTC (permalink / raw)
  To: Liang Chen; +Cc: bcache, linux-block, linux-bcache, hch, axboe, stable

On 2017/7/5 下午7:58, Liang Chen wrote:
> Hi Coly,
> Thanks for reviewing the patch! You raised a good point about the race. I also
> think it should be addressed. Even though the time window is small, it will
> still happen sooner or later.
> 
> I would like to keep this "destory mutex" patch unchanged, and send another
> patch to fix the issue based on your approach. Please take a look. Thanks!
> 

Sure, good idea. I'd like to review the next fix, and provide my
feedback together. Thanks.

Coly



> Thanks,
> Liang
> 
> On Sun, Jul 2, 2017 at 2:43 AM, Coly Li <i@coly.li> wrote:
>> On 2017/7/1 上午4:42, bcache@lists.ewheeler.net wrote:
>>> From: Liang Chen <liangchen.linux@gmail.com>
>>>
>>> mutex_destroy does nothing most of time, but it's better to call
>>> it to make the code future proof and it also has some meaning
>>> for like mutex debug.
>>>
>>> Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
>>> Reviewed-by: Eric Wheeler <bcache@linux.ewheeler.net>
>>> Cc: stable@vger.kernel.org
>>> ---
>>>  drivers/md/bcache/super.c | 2 ++
>>>  1 file changed, 2 insertions(+)
>>>
>>> diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
>>> index 48b8c20..1f84791 100644
>>> --- a/drivers/md/bcache/super.c
>>> +++ b/drivers/md/bcache/super.c
>>> @@ -2089,6 +2089,7 @@ static void bcache_exit(void)
>>>       if (bcache_major)
>>>               unregister_blkdev(bcache_major, "bcache");
>>>       unregister_reboot_notifier(&reboot);
>>> +     mutex_destroy(&bch_register_lock>  }
>>>
>>>  static int __init bcache_init(void)
>>> @@ -2106,6 +2107,7 @@ static int __init bcache_init(void)
>>>
>>>       bcache_major = register_blkdev(0, "bcache");
>>>       if (bcache_major < 0) {
>>> +             mutex_destroy(&bch_register_lock);
>>>               unregister_reboot_notifier(&reboot);
>>>               return bcache_major;
>>>       }
>>>
>>
>> Hi Liang,
>>
>> Current code might have a potential race in a very corner case, see,
>> 2084 static int __init bcache_init(void)
>> 2085 {
>> 2086         static const struct attribute *files[] = {
>> 2087                 &ksysfs_register.attr,
>> 2088                 &ksysfs_register_quiet.attr,
>> 2089                 NULL
>> 2090         };
>> 2091
>> 2092         mutex_init(&bch_register_lock);
>> 2093         init_waitqueue_head(&unregister_wait);
>> 2094         register_reboot_notifier(&reboot);
>> 2095         closure_debug_init();
>> 2096
>> 2097         bcache_major = register_blkdev(0, "bcache");
>> 2098         if (bcache_major < 0) {
>> 2099                 unregister_reboot_notifier(&reboot);
>> 2100                 return bcache_major;
>> 2101         }
>> 2102
>> 2103         if (!(bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM,
>> 0)) ||
>> 2104             !(bcache_kobj = kobject_create_and_add("bcache",
>> fs_kobj)) ||
>> 2105             sysfs_create_files(bcache_kobj, files) ||
>> 2106             bch_request_init() ||
>> 2107             bch_debug_init(bcache_kobj))
>> 2108                 goto err;
>> 2109
>> 2110         return 0;
>> 2111 err:
>> 2112         bcache_exit();
>> 2113         return -ENOMEM;
>> 2114 }
>>
>> At line 2107, most of bache stuffs are ready to work, only a debugfs
>> entry not created yet. If in the time gap between line 2106 and line
>> 2017, another user space tool just registers cache and backing devices.
>> Then bch_debug_init() failed, and bcache_exit() gets called. In this
>> case, I doubt bcache_exit() can handle all the references correctly.
>>
>> The race is very rare, and almost won't happen in real life. So, if you
>> don't care about it, the patch can be simpler like this,
>> diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
>> index e57353e39168..fb5453a46a03 100644
>> --- a/drivers/md/bcache/super.c
>> +++ b/drivers/md/bcache/super.c
>> @@ -2070,6 +2070,7 @@ static struct notifier_block reboot = {
>>
>>  static void bcache_exit(void)
>>  {
>> +       mutex_destroy(&bch_register_lock);
>>         bch_debug_exit();
>>         bch_request_exit();
>>         if (bcache_kobj)
>> @@ -2089,7 +2090,6 @@ static int __init bcache_init(void)
>>                 NULL
>>         };
>>
>> -       mutex_init(&bch_register_lock);
>>         init_waitqueue_head(&unregister_wait);
>>         register_reboot_notifier(&reboot);
>>         closure_debug_init();
>> @@ -2107,6 +2107,7 @@ static int __init bcache_init(void)
>>             bch_debug_init(bcache_kobj))
>>                 goto err;
>>
>> +       mutex_init(&bch_register_lock);
>>         return 0;
>>  err:
>>         bcache_exit();
>> ---
>> And if you do care about the race, maybe you should do something like this,
>> diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
>> index e57353e39168..ca1d8b7a7815 100644
>> --- a/drivers/md/bcache/super.c
>> +++ b/drivers/md/bcache/super.c
>> @@ -2079,6 +2079,7 @@ static void bcache_exit(void)
>>         if (bcache_major)
>>                 unregister_blkdev(bcache_major, "bcache");
>>         unregister_reboot_notifier(&reboot);
>> +       mutex_unlock(&bch_register_lock);
>>  }
>>
>>  static int __init bcache_init(void)
>> @@ -2090,6 +2091,7 @@ static int __init bcache_init(void)
>>         };
>>
>>         mutex_init(&bch_register_lock);
>> +       mutex_lock(&bch_register_lock);
>>         init_waitqueue_head(&unregister_wait);
>>         register_reboot_notifier(&reboot);
>>         closure_debug_init();
>> @@ -2097,6 +2099,8 @@ static int __init bcache_init(void)
>>         bcache_major = register_blkdev(0, "bcache");
>>         if (bcache_major < 0) {
>>                 unregister_reboot_notifier(&reboot);
>> +               mutex_unlock(&bch_register_lock);
>> +               mutex_destroy(&bch_register_lock);
>>                 return bcache_major;
>>         }
>>
>> @@ -2104,9 +2108,12 @@ static int __init bcache_init(void)
>>             !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
>>             sysfs_create_files(bcache_kobj, files) ||
>>             bch_request_init() ||
>> -           bch_debug_init(bcache_kobj))
>> +           bch_debug_init(bcache_kobj)) {
>> +               mutex_unlock(&bch_register_lock);
>>                 goto err;
>> +       }
>>
>> +       mutex_unlock(&bch_register_lock);
>>         return 0;
>>  err:
>>         bcache_exit();
>> ---
>>
>> Personally I think the first approach with only one new line code added,
>> your original version will add two new lines of code.
>>
>> Just FYI. Thanks.
>>
>> --
>> Coly Li


-- 
Coly Li

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 12/19] bcache: update bucket_in_use periodically
       [not found]           ` <OF5C19A8FA.5FF48E0C-ON4825815A.001E6DB1-4825815A.001F14F2@zte.com.cn>
  2017-07-11  7:20             ` Coly Li
@ 2017-07-11 13:06             ` Coly Li
  2017-07-13  4:13               ` Eric Wheeler
  1 sibling, 1 reply; 120+ messages in thread
From: Coly Li @ 2017-07-11 13:06 UTC (permalink / raw)
  To: tang.junhui
  Cc: axboe, bcache, hch, linux-bcache, linux-bcache-owner, linux-block

On 2017/7/11 下午1:39, tang.junhui@zte.com.cn wrote:
> Compared to bucket depletion, resulting in hanging dead,
> It is worthy to consumes a little time to update the bucket_in_use.
> If you have any better solution, please show to us,
> We should solve it as soon as possible, not wait for it forever.
> 

I also test this patch on a cache device with 4x3.8TB size, all buckets
iteration takes around 40-50ms. If the iteration needs to hold the
bucket_lock of the cache set, it is very likely to introduce a huge I/O
latency every 30 seconds.

For database people, this is not good news.

Coly


> 
> 
> 
> 发件人:         Coly Li <i@coly.li>
> 收件人:         linux-block@vger.kernel.org, Tang Junhui
> <tang.junhui@zte.com.cn>,
> 抄送:        bcache@lists.ewheeler.net, linux-bcache@vger.kernel.org,
> hch@infradead.org, axboe@kernel.dk
> 日期:         2017/07/11 13:06
> 主题:        Re: [PATCH 12/19] bcache: update bucket_in_use periodically
> 发件人:        linux-bcache-owner@vger.kernel.org
> ------------------------------------------------------------------------
> 
> 
> 
> On 2017/7/1 上午4:43, bcache@lists.ewheeler.net wrote:
>> From: Tang Junhui <tang.junhui@zte.com.cn>
>>
>> bucket_in_use is updated in gc thread which triggered by invalidating or
>> writing sectors_to_gc dirty data, It's been too long, Therefore, when we
>> use it to compare with the threshold, it is often not timely, which leads
>> to inaccurate judgment and often results in bucket depletion.
>>
>> Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
>> ---
>>  drivers/md/bcache/btree.c | 29 +++++++++++++++++++++++++++--
>>  1 file changed, 27 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
>> index 866dcf7..77aa20b 100644
>> --- a/drivers/md/bcache/btree.c
>> +++ b/drivers/md/bcache/btree.c
>> @@ -90,6 +90,8 @@
>>  #define MAX_NEED_GC                                  64
>>  #define MAX_SAVE_PRIO                                  72
>>  
>> +#define GC_THREAD_TIMEOUT_MS                 (30 * 1000)
>> +
>>  #define PTR_DIRTY_BIT                                  (((uint64_t) 1
> << 36))
>>  
>>  #define PTR_HASH(c, k)                                              
>                                                                         \
>> @@ -1760,6 +1762,23 @@ static void bch_btree_gc(struct cache_set *c)
>>                   bch_moving_gc(c);
>>  }
>>  
>> +void bch_update_bucket_in_use(struct cache_set *c)
>> +{
>> +                 struct cache *ca;
>> +                 struct bucket *b;
>> +                 unsigned i;
>> +                 size_t available = 0;
>> +
>> +                 for_each_cache(ca, c, i) {
>> +                                  for_each_bucket(b, ca) {
>> +                                                   if (!GC_MARK(b) ||
> GC_MARK(b) == GC_MARK_RECLAIMABLE)
>> +                                                                  
>  available++;
>> +                                  }
>> +                 }
>> +
> 
> bucket_lock of cache set should be held before accessing buckets.
> 
> 
>> +                 c->gc_stats.in_use = (c->nbuckets - available) * 100
> / c->nbuckets;
>> +}
>> +
>>  static bool gc_should_run(struct cache_set *c)
>>  {
>>                   struct cache *ca;
>> @@ -1778,10 +1797,16 @@ static bool gc_should_run(struct cache_set *c)
>>  static int bch_gc_thread(void *arg)
>>  {
>>                   struct cache_set *c = arg;
>> +                 long  ret;
>> +                 unsigned long timeout =
> msecs_to_jiffies(GC_THREAD_TIMEOUT_MS);
>>  
>>                   while (1) {
>> -                                  wait_event_interruptible(c->gc_wait,
>> -                                                    
>  kthread_should_stop() || gc_should_run(c));
>> +                                  ret =
> wait_event_interruptible_timeout(c->gc_wait,
>> +                                                    
>  kthread_should_stop() || gc_should_run(c), timeout);
>> +                                  if (!ret) {
>> +                                                  
> bch_update_bucket_in_use(c);
>> +                                                   continue;
> 
> A continue here will ignore status returned from kthread_should_stop(),
> which might not be expected behavior.
> 
> 
>> +                                  }
>>  
>>                                    if (kthread_should_stop())
>>                                                     break;
>>
> 
> Iterating all buckets from the cache set requires bucket_lock to be
> held. Waiting for bucket_lock may take quite a long time for either
> bucket allocating code or bch_gc_thread(). What I concern is, this patch
> may introduce bucket allocation delay in period of GC_THREAD_TIMEOUT_MS.
> 
> We need to find out a way to avoid such a performance regression.
> 
> -- 
> Coly Li
> --
> To unsubscribe from this list: send the line "unsubscribe linux-bcache" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 


-- 
Coly Li

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 09/19 v2] bcache: update bio->bi_opf bypass/writeback REQ_ flag hints
  2017-07-11  3:48                       ` Coly Li
@ 2017-07-12  9:18                         ` Coly Li
  0 siblings, 0 replies; 120+ messages in thread
From: Coly Li @ 2017-07-12  9:18 UTC (permalink / raw)
  To: Coly Li, bcache, Eric Wheeler, Eric Wheeler
  Cc: Christoph Hellwig, linux-block, linux-bcache, axboe

On 2017/7/11 上午11:48, Coly Li wrote:
> On 2017/7/6 下午11:24, Christoph Hellwig wrote:
>> On Thu, Jul 06, 2017 at 03:35:48PM +0800, Coly Li wrote:
>>> Then gfs2 breaks the above rule ?  in gfs2_metapath_ra() and
>>> gfs2_dir_readahead(), only REQ_META is used in submit_bh(). It seems an
>>> extra REQ_PRIO should be there.
>>
>> Or maybe not.  E.g. XFS absolutely avoids using REQ_PRIO for any of
>> the buffer writeback as it does not have priority.  You'll need to
>> ask gfs2 folks if they want this I/O to have priority or not.
>>
> 
> I see. Just sent a RFC patch to gfs2 development list, and wait for
> response.
> 
> Thanks for the hint.

Hi Eric,

Based on the comments from Christoph, I think the original version is
correct enough. gfs2 is the only file system that uses REQ_READAHEAD for
metadata readahead; if gfs2 developers agree to add REQ_PRIO when issuing
metadata bios, we can use another patch to cache its metadata in bcache.
Currently we can just let this patch go ahead.

Thanks for patiently explaining the answer to my question, and thanks to
Christoph for correcting the concepts of REQ_META and REQ_PRIO.

Please add Reviewed-by: Coly Li <colyli@suse.de> to your original
version patch.


Coly Li

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 13/19] bcache: delete redundant calling set_gc_sectors()
  2017-06-30 20:43       ` [PATCH 13/19] bcache: delete redundant calling set_gc_sectors() bcache
@ 2017-07-13  3:41         ` Eric Wheeler
  0 siblings, 0 replies; 120+ messages in thread
From: Eric Wheeler @ 2017-07-13  3:41 UTC (permalink / raw)
  To: Tang Junhui; +Cc: linux-block, linux-bcache

On Fri, 30 Jun 2017, bcache@lists.ewheeler.net wrote:

> From: Tang Junhui <tang.junhui@zte.com.cn>
> 
> set_gc_sectors() has been called in bch_gc_thread(), and it was called
> again in bch_btree_gc_finish() . The following call is unnecessary, so
> delete it.


I'm trying to follow the call path that proves that this is safe, but I'm 
not seeing it.  Can you explain this in more detail?

--
Eric Wheeler



> 
> Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
> ---
>  drivers/md/bcache/btree.c | 1 -
>  1 file changed, 1 deletion(-)
> 
> diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
> index 77aa20b..66d8036 100644
> --- a/drivers/md/bcache/btree.c
> +++ b/drivers/md/bcache/btree.c
> @@ -1662,7 +1662,6 @@ static size_t bch_btree_gc_finish(struct cache_set *c)
>  
>  	mutex_lock(&c->bucket_lock);
>  
> -	set_gc_sectors(c);
>  	c->gc_mark_valid = 1;
>  	c->need_gc	= 0;
>  
> -- 
> 1.8.3.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-bcache" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 04/19] bcache: fix wrong cache_misses statistics
  2017-07-01 17:58         ` Coly Li
@ 2017-07-13  4:09           ` Eric Wheeler
  2017-10-27 19:14             ` Eric Wheeler
  0 siblings, 1 reply; 120+ messages in thread
From: Eric Wheeler @ 2017-07-13  4:09 UTC (permalink / raw)
  To: Tang Junhui; +Cc: Coly Li, linux-block, linux-bcache

[-- Attachment #1: Type: TEXT/PLAIN, Size: 1253 bytes --]

On Sun, 2 Jul 2017, Coly Li wrote:

> On 2017/7/1 上午4:42, bcache@lists.ewheeler.net wrote:
> > From: Tang Junhui <tang.junhui@zte.com.cn>
> > 
> > Some missed IOs are not counted into cache_misses, this patch fix this
> > issue.
> 
> Could you please explain more about,
> - which kind of missed I/O are mot counted
> - where cache_missed is located
> 
> This will help the patch to be more understandable.

Hi Tang,

I'm waiting to queue this patch pending your response to Coly.  Can you 
update the message and send a v2?

--
Eric Wheeler



> 
> > 
> > Signed-off-by: tang.junhui <tang.junhui@zte.com.cn>
> > Reviewed-by: Eric Wheeler <bcache@linux.ewheeler.net>
> > Cc: stable@vger.kernel.org
> 
> [snip]
> 
> > @@ -758,7 +760,7 @@ static void cached_dev_read_done_bh(struct closure *cl)
> >  	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
> >  
> >  	bch_mark_cache_accounting(s->iop.c, s->d,
> > -				  !s->cache_miss, s->iop.bypass);
> > +				  !s->cache_missed, s->iop.bypass);
> >  	trace_bcache_read(s->orig_bio, !s->cache_miss, s->iop.bypass);
> 
> 
> Should the above line be changed to,
> 	trace_bcache_read(s->orig_bio, !s->cache_missed, s->iop.bypass);
> as well ?
> 
> 
> [snip]
> 
> Thanks.
> 
> -- 
> Coly Li
> 

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 10/19] bcache: initialize stripe_sectors_dirty correctly for thin flash device
  2017-07-01 18:52         ` Coly Li
@ 2017-07-13  4:10           ` Eric Wheeler
  0 siblings, 0 replies; 120+ messages in thread
From: Eric Wheeler @ 2017-07-13  4:10 UTC (permalink / raw)
  To: Tang Junhui; +Cc: Coly Li, linux-block, linux-bcache

[-- Attachment #1: Type: TEXT/PLAIN, Size: 3222 bytes --]

On Sun, 2 Jul 2017, Coly Li wrote:

> On 2017/7/1 上午4:42, bcache@lists.ewheeler.net wrote:
> > From: Tang Junhui <tang.junhui@zte.com.cn>
> > 
> > Thin flash device does not initialize stripe_sectors_dirty correctly, this
> > patch fixes this issue.
> 
> Hi Junhui,
> 
> Could you please explain why stripe_sectors_ditry is not correctly
> initialized and how about its negative result ?

Hi Tang,
   
I'm waiting to queue this patch pending your response to Coly.  Can you
update the message and send a v2?

--
Eric Wheeler



> 
> Thanks.
> 
> Coly
> 
> > 
> > Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
> > Cc: stable@vger.kernel.org
> > ---
> >  drivers/md/bcache/super.c     | 3 ++-
> >  drivers/md/bcache/writeback.c | 8 ++++----
> >  drivers/md/bcache/writeback.h | 2 +-
> >  3 files changed, 7 insertions(+), 6 deletions(-)
> > 
> > diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
> > index 1f84791..e06641e 100644
> > --- a/drivers/md/bcache/super.c
> > +++ b/drivers/md/bcache/super.c
> > @@ -1030,7 +1030,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
> >  	}
> >  
> >  	if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
> > -		bch_sectors_dirty_init(dc);
> > +		bch_sectors_dirty_init(&dc->disk);
> >  		atomic_set(&dc->has_dirty, 1);
> >  		atomic_inc(&dc->count);
> >  		bch_writeback_queue(dc);
> > @@ -1232,6 +1232,7 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
> >  		goto err;
> >  
> >  	bcache_device_attach(d, c, u - c->uuids);
> > +	bch_sectors_dirty_init(d);
> >  	bch_flash_dev_request_init(d);
> >  	add_disk(d->disk);
> >  
> > diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
> > index 3d463f0..4ac8b13 100644
> > --- a/drivers/md/bcache/writeback.c
> > +++ b/drivers/md/bcache/writeback.c
> > @@ -482,17 +482,17 @@ static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
> >  	return MAP_CONTINUE;
> >  }
> >  
> > -void bch_sectors_dirty_init(struct cached_dev *dc)
> > +void bch_sectors_dirty_init(struct bcache_device *d)
> >  {
> >  	struct sectors_dirty_init op;
> >  
> >  	bch_btree_op_init(&op.op, -1);
> > -	op.inode = dc->disk.id;
> > +	op.inode = d->id;
> >  
> > -	bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0),
> > +	bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0),
> >  			   sectors_dirty_init_fn, 0);
> >  
> > -	dc->disk.sectors_dirty_last = bcache_dev_sectors_dirty(&dc->disk);
> > +	d->sectors_dirty_last = bcache_dev_sectors_dirty(d);
> >  }
> >  
> >  void bch_cached_dev_writeback_init(struct cached_dev *dc)
> > diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
> > index ea2f92e..c2ab4b4 100644
> > --- a/drivers/md/bcache/writeback.h
> > +++ b/drivers/md/bcache/writeback.h
> > @@ -109,7 +109,7 @@ static inline void bch_writeback_add(struct cached_dev *dc)
> >  
> >  void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int);
> >  
> > -void bch_sectors_dirty_init(struct cached_dev *dc);
> > +void bch_sectors_dirty_init(struct bcache_device *);
> >  void bch_cached_dev_writeback_init(struct cached_dev *);
> >  int bch_cached_dev_writeback_start(struct cached_dev *);
> 
> 

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: Re: [PATCH 11/19] bcache: Subtract dirty sectors of thin flash from cache_sectors in calculating writeback rate
       [not found]           ` <OF92BDA950.86AA00FA-ON4825815A.001F33D9-4825815A.001F5C89@zte.com.cn>
@ 2017-07-13  4:12             ` Eric Wheeler
  2017-07-13  4:15               ` Coly Li
  0 siblings, 1 reply; 120+ messages in thread
From: Eric Wheeler @ 2017-07-13  4:12 UTC (permalink / raw)
  To: tang.junhui; +Cc: Coly Li, linux-bcache, linux-block, stable

[-- Attachment #1: Type: TEXT/PLAIN, Size: 5081 bytes --]

On Tue, 11 Jul 2017, tang.junhui@zte.com.cn wrote:

> > Based on the above implementation, non-dirty space from flash only
> > bcache device will mislead writeback rate calculation too. So I suggest
> > to subtract all buckets size from all flash only bcache devices. Then it
> > might be something like,
> 
> what is non-dirty space from flash only bcache device?
> Where is non-dirty space from flash only bcache device?

Hi Tang, Coly:
   
Was there more discussion on this thread, or is the patch good to go?  

Please send your ack if you're happy with it so I can queue it up.

--
Eric Wheeler

> 
> 
> 
> 发件人:         Coly Li <i@coly.li>
> 收件人:         Tang Junhui <tang.junhui@zte.com.cn>,
> 抄送:        bcache@lists.ewheeler.net, linux-block@vger.kernel.org, linux-bcache@vger.kernel.org,
> hch@infradead.org, axboe@kernel.dk, stable@vger.kernel.org
> 日期:         2017/07/11 02:11
> 主题:        Re: [PATCH 11/19] bcache: Subtract dirty sectors of thin flash from cache_sectors in calculating
> writeback rate
> 发件人:        linux-bcache-owner@vger.kernel.org
> 
> _________________________________________________________________________________________________________________
> 
> 
> 
> On 2017/7/1 上午4:43, bcache@lists.ewheeler.net wrote:
> > From: Tang Junhui <tang.junhui@zte.com.cn>
> >
> > Since dirty sectors of thin flash cannot be used to cache data for backend
> > device, so we should subtract it in calculating writeback rate.
> >
> 
> I see you want to get rid of the noise of flash only cache device for
> writeback rate calculation. It makes sense, because flash only cache
> device won't have write back happen at all.
> 
> 
> > Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
> > Cc: stable@vger.kernel.org
> > ---
> >  drivers/md/bcache/writeback.c |  2 +-
> >  drivers/md/bcache/writeback.h | 19 +++++++++++++++++++
> >  2 files changed, 20 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
> > index 4ac8b13..25289e4 100644
> > --- a/drivers/md/bcache/writeback.c
> > +++ b/drivers/md/bcache/writeback.c
> > @@ -21,7 +21,7 @@
> >  static void __update_writeback_rate(struct cached_dev *dc)
> >  {
> >                   struct cache_set *c = dc->disk.c;
> > -                 uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size;
> > +                 uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size -
> bcache_flash_devs_sectors_dirty(c);
> 
> See flash_dev_run(), the flash volume is created per struct
> bcache_device of a cache set. That means, all data allocated for the
> flash volume will be from a flash only bcache device. Regular dirty data
> won't mixed allocating with flash volume dirty data on identical struct
> bcache device.
> 
> Based on the above implementation, non-dirty space from flash only
> bcache device will mislead writeback rate calculation too. So I suggest
> to subtract all buckets size from all flash only bcache devices. Then it
> might be something like,
> 
> uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size -
>                                                   bcache_flash_devs_nbuckets(c);
> 
> 
> 
> Just FYI. Thanks.
> 
> Coly
> 
> >                   uint64_t cache_dirty_target =
> >                                    div_u64(cache_sectors * dc->writeback_percent, 100);
> >  
> > diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
> > index c2ab4b4..24ff589 100644
> > --- a/drivers/md/bcache/writeback.h
> > +++ b/drivers/md/bcache/writeback.h
> > @@ -14,6 +14,25 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
> >                   return ret;
> >  }
> >  
> > +static inline uint64_t  bcache_flash_devs_sectors_dirty(struct cache_set *c)
> > +{
> > +                 uint64_t i, ret = 0;
> > +
> > +                 mutex_lock(&bch_register_lock);
> > +
> > +                 for (i = 0; i < c->nr_uuids; i++) {
> > +                                  struct bcache_device *d = c->devices[i];
> > +
> > +                                  if (!d || !UUID_FLASH_ONLY(&c->uuids[i]))
> > +                                                   continue;
> > +                    ret += bcache_dev_sectors_dirty(d);
> > +                 }
> > +
> > +                 mutex_unlock(&bch_register_lock);
> > +
> > +                 return ret;
> > +}
> > +
> >  static inline unsigned offset_to_stripe(struct bcache_device *d,
> >                                                                                       uint64_t offset)
> >  {
> >
> --
> To unsubscribe from this list: send the line "unsubscribe linux-bcache" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 
> 
> 

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 12/19] bcache: update bucket_in_use periodically
  2017-07-11 13:06             ` Coly Li
@ 2017-07-13  4:13               ` Eric Wheeler
  2017-07-13  4:27                 ` Coly Li
  0 siblings, 1 reply; 120+ messages in thread
From: Eric Wheeler @ 2017-07-13  4:13 UTC (permalink / raw)
  To: tang.junhui; +Cc: Coly Li, linux-bcache, linux-block

[-- Attachment #1: Type: TEXT/PLAIN, Size: 5594 bytes --]

On Tue, 11 Jul 2017, Coly Li wrote:

> On 2017/7/11 下午1:39, tang.junhui@zte.com.cn wrote:
> > Compared to bucket depletion, resulting in hanging dead,
> > it is worth consuming a little time to update the bucket_in_use.
> > If you have any better solution, please show to us,
> > We should solve it as soon as possible, not wait for it forever.
> > 
> 
> I also test this patch on a cache device with 4x3.8TB size, all buckets
> iteration takes around 40-50ms. If the iteration needs to hold
> bucket_lock of cache set, it is very probably to introduce a huge I/O
> latency in period of every 30 seconds.
> 
> For database people, this is not good news.


Hi Tang,
   
I'm waiting to queue this patch pending your response to Coly.  

Please send a v2 when you're ready.

Thanks!

--
Eric Wheeler

> 
> Coly
> 
> 
> > 
> > 
> > 
> > 发件人:         Coly Li <i@coly.li>
> > 收件人:         linux-block@vger.kernel.org, Tang Junhui
> > <tang.junhui@zte.com.cn>,
> > 抄送:        bcache@lists.ewheeler.net, linux-bcache@vger.kernel.org,
> > hch@infradead.org, axboe@kernel.dk
> > 日期:         2017/07/11 13:06
> > 主题:        Re: [PATCH 12/19] bcache: update bucket_in_use periodically
> > 发件人:        linux-bcache-owner@vger.kernel.org
> > ------------------------------------------------------------------------
> > 
> > 
> > 
> > On 2017/7/1 上午4:43, bcache@lists.ewheeler.net wrote:
> >> From: Tang Junhui <tang.junhui@zte.com.cn>
> >>
> >> bucket_in_use is updated in gc thread which triggered by invalidating or
> >> writing sectors_to_gc dirty data, It's been too long, Therefore, when we
> >> use it to compare with the threshold, it is often not timely, which leads
> >> to inaccurate judgment and often results in bucket depletion.
> >>
> >> Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
> >> ---
> >>  drivers/md/bcache/btree.c | 29 +++++++++++++++++++++++++++--
> >>  1 file changed, 27 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
> >> index 866dcf7..77aa20b 100644
> >> --- a/drivers/md/bcache/btree.c
> >> +++ b/drivers/md/bcache/btree.c
> >> @@ -90,6 +90,8 @@
> >>  #define MAX_NEED_GC                                  64
> >>  #define MAX_SAVE_PRIO                                  72
> >>  
> >> +#define GC_THREAD_TIMEOUT_MS                 (30 * 1000)
> >> +
> >>  #define PTR_DIRTY_BIT                                  (((uint64_t) 1
> > << 36))
> >>  
> >>  #define PTR_HASH(c, k)                                              
> >                                                                         \
> >> @@ -1760,6 +1762,23 @@ static void bch_btree_gc(struct cache_set *c)
> >>                   bch_moving_gc(c);
> >>  }
> >>  
> >> +void bch_update_bucket_in_use(struct cache_set *c)
> >> +{
> >> +                 struct cache *ca;
> >> +                 struct bucket *b;
> >> +                 unsigned i;
> >> +                 size_t available = 0;
> >> +
> >> +                 for_each_cache(ca, c, i) {
> >> +                                  for_each_bucket(b, ca) {
> >> +                                                   if (!GC_MARK(b) ||
> > GC_MARK(b) == GC_MARK_RECLAIMABLE)
> >> +                                                                  
> >  available++;
> >> +                                  }
> >> +                 }
> >> +
> > 
> > bucket_lock of cache set should be held before accessing buckets.
> > 
> > 
> >> +                 c->gc_stats.in_use = (c->nbuckets - available) * 100
> > / c->nbuckets;
> >> +}
> >> +
> >>  static bool gc_should_run(struct cache_set *c)
> >>  {
> >>                   struct cache *ca;
> >> @@ -1778,10 +1797,16 @@ static bool gc_should_run(struct cache_set *c)
> >>  static int bch_gc_thread(void *arg)
> >>  {
> >>                   struct cache_set *c = arg;
> >> +                 long  ret;
> >> +                 unsigned long timeout =
> > msecs_to_jiffies(GC_THREAD_TIMEOUT_MS);
> >>  
> >>                   while (1) {
> >> -                                  wait_event_interruptible(c->gc_wait,
> >> -                                                    
> >  kthread_should_stop() || gc_should_run(c));
> >> +                                  ret =
> > wait_event_interruptible_timeout(c->gc_wait,
> >> +                                                    
> >  kthread_should_stop() || gc_should_run(c), timeout);
> >> +                                  if (!ret) {
> >> +                                                  
> > bch_update_bucket_in_use(c);
> >> +                                                   continue;
> > 
> > A continue here will ignore status returned from kthread_should_stop(),
> > which might not be expected behavior.
> > 
> > 
> >> +                                  }
> >>  
> >>                                    if (kthread_should_stop())
> >>                                                     break;
> >>
> > 
> > Iterating all buckets from the cache set requires bucket_lock to be
> > held. Waiting for bucket_lock may take quite a long time for either
> > bucket allocating code or bch_gc_thread(). What I concern is, this patch
> > may introduce bucket allocation delay in period of GC_THREAD_TIMEOUT_MS.
> > 
> > We need to find out a way to avoid such a performance regression.
> > 
> > -- 
> > Coly Li
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-bcache" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > 
> > 
> 
> 
> -- 
> Coly Li
> 

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 11/19] bcache: Subtract dirty sectors of thin flash from cache_sectors in calculating writeback rate
  2017-07-13  4:12             ` Eric Wheeler
@ 2017-07-13  4:15               ` Coly Li
  2017-10-27 19:12                 ` Eric Wheeler
  0 siblings, 1 reply; 120+ messages in thread
From: Coly Li @ 2017-07-13  4:15 UTC (permalink / raw)
  To: Eric Wheeler, tang.junhui; +Cc: linux-bcache, linux-block, stable

On 2017/7/13 下午12:12, Eric Wheeler wrote:
> On Tue, 11 Jul 2017, tang.junhui@zte.com.cn wrote:
> 
>>> Based on the above implementation, non-dirty space from flash only
>>> bcache device will mislead writeback rate calculation too. So I suggest
>>> to subtract all buckets size from all flash only bcache devices. Then it
>>> might be something like,
>>
>> what is non-dirty space from flash only bcache device?
>> Where is non-dirty space from flash only bcache device?
> 
> Hi Tang, Coly:
>    
> Was there more discussion on this thread, or is the patch good to go?  
> 
> Please send your ack if you're happy with it so I can queue it up.

I discussed with Tang offline, this patch is correct. But the patch
commit log should be improved. Now I help to work on it, should be done
quite soon.

Coly

>>
>>
>> 发件人:         Coly Li <i@coly.li>
>> 收件人:         Tang Junhui <tang.junhui@zte.com.cn>,
>> 抄送:        bcache@lists.ewheeler.net, linux-block@vger.kernel.org, linux-bcache@vger.kernel.org,
>> hch@infradead.org, axboe@kernel.dk, stable@vger.kernel.org
>> 日期:         2017/07/11 02:11
>> 主题:        Re: [PATCH 11/19] bcache: Subtract dirty sectors of thin flash from cache_sectors in calculating
>> writeback rate
>> 发件人:        linux-bcache-owner@vger.kernel.org
>>
>> _________________________________________________________________________________________________________________
>>
>>
>>
>> On 2017/7/1 上午4:43, bcache@lists.ewheeler.net wrote:
>>> From: Tang Junhui <tang.junhui@zte.com.cn>
>>>
>>> Since dirty sectors of thin flash cannot be used to cache data for backend
>>> device, so we should subtract it in calculating writeback rate.
>>>
>>
>> I see you want to get rid of the noise of flash only cache device for
>> writeback rate calculation. It makes sense, because flash only cache
>> device won't have write back happen at all.
>>
>>
>>> Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
>>> Cc: stable@vger.kernel.org
>>> ---
>>>  drivers/md/bcache/writeback.c |  2 +-
>>>  drivers/md/bcache/writeback.h | 19 +++++++++++++++++++
>>>  2 files changed, 20 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
>>> index 4ac8b13..25289e4 100644
>>> --- a/drivers/md/bcache/writeback.c
>>> +++ b/drivers/md/bcache/writeback.c
>>> @@ -21,7 +21,7 @@
>>>  static void __update_writeback_rate(struct cached_dev *dc)
>>>  {
>>>                   struct cache_set *c = dc->disk.c;
>>> -                 uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size;
>>> +                 uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size -
>> bcache_flash_devs_sectors_dirty(c);
>>
>> See flash_dev_run(), the flash volume is created per struct
>> bcache_device of a cache set. That means, all data allocated for the
>> flash volume will be from a flash only bcache device. Regular dirty data
>> won't mixed allocating with flash volume dirty data on identical struct
>> bcache device.
>>
>> Based on the above implementation, non-dirty space from flash only
>> bcache device will mislead writeback rate calculation too. So I suggest
>> to subtract all buckets size from all flash only bcache devices. Then it
>> might be something like,
>>
>> uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size -
>>                                                   bcache_flash_devs_nbuckets(c);
>>
>>
>>
>> Just FYI. Thanks.
>>
>> Coly
>>
>>>                   uint64_t cache_dirty_target =
>>>                                    div_u64(cache_sectors * dc->writeback_percent, 100);
>>>  
>>> diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
>>> index c2ab4b4..24ff589 100644
>>> --- a/drivers/md/bcache/writeback.h
>>> +++ b/drivers/md/bcache/writeback.h
>>> @@ -14,6 +14,25 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
>>>                   return ret;
>>>  }
>>>  
>>> +static inline uint64_t  bcache_flash_devs_sectors_dirty(struct cache_set *c)
>>> +{
>>> +                 uint64_t i, ret = 0;
>>> +
>>> +                 mutex_lock(&bch_register_lock);
>>> +
>>> +                 for (i = 0; i < c->nr_uuids; i++) {
>>> +                                  struct bcache_device *d = c->devices[i];
>>> +
>>> +                                  if (!d || !UUID_FLASH_ONLY(&c->uuids[i]))
>>> +                                                   continue;
>>> +                    ret += bcache_dev_sectors_dirty(d);
>>> +                 }
>>> +
>>> +                 mutex_unlock(&bch_register_lock);
>>> +
>>> +                 return ret;
>>> +}
>>> +
>>>  static inline unsigned offset_to_stripe(struct bcache_device *d,
>>>                                                                                       uint64_t offset)
>>>  {
>>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-bcache" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>
>>
>>


-- 
Coly Li

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 12/19] bcache: update bucket_in_use periodically
  2017-07-13  4:13               ` Eric Wheeler
@ 2017-07-13  4:27                 ` Coly Li
  2017-10-27 19:11                   ` Eric Wheeler
  0 siblings, 1 reply; 120+ messages in thread
From: Coly Li @ 2017-07-13  4:27 UTC (permalink / raw)
  To: Eric Wheeler, tang.junhui; +Cc: linux-bcache, linux-block

On 2017/7/13 下午12:13, Eric Wheeler wrote:
> On Tue, 11 Jul 2017, Coly Li wrote:
> 
>> On 2017/7/11 下午1:39, tang.junhui@zte.com.cn wrote:
>>> Compared to bucket depletion, resulting in hanging dead,
>>> it is worth consuming a little time to update the bucket_in_use.
>>> If you have any better solution, please show to us,
>>> We should solve it as soon as possible, not wait for it forever.
>>>
>>
>> I also test this patch on a cache device with 4x3.8TB size, all buckets
>> iteration takes around 40-50ms. If the iteration needs to hold
>> bucket_lock of cache set, it is very probably to introduce a huge I/O
>> latency in period of every 30 seconds.
>>
>> For database people, this is not good news.
> 
> 
> Hi Tang,
>    
> I'm waiting to queue this patch pending your response to Coly.  
> 
> Please send a v2 when you're ready.


Eric,

I guess Tang is working on the I/O hang issue during back ground garbage
collection running. From discussion from other email thread, it seems a
regular I/O request gets hung for 10+ second in some cases. Maybe that
issue is more urgent than this one.

From my personal opinion, updating bucket_in_use is for acting garbage
collection. If number of bucket in use is not updated in time, garbage
collection won't start due to old bucket_in_use still beyond
CUTOFF_WRITEBACK_SYNC.

We may maintain an atomic counter per-cache set for dirty buckets, and
update it at some locations when allocating or reclaiming bucket. This
counter is unnecessary to be very accurate, just accurate enough for
should_writeback() working correctly.

I am also looking at it for a better solution as well.

Coly


>>
>> Coly
>>
>>
>>>
>>>
>>>
>>> 发件人:         Coly Li <i@coly.li>
>>> 收件人:         linux-block@vger.kernel.org, Tang Junhui
>>> <tang.junhui@zte.com.cn>,
>>> 抄送:        bcache@lists.ewheeler.net, linux-bcache@vger.kernel.org,
>>> hch@infradead.org, axboe@kernel.dk
>>> 日期:         2017/07/11 13:06
>>> 主题:        Re: [PATCH 12/19] bcache: update bucket_in_use periodically
>>> 发件人:        linux-bcache-owner@vger.kernel.org
>>> ------------------------------------------------------------------------
>>>
>>>
>>>
>>> On 2017/7/1 上午4:43, bcache@lists.ewheeler.net wrote:
>>>> From: Tang Junhui <tang.junhui@zte.com.cn>
>>>>
>>>> bucket_in_use is updated in gc thread which triggered by invalidating or
>>>> writing sectors_to_gc dirty data, It's been too long, Therefore, when we
>>>> use it to compare with the threshold, it is often not timely, which leads
>>>> to inaccurate judgment and often results in bucket depletion.
>>>>
>>>> Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
>>>> ---
>>>>  drivers/md/bcache/btree.c | 29 +++++++++++++++++++++++++++--
>>>>  1 file changed, 27 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
>>>> index 866dcf7..77aa20b 100644
>>>> --- a/drivers/md/bcache/btree.c
>>>> +++ b/drivers/md/bcache/btree.c
>>>> @@ -90,6 +90,8 @@
>>>>  #define MAX_NEED_GC                                  64
>>>>  #define MAX_SAVE_PRIO                                  72
>>>>  
>>>> +#define GC_THREAD_TIMEOUT_MS                 (30 * 1000)
>>>> +
>>>>  #define PTR_DIRTY_BIT                                  (((uint64_t) 1
>>> << 36))
>>>>  
>>>>  #define PTR_HASH(c, k)                                              
>>>                                                                         \
>>>> @@ -1760,6 +1762,23 @@ static void bch_btree_gc(struct cache_set *c)
>>>>                   bch_moving_gc(c);
>>>>  }
>>>>  
>>>> +void bch_update_bucket_in_use(struct cache_set *c)
>>>> +{
>>>> +                 struct cache *ca;
>>>> +                 struct bucket *b;
>>>> +                 unsigned i;
>>>> +                 size_t available = 0;
>>>> +
>>>> +                 for_each_cache(ca, c, i) {
>>>> +                                  for_each_bucket(b, ca) {
>>>> +                                                   if (!GC_MARK(b) ||
>>> GC_MARK(b) == GC_MARK_RECLAIMABLE)
>>>> +                                                                  
>>>  available++;
>>>> +                                  }
>>>> +                 }
>>>> +
>>>
>>> bucket_lock of cache set should be held before accessing buckets.
>>>
>>>
>>>> +                 c->gc_stats.in_use = (c->nbuckets - available) * 100
>>> / c->nbuckets;
>>>> +}
>>>> +
>>>>  static bool gc_should_run(struct cache_set *c)
>>>>  {
>>>>                   struct cache *ca;
>>>> @@ -1778,10 +1797,16 @@ static bool gc_should_run(struct cache_set *c)
>>>>  static int bch_gc_thread(void *arg)
>>>>  {
>>>>                   struct cache_set *c = arg;
>>>> +                 long  ret;
>>>> +                 unsigned long timeout =
>>> msecs_to_jiffies(GC_THREAD_TIMEOUT_MS);
>>>>  
>>>>                   while (1) {
>>>> -                                  wait_event_interruptible(c->gc_wait,
>>>> -                                                    
>>>  kthread_should_stop() || gc_should_run(c));
>>>> +                                  ret =
>>> wait_event_interruptible_timeout(c->gc_wait,
>>>> +                                                    
>>>  kthread_should_stop() || gc_should_run(c), timeout);
>>>> +                                  if (!ret) {
>>>> +                                                  
>>> bch_update_bucket_in_use(c);
>>>> +                                                   continue;
>>>
>>> A continue here will ignore status returned from kthread_should_stop(),
>>> which might not be expected behavior.
>>>
>>>
>>>> +                                  }
>>>>  
>>>>                                    if (kthread_should_stop())
>>>>                                                     break;
>>>>
>>>
>>> Iterating all buckets from the cache set requires bucket_lock to be
>>> held. Waiting for bucket_lock may take quite a long time for either
>>> bucket allocating code or bch_gc_thread(). What I concern is, this patch
>>> may introduce bucket allocation delay in period of GC_THREAD_TIMEOUT_MS.
>>>
>>> We need to find out a way to avoid such a performance regression.
>>>
>>> -- 
>>> Coly Li
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe linux-bcache" in
>>> the body of a message to majordomo@vger.kernel.org
>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>
>>>
>>
>>
>> -- 
>> Coly Li


-- 
Coly Li

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 18/19] bcache: silence static checker warning
  2017-06-30 20:43       ` [PATCH 18/19] bcache: silence static checker warning bcache
@ 2017-07-13  9:44         ` Coly Li
  0 siblings, 0 replies; 120+ messages in thread
From: Coly Li @ 2017-07-13  9:44 UTC (permalink / raw)
  To: bcache, linux-block; +Cc: linux-bcache, hch, axboe, Dan Carpenter

On 2017/7/1 上午4:43, bcache@lists.ewheeler.net wrote:
> From: Dan Carpenter <dan.carpenter@oracle.com>
> 
> In olden times, closure_return() used to have a hidden return built in.
> We removed the hidden return but forgot to add a new return here.  If
> "c" were NULL we would oops on the next line, but fortunately "c" is
> never NULL.  Let's just remove the if statement.
> 
> Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
> ---
>  drivers/md/bcache/super.c | 3 ---
>  1 file changed, 3 deletions(-)
> 
> diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
> index 24cb9b7..243391d 100644
> --- a/drivers/md/bcache/super.c
> +++ b/drivers/md/bcache/super.c
> @@ -1381,9 +1381,6 @@ static void cache_set_flush(struct closure *cl)
>  	struct btree *b;
>  	unsigned i;
>  
> -	if (!c)
> -		closure_return(cl);
> -
>  	bch_cache_accounting_destroy(&c->accounting);
>  
>  	kobject_put(&c->internal);
> 
Agree, cache_set_flush() is only called from a continue_at() in
__cache_set_unregister(). In this case, cl is always not NULL.

Reviewed-by: Coly Li <colyli@suse.de>

Thanks.

-- 
Coly Li

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 16/19] bcache: increase the number of open buckets
  2017-06-30 20:43         ` bcache
  (?)
@ 2017-07-13  9:56         ` Coly Li
  -1 siblings, 0 replies; 120+ messages in thread
From: Coly Li @ 2017-07-13  9:56 UTC (permalink / raw)
  To: bcache, linux-block; +Cc: linux-bcache, hch, axboe, Tang Junhui

On 2017/7/1 上午4:43, bcache@lists.ewheeler.net wrote:
> From: Tang Junhui <tang.junhui@zte.com.cn>
> 
> In currently, we only alloc 6 open buckets for each cache set,
> but in usually, we always attach about 10 or so backend devices for
> each cache set, and the each bcache device are always accessed by
> about 10 or so threads in top application layer. So 6 open buckets
> are too few, It has led to that each of the same thread write data
> to different buckets, which would cause low efficiency write-back,
> and also cause buckets inefficient, and would be Very easy to run
> out of.
> 
> I add debug message in bch_open_buckets_alloc() to print alloc bucket
> info, and test with ten bcache devices with a cache set, and each
> bcache device is accessed by ten threads.
> 
> From the debug message, we can see that, after the modification, One
> bucket is more likely to assign to the same thread, and the data from
> the same thread are more likely to write the same bucket. Usually the
> same thread always write/read the same backend device, so it is good
> for write-back and also promote the usage efficiency of buckets.
> 
> Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>

Nice catch for performance !

Reviewed-by: Coly Li <colyli@suse.de>

Thanks.


> ---
>  drivers/md/bcache/alloc.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
> index ca4abe1..cacbe2d 100644
> --- a/drivers/md/bcache/alloc.c
> +++ b/drivers/md/bcache/alloc.c
> @@ -68,6 +68,8 @@
>  #include <linux/random.h>
>  #include <trace/events/bcache.h>
>  
> +#define MAX_OPEN_BUCKETS 128
> +
>  /* Bucket heap / gen */
>  
>  uint8_t bch_inc_gen(struct cache *ca, struct bucket *b)
> @@ -671,7 +673,7 @@ int bch_open_buckets_alloc(struct cache_set *c)
>  
>  	spin_lock_init(&c->data_bucket_lock);
>  
> -	for (i = 0; i < 6; i++) {
> +	for (i = 0; i < MAX_OPEN_BUCKETS; i++) {
>  		struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL);
>  		if (!b)
>  			return -ENOMEM;
> 


-- 
Coly Li

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PULL] bcache updates based on git.kernel.dk/linux-block:for-next
  2017-05-25 19:10 [PULL] bcache updates based on git.kernel.dk/linux-block:for-next Eric Wheeler
  2017-06-28 23:06   ` Eric Wheeler
@ 2017-07-14 11:40 ` Eddie Chapman
  2017-07-14 15:07   ` Coly Li
  1 sibling, 1 reply; 120+ messages in thread
From: Eddie Chapman @ 2017-07-14 11:40 UTC (permalink / raw)
  To: Eric Wheeler, Jens Axboe
  Cc: linux-bcache, Jan Kara, stable, tang.junhui, Kent Overstreet,
	Coly Li, Stefan Bader, Liang Chen, nix

On 25/05/17 20:10, Eric Wheeler wrote:
> Hi Jens,
> 
> Please pull these updates and bugfixes from the bcache community when you
> have a minute.  If you need a rebase against something else then please
> let me know and I would be happy to update for you.
> 
> Thank you for your help!

<snip>

Hi all,

(replying to all but as a subscriber to stable only)

I'm not a kernel coder but have several 4.4 (kernel.org stable series) 
servers using bcache, so I'd love to see (possibly some of?) these in 
4.4 if they are relevant and apply without any significant work needed.

This series was CC'd to stable but I don't see any info of how far back 
any of them might be applicable, if at all.

If any of you guys are able to give a hint with this series just along 
the lines of "this one is/is not applicable to 4.4" then I'm happy to 
apply them, resolve any simple context issues, use, and report back with 
clean patches.

thanks,
Eddie

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PULL] bcache updates based on git.kernel.dk/linux-block:for-next
  2017-07-14 11:40 ` [PULL] bcache updates based on git.kernel.dk/linux-block:for-next Eddie Chapman
@ 2017-07-14 15:07   ` Coly Li
  2017-07-14 17:33     ` Eddie Chapman
  0 siblings, 1 reply; 120+ messages in thread
From: Coly Li @ 2017-07-14 15:07 UTC (permalink / raw)
  To: Eddie Chapman; +Cc: Eric Wheeler, linux-bcache, tang.junhui

On 2017/7/14 下午7:40, Eddie Chapman wrote:
> On 25/05/17 20:10, Eric Wheeler wrote:
>> Hi Jens,
>>
>> Please pull these updates and bugfixes from the bcache community when you
>> have a minute.  If you need a rebase against something else then please
>> let me know and I would be happy to update for you.
>>
>> Thank you for your help!
> 
> <snip>
> 
> Hi all,
> 
> (replying to all but as a subscriber to stable only)
> 
> I'm not a kernel coder but have several 4.4 (kernel.org stable series)
> servers using bcache, so I'd love to see (possibly some of?) these in
> 4.4 if they are relevant and apply without any significant work needed.
> 
> This series was CC'd to stable but I don't see any info of how far back
> any of them might be applicable, if at all.
> 
> If any of you guys are able to give a hint with this series just along
> the lines of "this one is/is not applicable to 4.4" then I'm happy to
> apply them, resolve any simple context issues, use, and report back with
> clean patches.

(remove many unnecessary email receivers)

Hi Eddie,

I think some patches from Junhui Tang are important stable fixes.
After all the patches get reviewed, and accepted in mainline kernel, you
may find them in 4.4 stable tree (any way it won't be very soon for
these fixes show up in stable tree).

Thanks.

-- 
Coly Li

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PULL] bcache updates based on git.kernel.dk/linux-block:for-next
  2017-07-14 15:07   ` Coly Li
@ 2017-07-14 17:33     ` Eddie Chapman
       [not found]       ` <OF92BA0158.87BDF9E3-ON4825815E.000736BF-4825815E.000833F7@zte.com.cn>
  0 siblings, 1 reply; 120+ messages in thread
From: Eddie Chapman @ 2017-07-14 17:33 UTC (permalink / raw)
  To: Coly Li; +Cc: Eric Wheeler, linux-bcache, stable, tang.junhui

On 14/07/17 16:07, Coly Li wrote:
> On 2017/7/14 下午7:40, Eddie Chapman wrote:
>> On 25/05/17 20:10, Eric Wheeler wrote:
>>> Hi Jens,
>>>
>>> Please pull these updates and bugfixes from the bcache community when you
>>> have a minute.  If you need a rebase against something else then please
>>> let me know and I would be happy to update for you.
>>>
>>> Thank you for your help!
>>
>> <snip>
>>
>> Hi all,
>>
>> (replying to all but as a subscriber to stable only)
>>
>> I'm not a kernel coder but have several 4.4 (kernel.org stable series)
>> servers using bcache, so I'd love to see (possibly some of?) these in
>> 4.4 if they are relevant and apply without any significant work needed.
>>
>> This series was CC'd to stable but I don't see any info of how far back
>> any of them might be applicable, if at all.
>>
>> If any of you guys are able to give a hint with this series just along
>> the lines of "this one is/is not applicable to 4.4" then I'm happy to
>> apply them, resolve any simple context issues, use, and report back with
>> clean patches.
> 
> (remove many unnecessary email receivers)

I'm re-adding the stable list to CC as we're discussing a stable kernel. 
Hope that's OK.

> Hi Eddie,
> 
> I think some patches from Junhui Tang are important stable fixes.
> After all the patches get reviewed, and accepted in mainline kernel, you
> may find them in 4.4 stable tree (any way it won't be very soon for
> these fixes show up in stable tree).
> 
> Thanks.

Thanks for your reply Coly.  You're right, forgot about that. Before 
they can go in 4.4 or any other stable kernel they must be in Linus' tree.

Of the 9 patches CC'd to stable, it looks to me that so far these 2 have 
subsequently received approval by you plus at least 1 other person other 
than the author (e.g. Christoph Hellwig):

- fix sequential large write IO bypass
- do not subtract sectors_to_gc for bypassed IO

The first one looks particularly important to me and Kent himself has 
also reviewed it.

This one also has not received any objections yet and you mentioned you 
discussed with the author and both concluded it is correct:

- Subtract dirty sectors of thin flash from cache_sectors in calculating 
writeback rate

So for me these 3 by Junhui Tang seem "safe" enough that I will take a 
little risk and try them already on 4.4 on my own machines (I'm guessing 
they are likely relevant to 4.4 but of course I'll check if they apply). 
I'll report back, FWIW.

If I'm brave (foolish) enough I might go through mainline bcache commits 
since 4.4 and see if there are any other goodies to try out with 4.4. Of 
course if anyone has any in particular to suggest to me to try, please 
do! If I do, I'll report back anything that seems to have worked.

Thanks again!
Eddie

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 15/19] bcache: fix issue of writeback rate at minimum 1 key per second
  2017-06-30 20:43       ` [PATCH 15/19] bcache: fix issue of writeback rate at minimum 1 key per second bcache
@ 2017-07-16 10:04         ` Coly Li
  2017-10-27 19:07           ` Eric Wheeler
  0 siblings, 1 reply; 120+ messages in thread
From: Coly Li @ 2017-07-16 10:04 UTC (permalink / raw)
  To: bcache, Tang Junhui; +Cc: linux-block, linux-bcache, hch, axboe

On 2017/7/1 上午4:43, bcache@lists.ewheeler.net wrote:
> From: Tang Junhui <tang.junhui@zte.com.cn>
> 
> When there is not enough dirty data in writeback cache,
> writeback rate is at minimum 1 key per second
> util all dirty data to be cleaned, it is inefficiency,
> and also causes waste of energy;

Hi Junhui and Eric,

What: /sys/block/<disk>/bcache/writeback_percent
Description:
      For backing devices: If nonzero, writeback from cache to
      backing device only takes place when more than this percentage
      of the cache is used, allowing more write coalescing to take
      place and reducing total number of writes sent to the backing
      device. Integer between 0 and 40.

I see above text from Documentation/ABI/testing/sysfs-block-bcache (I
know this document is quite old), it seems if "not enough" means dirty
data percentage is less than writeback_percent, bcache should not
perform writeback I/O. But in __update_writeback_rate(),
writeback_rate.rate is clamped in [1, NSEC_PER_MSEC]. It seems in PD
controller code of __update_writeback_rate(), writeback_percent is only
used to calculate dirty target number, its another functionality as
writeback threshold is not handled here.

> 
> in this patch, When there is not enough dirty data,
> let the writeback rate to be 0, and writeback re-schedule
> in bch_writeback_thread() periodically with schedule_timeout(),
> the behaviors are as follows :
> 
> 1) If no dirty data have been read into dc->writeback_keys,
> goto step 2), otherwise keep writing these dirty data to
> back-end device at 1 key per second, until all these dirty data
> write over, then goto step 2).
> 
> 2) Loop in bch_writeback_thread() to check if there is enough
> dirty data for writeback. if there is not enough diry data for
> writing, then sleep 10 seconds, otherwise, write dirty data to
> back-end device.

Bcache uses a Proportion-Differentiation Controller to control writeback
rate. When dirty data is far from target, writeback rate is higher; when
dirty data is close to target, writeback rate is slower. The advantage
of PD controller here is, when regular I/O and writeback I/O happens in
same time,
- When there are a lot of dirty data, writeback I/O can have more chance
to write them back to cached device, which in turns has positive impact
to regular I/O.
- When dirty data is decreased and close to target dirty number, less
writeback I/O can help regular I/O has better throughput and latency.

The root cause of 1 key per second is, the PD controller is designed for
better I/O performance, not less energy consumption. When the existing
dirty data gets close to target dirty number, the PD controller chooses
to use longer writeback time to make a better regular I/O performance.
If it is designed for less energy consumption, it should keep the
writeback rate in a high level and finish writing back all dirty data as
soon as possible.

This patch may introduce an unexpected behavior of dirty data writeback
throughput, when regular write I/O and writeback I/O happen in same
time. In this case, dirty data number may shake up and down around
target dirty number, it is possible that change (the variable in
__update_writeback_rate()) is a minus value, and the result of
dc->writeback_rate.rate + change happens to be 0. This patch changes the
clamp range of writeback_rate.rate to [0, NSEC_PER_MSEC], so
writeback_rate.rate can be possible to be 0. And in bch_next_delay() if
d->rate is zero, the write back I/O will be delayed to now +
NSEC_PER_SEC. When there is no regular I/O it works well, but when there
is regular I/O, this longer delay may cause more dirty data piled in
cache device, and PD controller cannot generate a stable writeback rate.
This is not an expected behavior for the writeback rate PD controller.

Another method to fix might be,
1) define a sysfs to define writeback_rate with max/dynamic option.
2) dynamic writeback_rate as default
3) when max is set, in __update_writeback_rate() assign NSEC_PER_MSEC to
writeback_rate.rate
4) in bch_writeback_thread(), if no writeback I/O on fly, and dirty data
does not reach dc->writeback_percent, schedule out.
5) if writeback is necessary then do it in max rate and finish it as
soon as possible, to save laptop energy.

The above method might be helpful for energy saving as well (perform dirty
data write back in batch), and does not change default PD controller
behavior.

Just for your reference. Or if you are too busy to look at it, I can try
to compose a patch for review.

Coly

> 
> Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
> ---
>  drivers/md/bcache/util.c      |  9 ++++++++-
>  drivers/md/bcache/writeback.c | 11 +++++++----
>  2 files changed, 15 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
> index 8c3a938..49dcf09 100644
> --- a/drivers/md/bcache/util.c
> +++ b/drivers/md/bcache/util.c
> @@ -210,7 +210,14 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
>  {
>  	uint64_t now = local_clock();
>  
> -	d->next += div_u64(done * NSEC_PER_SEC, d->rate);
> +	/*
> +	  if d->rate is zero, write the left dirty data
> +	  at the speed of one key per second
> +	*/
> +	if(!d->rate)
> +		d->next = now + NSEC_PER_SEC;
> +	else
> +		d->next += div_u64(done * NSEC_PER_SEC, d->rate);
>  
>  	if (time_before64(now + NSEC_PER_SEC, d->next))
>  		d->next = now + NSEC_PER_SEC;
> diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
> index 25289e4..4104eaa 100644
> --- a/drivers/md/bcache/writeback.c
> +++ b/drivers/md/bcache/writeback.c
> @@ -16,6 +16,8 @@
>  #include <linux/sched/clock.h>
>  #include <trace/events/bcache.h>
>  
> +#define WRITE_BACK_WAIT_CYCLE		10 * HZ
> +
>  /* Rate limiting */
>  
>  static void __update_writeback_rate(struct cached_dev *dc)
> @@ -55,13 +57,14 @@ static void __update_writeback_rate(struct cached_dev *dc)
>  
>  	/* Don't increase writeback rate if the device isn't keeping up */
>  	if (change > 0 &&
> +	    dc->writeback_rate.rate >0 &&
>  	    time_after64(local_clock(),
>  			 dc->writeback_rate.next + NSEC_PER_MSEC))
>  		change = 0;
>  
>  	dc->writeback_rate.rate =
>  		clamp_t(int64_t, (int64_t) dc->writeback_rate.rate + change,
> -			1, NSEC_PER_MSEC);
> +			0, NSEC_PER_MSEC);
>  
>  	dc->writeback_rate_proportional = proportional;
>  	dc->writeback_rate_derivative = derivative;
> @@ -420,15 +423,15 @@ static int bch_writeback_thread(void *arg)
>  	while (!kthread_should_stop()) {
>  		down_write(&dc->writeback_lock);
>  		if (!atomic_read(&dc->has_dirty) ||
> -		    (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) &&
> -		     !dc->writeback_running)) {
> +		    ((!dc->writeback_rate.rate || !dc->writeback_running) &&
> +		      !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))) {
>  			up_write(&dc->writeback_lock);
>  			set_current_state(TASK_INTERRUPTIBLE);
>  
>  			if (kthread_should_stop())
>  				return 0;
>  
> -			schedule();
> +			schedule_timeout(WRITE_BACK_WAIT_CYCLE);
>  			continue;
>  		}
>  
> 


-- 
Coly Li

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PULL] bcache updates based on git.kernel.dk/linux-block:for-next
       [not found]       ` <OF92BA0158.87BDF9E3-ON4825815E.000736BF-4825815E.000833F7@zte.com.cn>
@ 2017-07-18 18:24         ` Eddie Chapman
  2017-07-18 18:31           ` Eddie Chapman
  0 siblings, 1 reply; 120+ messages in thread
From: Eddie Chapman @ 2017-07-18 18:24 UTC (permalink / raw)
  To: tang.junhui; +Cc: Eric Wheeler, Coly Li, linux-bcache, stable

On 15/07/17 02:29, tang.junhui@zte.com.cn wrote:
> Hello Eddie
> 
> One thing, please see below:
> 
> 
>>On 14/07/17 16:07, Coly Li wrote:
>>> On 2017/7/14 下午7:40, Eddie Chapman wrote:
>>>> On 25/05/17 20:10, Eric Wheeler wrote:
>>>>> Hi Jens,
>>>>>
>>>>> Please pull these updates and bugfixes from the bcache community when you
>>>>> have a minute.  If you need a rebase against something else then please
>>>>> let me know and I would be happy to update for you.
>>>>>
>>>>> Thank you for your help!
>>>>
>>>> <snip>
>>>>
>>>> Hi all,
>>>>
>>>> (replying to all but as a subscriber to stable only)
>>>>
>>>> I'm not a kernel coder but have several 4.4 (kernel.org stable series)
>>>> servers using bcache, so I'd love to see (possibly some of?) these in
>>>> 4.4 if they are relevant and apply without any significant work needed.
>>>>
>>>> This series was CC'd to stable but I don't see any info of how far back
>>>> any of them might be applicable, if at all.
>>>>
>>>> If any of you guys are able to give a hint with this series just along
>>>> the lines of "this one is/is not applicable to 4.4" then I'm happy to
>>>> apply them, resolve any simple context issues, use, and report back with
>>>> clean patches.
>>>
>>> (remove many unnecessary email receivers)
>>
>>I'm re-adding the stable list to CC as we're discussing a stable kernel.
>>Hope that's OK.
>>
>>> Hi Eddie,
>>>
>>> I think some patches from Junhui Tang are important stable fixes.
>>> After all the patches get reviewed, and accepted in mainline kernel, you
>>> may find them in 4.4 stable tree (any way it won't be very soon for
>>> these fixes show up in stable tree).
>>>
>>> Thanks.
>>
>>Thanks for your reply Coly.  You're right, forgot about that. Before
>>they can go in 4.4 or any other stable kernel they must be in Linus' tree.
>>
>>Of the 9 patches CC'd to stable, it looks to me that so far these 2 have
>>subsequently received approval by you plus at least 1 other person other
>>than the author (e.g. Christoph Hellwig):
>>
>>- fix sequential large write IO bypass
>>- do not subtract sectors_to_gc for bypassed IO
>>
>>The first one looks particularly important to me and Kent himself has
>>also reviewed it.
>>
>>This one also has not received any objections yet and you mentioned you
>>discussed with the author and both concluded it is correct:
>>
>>- Subtract dirty sectors of thin flash from cache_sectors in calculating
>>writeback rate
> 
> If you want to apply this patch, please also apply this patch to make
> dirty sectors of thin flashInitializing correctly. Otherwise
> it will subtract a wrong dirty sectors of thin flash.
> 
> -bcache: initialize stripe_sectors_dirty correctly for thin flash device

Thank you very much Tang for pointing this out.

I applied all 4 patches on top of vanilla kernel.org 4.4.77 . So they are:

- bcache: fix sequential large write IO bypass
- bcache: do not subtract sectors_to_gc for bypassed IO
- bcache: Subtract dirty sectors of thin flash from cache_sectors in calculating writeback rate
- bcache: initialize stripe_sectors_dirty correctly for thin flash device

They applied without any issues just some fuzz.
There were no issues during build, and I've been running 4.4.77 with 
these 4 patches on one server using bcache for 3 days now without any 
problems or any unusual logs from bcache in dmesg.

I will send another mail separately, as a reply to this one, with the 4 
patches with context adjusted, generated from patched 4.4.77 using diff -up, 
in case they are any use to anyone. Not sure if they are any use to you guys 
though as they are not generated with git-send-email, but anyway, FWIW.

Thanks,
Eddie

>>So for me these 3 by Junhui Tang seem "safe" enough that I will take a
>>little risk and try them already on 4.4 on my own machines (I'm guessing
>>they are likely relevant to 4.4 but of course I'll check if they apply).
>>I'll report back, FWIW.
>>
>>If I'm brave (foolish) enough I might go through mainline bcache commits
>>since 4.4 and see if there are any other goodies to try out with 4.4. Of
>>course if anyone has any in particular to suggest to me to try, please
>>do! If I do, I'll report back anything that seems to have worked.
>>
>>Thanks again!
>>Eddie
>>--
> 
> Thanks to use it.
> 
> Tang Junhui
> 

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PULL] bcache updates based on git.kernel.dk/linux-block:for-next
  2017-07-18 18:24         ` Eddie Chapman
@ 2017-07-18 18:31           ` Eddie Chapman
  2017-07-18 20:06             ` Greg KH
  0 siblings, 1 reply; 120+ messages in thread
From: Eddie Chapman @ 2017-07-18 18:31 UTC (permalink / raw)
  To: tang.junhui; +Cc: Eric Wheeler, Coly Li, linux-bcache, stable

As per previous mail, below are the 4 patches discussed, generated with 
diff -up, from kernel.org 4.4.77, so with correct context for 4.4. Just 
in case they are useful for anyone.

---
From: Tang Junhui <tang.junhui@zte.com.cn>

Sequential write IOs were tested with bs=1M by FIO in writeback cache
mode, these IOs were expected to be bypassed, but actually they did not.
We debug the code, and find in check_should_bypass():
    if (!congested &&
        mode == CACHE_MODE_WRITEBACK &&
        op_is_write(bio_op(bio)) &&
        (bio->bi_opf & REQ_SYNC))
        goto rescale
that means, If in writeback mode, a write IO with REQ_SYNC flag will not
be bypassed though it is a sequential large IO, It's not a correct thing
to do actually, so this patch remove these codes.

Signed-off-by: tang.junhui <tang.junhui@zte.com.cn>
Reviewed-by: Kent Overstreet <kent.overstreet@gmail.com>
Reviewed-by: Eric Wheeler <bcache@linux.ewheeler.net>
Cc: stable@vger.kernel.org
---

--- a/drivers/md/bcache/request.c	2017-07-18 18:09:38.686156583 +0100
+++ b/drivers/md/bcache/request.c	2017-07-18 18:09:44.596167542 +0100
@@ -400,12 +400,6 @@ static bool check_should_bypass(struct c
 	if (!congested && !dc->sequential_cutoff)
 		goto rescale;
 
-	if (!congested &&
-	    mode == CACHE_MODE_WRITEBACK &&
-	    (bio->bi_rw & REQ_WRITE) &&
-	    (bio->bi_rw & REQ_SYNC))
-		goto rescale;
-
 	spin_lock(&dc->io_lock);
 
 	hlist_for_each_entry(i, iohash(dc, bio->bi_iter.bi_sector), hash)


From: Tang Junhui <tang.junhui@zte.com.cn>

Since bypassed IOs use no bucket, so do not subtract sectors_to_gc to
trigger gc thread.

Signed-off-by: tang.junhui <tang.junhui@zte.com.cn>
Reviewed-by: Eric Wheeler <bcache@linux.ewheeler.net>
Cc: stable@vger.kernel.org
---

--- a/drivers/md/bcache/request.c	2017-07-18 18:18:43.937169337 +0100
+++ b/drivers/md/bcache/request.c	2017-07-18 18:21:45.637507148 +0100
@@ -196,12 +196,12 @@ static void bch_data_insert_start(struct
 	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
 	struct bio *bio = op->bio, *n;
 
-	if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0)
-		wake_up_gc(op->c);
-
 	if (op->bypass)
 		return bch_data_invalidate(cl);
 
+	if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0)
+		wake_up_gc(op->c);
+
 	/*
 	 * Journal writes are marked REQ_FLUSH; if the original write was a
 	 * flush, it'll wait on the journal write.


From: Tang Junhui <tang.junhui@zte.com.cn>

Thin flash device does not initialize stripe_sectors_dirty correctly, this
patch fixes this issue.

Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
Cc: stable@vger.kernel.org
---

--- a/drivers/md/bcache/super.c	2017-07-18 18:31:38.968611871 +0100
+++ b/drivers/md/bcache/super.c	2017-07-18 18:32:36.078718382 +0100
@@ -1023,7 +1023,7 @@ int bch_cached_dev_attach(struct cached_
 	}
 
 	if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
-		bch_sectors_dirty_init(dc);
+		bch_sectors_dirty_init(&dc->disk);
 		atomic_set(&dc->has_dirty, 1);
 		atomic_inc(&dc->count);
 		bch_writeback_queue(dc);
@@ -1227,6 +1227,7 @@ static int flash_dev_run(struct cache_se
 		goto err;
 
 	bcache_device_attach(d, c, u - c->uuids);
+	bch_sectors_dirty_init(d);
 	bch_flash_dev_request_init(d);
 	add_disk(d->disk);
 
--- a/drivers/md/bcache/writeback.c	2017-07-18 18:31:50.718633782 +0100
+++ b/drivers/md/bcache/writeback.c	2017-07-18 18:32:36.078718382 +0100
@@ -488,17 +488,17 @@ static int sectors_dirty_init_fn(struct
 	return MAP_CONTINUE;
 }
 
-void bch_sectors_dirty_init(struct cached_dev *dc)
+void bch_sectors_dirty_init(struct bcache_device *d)
 {
 	struct sectors_dirty_init op;
 
 	bch_btree_op_init(&op.op, -1);
-	op.inode = dc->disk.id;
+	op.inode = d->id;
 
-	bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0),
+	bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0),
 			   sectors_dirty_init_fn, 0);
 
-	dc->disk.sectors_dirty_last = bcache_dev_sectors_dirty(&dc->disk);
+	d->sectors_dirty_last = bcache_dev_sectors_dirty(d);
 }
 
 void bch_cached_dev_writeback_init(struct cached_dev *dc)
--- a/drivers/md/bcache/writeback.h	2017-07-18 18:32:00.588652189 +0100
+++ b/drivers/md/bcache/writeback.h	2017-07-18 18:32:36.078718382 +0100
@@ -85,7 +85,7 @@ static inline void bch_writeback_add(str
 
 void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int);
 
-void bch_sectors_dirty_init(struct cached_dev *dc);
+void bch_sectors_dirty_init(struct bcache_device *);
 void bch_cached_dev_writeback_init(struct cached_dev *);
 int bch_cached_dev_writeback_start(struct cached_dev *);


From: Tang Junhui <tang.junhui@zte.com.cn>

Since dirty sectors of thin flash cannot be used to cache data for backend
device, so we should subtract it in calculating writeback rate.

Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
Cc: stable@vger.kernel.org
---

--- a/drivers/md/bcache/writeback.c	2017-07-18 18:38:46.929410077 +0100
+++ b/drivers/md/bcache/writeback.c	2017-07-18 18:39:00.979436233 +0100
@@ -21,7 +21,7 @@
 static void __update_writeback_rate(struct cached_dev *dc)
 {
 	struct cache_set *c = dc->disk.c;
-	uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size;
+	uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size - bcache_flash_devs_sectors_dirty(c);
 	uint64_t cache_dirty_target =
 		div_u64(cache_sectors * dc->writeback_percent, 100);
 
--- a/drivers/md/bcache/writeback.h	2017-07-18 18:38:50.489416705 +0100
+++ b/drivers/md/bcache/writeback.h	2017-07-18 18:39:00.979436233 +0100
@@ -14,6 +14,25 @@ static inline uint64_t bcache_dev_sector
 	return ret;
 }
 
+static inline uint64_t  bcache_flash_devs_sectors_dirty(struct cache_set *c)
+{
+	uint64_t i, ret = 0;
+
+	mutex_lock(&bch_register_lock);
+
+	for (i = 0; i < c->nr_uuids; i++) {
+		struct bcache_device *d = c->devices[i];
+
+		if (!d || !UUID_FLASH_ONLY(&c->uuids[i]))
+			continue;
+	   ret += bcache_dev_sectors_dirty(d);
+	}
+
+	mutex_unlock(&bch_register_lock);
+
+	return ret;
+}
+
 static inline unsigned offset_to_stripe(struct bcache_device *d,
 					uint64_t offset)
 {

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PULL] bcache updates based on git.kernel.dk/linux-block:for-next
  2017-07-18 18:31           ` Eddie Chapman
@ 2017-07-18 20:06             ` Greg KH
  2017-07-18 20:36               ` Eddie Chapman
  0 siblings, 1 reply; 120+ messages in thread
From: Greg KH @ 2017-07-18 20:06 UTC (permalink / raw)
  To: Eddie Chapman; +Cc: tang.junhui, Eric Wheeler, Coly Li, linux-bcache, stable

On Tue, Jul 18, 2017 at 07:31:35PM +0100, Eddie Chapman wrote:
> As per previous mail, below are the 4 patches discussed, generated with 
> diff -up, from kernel.org 4.4.77, so with correct context for 4.4. Just 
> in case they are useful for anyone.

<formletter>

This is not the correct way to submit patches for inclusion in the
stable kernel tree.  Please read:
    https://www.kernel.org/doc/html/latest/process/stable-kernel-rules.html
for how to do this properly.

</formletter>

Note, I can't do anything with patches until they are in Linus's tree.
Please work to get them accepted there, then we can worry about
backporting them to older kernels if they are needed.

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PULL] bcache updates based on git.kernel.dk/linux-block:for-next
  2017-07-18 20:06             ` Greg KH
@ 2017-07-18 20:36               ` Eddie Chapman
  0 siblings, 0 replies; 120+ messages in thread
From: Eddie Chapman @ 2017-07-18 20:36 UTC (permalink / raw)
  To: Greg KH; +Cc: tang.junhui, Eric Wheeler, Coly Li, linux-bcache, stable

Hi Greg,

On 18/07/17 21:06, Greg KH wrote:
> On Tue, Jul 18, 2017 at 07:31:35PM +0100, Eddie Chapman wrote:
>> As per previous mail, below are the 4 patches discussed, generated with 
>> diff -up, from kernel.org 4.4.77, so with correct context for 4.4. Just 
>> in case they are useful for anyone.
> 
> <formletter>
> 
> This is not the correct way to submit patches for inclusion in the
> stable kernel tree.  Please read:
>     https://www.kernel.org/doc/html/latest/process/stable-kernel-rules.html
> for how to do this properly.
> 
> </formletter>

I know, I certainly did not want to submit these patches for inclusion in the stable kernel yet! As I said, I only sent them to Tang and others in case they were useful. I'm just trying to help with testing.

> Note, I can't do anything with patches until they are in Linus's tree.
> Please work to get them accepted there, then we can worry about
> backporting them to older kernels if they are needed.

Hold your fire! :-) I know, I said exactly the same myself earlier in this thread, that they must be in Linus' tree first.

Sorry if I gave the wrong idea. Maybe I should have removed stable@ from CC? But I thought that would not be good in case someone found the thread in future and wanted to know.

Eddie

> 
> thanks,
> 
> greg k-h

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 17/19] bcache: fix for gc and write-back race
  2017-06-30 20:43       ` [PATCH 17/19] bcache: fix for gc and write-back race bcache
@ 2017-08-03 16:20         ` Coly Li
  0 siblings, 0 replies; 120+ messages in thread
From: Coly Li @ 2017-08-03 16:20 UTC (permalink / raw)
  To: bcache, Tang Junhui; +Cc: linux-block, linux-bcache, hch, axboe, stable

On 2017/7/1 上午4:43, bcache@lists.ewheeler.net wrote:
> From: Tang Junhui <tang.junhui@zte.com.cn>
> 
> gc and write-back get raced (see the email "bcache get stucked" I sended
> before):
> gc thread						write-back thread
> |							|bch_writeback_thread()
> |bch_gc_thread()					|
> |							|==>read_dirty()
> |==>bch_btree_gc()					|
> |==>btree_root() //get btree root			|
> |			node write locker		|
> |==>bch_btree_gc_root()					|
> |							|==>read_dirty_submit()
> |							|==>write_dirty()
> |							|==>continue_at(cl, write_dirty_finish, system_wq);
> |							|==>write_dirty_finish()//excute in system_wq
> |							|==>bch_btree_insert()
> |							|==>bch_btree_map_leaf_nodes()
> |							|==>__bch_btree_map_nodes()
> |							|==>btree_root //try to get btree root node read lock
> |							|-----stuck here
> |==>bch_btree_set_root()				|
> |==>bch_journal_meta()					|
> |==>bch_journal()					|
> |==>journal_try_write()					|
> |==>journal_write_unlocked() //journal_full(&c->journal) condition satisfied
> |==>continue_at(cl, journal_write, system_wq); //try to excute journal_write in system_wq
> |					//but work queue is excuting write_dirty_finish()
> |==>closure_sync(); //wait journal_write execute over and wake up gc,
> |			--stuck here
> |==>release root node write locker
> 
> This patch alloc a separate work-queue for write-back thread to avoid such
> race.
> 
> Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
> Cc: stable@vger.kernel.org

Add a per-cached device work queue is a good idea, it's OK to me.

Acked-by: Coly Li <colyli@suse.de>

Thanks.

Coly

> ---
>  drivers/md/bcache/bcache.h    | 1 +
>  drivers/md/bcache/super.c     | 2 ++
>  drivers/md/bcache/writeback.c | 8 ++++++--
>  3 files changed, 9 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
> index 44123e4..deb0a6c 100644
> --- a/drivers/md/bcache/bcache.h
> +++ b/drivers/md/bcache/bcache.h
> @@ -333,6 +333,7 @@ struct cached_dev {
>  	/* Limit number of writeback bios in flight */
>  	struct semaphore	in_flight;
>  	struct task_struct	*writeback_thread;
> +	struct workqueue_struct	*writeback_write_wq;
>  
>  	struct keybuf		writeback_keys;
>  
> diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
> index e06641e..24cb9b7 100644
> --- a/drivers/md/bcache/super.c
> +++ b/drivers/md/bcache/super.c
> @@ -1063,6 +1063,8 @@ static void cached_dev_free(struct closure *cl)
>  	cancel_delayed_work_sync(&dc->writeback_rate_update);
>  	if (!IS_ERR_OR_NULL(dc->writeback_thread))
>  		kthread_stop(dc->writeback_thread);
> +	if (dc->writeback_write_wq)
> +		destroy_workqueue(dc->writeback_write_wq);
>  
>  	mutex_lock(&bch_register_lock);
>  
> diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
> index 4104eaa..4bc5daa 100644
> --- a/drivers/md/bcache/writeback.c
> +++ b/drivers/md/bcache/writeback.c
> @@ -189,7 +189,7 @@ static void write_dirty(struct closure *cl)
>  
>  	closure_bio_submit(&io->bio, cl);
>  
> -	continue_at(cl, write_dirty_finish, system_wq);
> +	continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq);
>  }
>  
>  static void read_dirty_endio(struct bio *bio)
> @@ -209,7 +209,7 @@ static void read_dirty_submit(struct closure *cl)
>  
>  	closure_bio_submit(&io->bio, cl);
>  
> -	continue_at(cl, write_dirty, system_wq);
> +	continue_at(cl, write_dirty, io->dc->writeback_write_wq);
>  }
>  
>  static void read_dirty(struct cached_dev *dc)
> @@ -527,6 +527,10 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
>  
>  int bch_cached_dev_writeback_start(struct cached_dev *dc)
>  {
> +	dc->writeback_write_wq = alloc_workqueue("bcache_writeback_wq", WQ_MEM_RECLAIM, 0);
> +	if (!dc->writeback_write_wq)
> +		return -ENOMEM;
> +
>  	dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
>  					      "bcache_writeback");
>  	if (IS_ERR(dc->writeback_thread))
> 

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 01/19] bcache: Fix leak of bdev reference
  2017-07-05 18:24       ` Christoph Hellwig
@ 2017-09-04 17:30         ` Coly Li
  2017-09-05  6:43           ` Christoph Hellwig
  0 siblings, 1 reply; 120+ messages in thread
From: Coly Li @ 2017-09-04 17:30 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: bcache, linux-block, linux-bcache, axboe, Jan Kara

On 2017/7/6 上午2:24, Christoph Hellwig wrote:
> On Fri, Jun 30, 2017 at 01:42:50PM -0700, bcache@lists.ewheeler.net wrote:
>> From: Jan Kara <jack@suse.cz>
>>
>> If blkdev_get_by_path() in register_bcache() fails, we try to lookup the
>> block device using lookup_bdev() to detect which situation we are in to
>> properly report error. However we never drop the reference returned to
>> us from lookup_bdev(). Fix that.
> 
> This look ok, but I think that whole chunk of code should just go
> away - adding a lookup_bdev and resulting mess just for a slightly
> different error message is just insane.

Hi Christoph,

When you mentioned "whole chunk of code", do you mean the following
block of code ?


1960         if (IS_ERR(bdev)) {
========= start of whole chunk of code ============
1961                 if (bdev == ERR_PTR(-EBUSY)) {
1962                         bdev = lookup_bdev(strim(path));
1963                         mutex_lock(&bch_register_lock);
1964                         if (!IS_ERR(bdev) && bch_is_open(bdev))
1965                                 err = "device already registered";
1966                         else
1967                                 err = "device busy";
1968                         mutex_unlock(&bch_register_lock);
1969                         if (!IS_ERR(bdev))
1970                                 bdput(bdev);
1971                         if (attr == &ksysfs_register_quiet)
1972                                 goto out;
1973                 }
========= end of whole chunk of code ============
1974                 goto err;
1975         }

I don't mind to remove it, just double check I don't misunderstand what
you meant.

Thanks.


-- 
Coly Li

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 01/19] bcache: Fix leak of bdev reference
  2017-09-04 17:30         ` Coly Li
@ 2017-09-05  6:43           ` Christoph Hellwig
  2017-09-05  6:55             ` Coly Li
  2017-09-06  5:25             ` Coly Li
  0 siblings, 2 replies; 120+ messages in thread
From: Christoph Hellwig @ 2017-09-05  6:43 UTC (permalink / raw)
  To: Coly Li
  Cc: Christoph Hellwig, bcache, linux-block, linux-bcache, axboe, Jan Kara

On Tue, Sep 05, 2017 at 01:30:04AM +0800, Coly Li wrote:
> 
> When you mentioned "whole chunk of code", do you mean the following
> block of code ?
> 
> 
> 1960         if (IS_ERR(bdev)) {
> ========= start of whole chunk of code ============
> 1961                 if (bdev == ERR_PTR(-EBUSY)) {
> 1962                         bdev = lookup_bdev(strim(path));
> 1963                         mutex_lock(&bch_register_lock);
> 1964                         if (!IS_ERR(bdev) && bch_is_open(bdev))
> 1965                                 err = "device already registered";
> 1966                         else
> 1967                                 err = "device busy";
> 1968                         mutex_unlock(&bch_register_lock);
> 1969                         if (!IS_ERR(bdev))
> 1970                                 bdput(bdev);
> 1971                         if (attr == &ksysfs_register_quiet)
> 1972                                 goto out;
> 1973                 }
> ========= end of whole chunk of code ============
> 1974                 goto err;
> 1975         }
> 
> I don't mind to remove it, just double check I don't misunderstand what
> you meant.

Yes, that's the problematic block.

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 01/19] bcache: Fix leak of bdev reference
  2017-09-05  6:43           ` Christoph Hellwig
@ 2017-09-05  6:55             ` Coly Li
  2017-09-06  5:25             ` Coly Li
  1 sibling, 0 replies; 120+ messages in thread
From: Coly Li @ 2017-09-05  6:55 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: bcache, linux-block, linux-bcache, axboe, Jan Kara

On 2017/9/5 下午2:43, Christoph Hellwig wrote:
> On Tue, Sep 05, 2017 at 01:30:04AM +0800, Coly Li wrote:
>>
>> When you mentioned "whole chunk of code", do you mean the following
>> block of code ?
>>
>>
>> 1960         if (IS_ERR(bdev)) {
>> ========= start of whole chunk of code ============
>> 1961                 if (bdev == ERR_PTR(-EBUSY)) {
>> 1962                         bdev = lookup_bdev(strim(path));
>> 1963                         mutex_lock(&bch_register_lock);
>> 1964                         if (!IS_ERR(bdev) && bch_is_open(bdev))
>> 1965                                 err = "device already registered";
>> 1966                         else
>> 1967                                 err = "device busy";
>> 1968                         mutex_unlock(&bch_register_lock);
>> 1969                         if (!IS_ERR(bdev))
>> 1970                                 bdput(bdev);
>> 1971                         if (attr == &ksysfs_register_quiet)
>> 1972                                 goto out;
>> 1973                 }
>> ========= end of whole chunk of code ============
>> 1974                 goto err;
>> 1975         }
>>
>> I don't mind to remove it, just double check I don't misunderstand what
>> you meant.
> 
> Yes, that's the problematic block.
> 
Understand, I will send out a patch, hopefully it can catch up 4.14
merge window.

Thanks for the hint.

-- 
Coly Li

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 01/19] bcache: Fix leak of bdev reference
  2017-09-05  6:43           ` Christoph Hellwig
  2017-09-05  6:55             ` Coly Li
@ 2017-09-06  5:25             ` Coly Li
  1 sibling, 0 replies; 120+ messages in thread
From: Coly Li @ 2017-09-06  5:25 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: bcache, linux-block, linux-bcache, axboe, Jan Kara

On 2017/9/5 下午2:43, Christoph Hellwig wrote:
> On Tue, Sep 05, 2017 at 01:30:04AM +0800, Coly Li wrote:
>>
>> When you mentioned "whole chunk of code", do you mean the following
>> block of code ?
>>
>>
>> 1960         if (IS_ERR(bdev)) {
>> ========= start of whole chunk of code ============
>> 1961                 if (bdev == ERR_PTR(-EBUSY)) {
>> 1962                         bdev = lookup_bdev(strim(path));
>> 1963                         mutex_lock(&bch_register_lock);
>> 1964                         if (!IS_ERR(bdev) && bch_is_open(bdev))
>> 1965                                 err = "device already registered";
>> 1966                         else
>> 1967                                 err = "device busy";
>> 1968                         mutex_unlock(&bch_register_lock);
>> 1969                         if (!IS_ERR(bdev))
>> 1970                                 bdput(bdev);
>> 1971                         if (attr == &ksysfs_register_quiet)
>> 1972                                 goto out;
>> 1973                 }
>> ========= end of whole chunk of code ============
>> 1974                 goto err;
>> 1975         }
>>
>> I don't mind to remove it, just double check I don't misunderstand what
>> you meant.
> 
> Yes, that's the problematic block.
> 
Hi Christoph,

I tested the code, and my patch which removes the above code. The result
is, I feel the above chunk of code is useful. When I tried to register a
device that was already registered, without the above code I only saw
"failed to open device" and didn't realize it was because this device was
registered already. After adding the above code back, I knew where the
problem was.

The above chunk of code improves user experience and provides more
detailed diagnose information, it is useful. Then I suggest to keep the
code here and pick up Jan's patch.

Thanks.

-- 
Coly Li

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 15/19] bcache: fix issue of writeback rate at minimum 1 key per second
  2017-07-16 10:04         ` Coly Li
@ 2017-10-27 19:07           ` Eric Wheeler
  2017-10-27 19:09             ` Eric Wheeler
  0 siblings, 1 reply; 120+ messages in thread
From: Eric Wheeler @ 2017-10-27 19:07 UTC (permalink / raw)
  To: Coly Li; +Cc: bcache, Tang Junhui, linux-block, linux-bcache, hch, axboe

[-- Attachment #1: Type: TEXT/PLAIN, Size: 7798 bytes --]

On Sun, 16 Jul 2017, Coly Li wrote:

> On 2017/7/1 上午4:43, bcache@lists.ewheeler.net wrote:
> > From: Tang Junhui <tang.junhui@zte.com.cn>
> > 
> > When there is not enough dirty data in writeback cache,
> > writeback rate is at minimum 1 key per second
> > until all dirty data is cleaned; this is inefficient,
> > and also causes waste of energy;
> 
> Hi Junhui and Eric,
> 
> What: /sys/block/<disk>/bcache/writeback_percent
> Description:
>       For backing devices: If nonzero, writeback from cache to
>       backing device only takes place when more than this percentage
>       of the cache is used, allowing more write coalescing to take
>       place and reducing total number of writes sent to the backing
>       device. Integer between 0 and 40.
> 
> I see above text from Documentation/ABI/testing/sysfs-block-bcache (I
> know this document is quite old), it seems if "not enough" means dirty
> data percentage is less than writeback_percent, bcache should not
> perform writeback I/O. But in __update_writeback_rate(),
> writeback_rate.rate is clamped in [1, NSEC_PER_MSEC]. It seems in PD
> controller code of __update_writeback_rate(), writeback_percent is only
> used to calculate dirty target number, its another functionality as
> writeback threshold is not handled here.
> 
> > 
> > in this patch, When there is not enough dirty data,
> > let the writeback rate to be 0, and writeback re-schedule
> > in bch_writeback_thread() periodically with schedule_timeout(),
> > the behaviors are as follows :
> > 
> > 1) If no dirty data have been read into dc->writeback_keys,
> > goto step 2), otherwise keep writing these dirty data to
> > back-end device at 1 key per second, until all these dirty data
> > write over, then goto step 2).
> > 
> > 2) Loop in bch_writeback_thread() to check if there is enough
> > dirty data for writeback. If there is not enough dirty data for
> > writing, then sleep 10 seconds, otherwise, write dirty data to
> > back-end device.
> 
> Bcache uses a Proportion-Differentiation Controller to control writeback
> rate. When dirty data is far from target, writeback rate is higher; when
> dirty data is close to target, writeback rate is slower. The advantage
> of PD controller here is, when regular I/O and writeback I/O happens in
> same time,
> - When there are a lot of dirty data, writeback I/O can have more chance
> to write them back to cached device, which in turns has positive impact
> to regular I/O.
> - When dirty data is decreased and close to target dirty number, less
> writeback I/O can help regular I/O has better throughput and latency.
> 
> The root cause of 1 key per second is, the PD controller is designed for
> better I/O performance, not less energy consumption. When the existing
> dirty data gets closed to target dirty number, the PD controller chooses
> to use longer writeback time to make a better regular I/O performance.
> If it is designed for less energy consumption, it should keep the
> writeback rate in a high level and finish writing back all dirty data as
> soon as possible.
> 
> This patch may introduce an unexpected behavior of dirty data writeback
> throughput, when regular write I/O and writeback I/O happen in same
> time. In this case, dirty data number may shake up and down around
> target dirty number, it is possible that change (the variable in
> __update_writeback_rate()) is a minus value, and the result of
> dc->writeback_rate.rate + change happens to be 0. This patch changes the
> clamp range of writeback_rate.rate to [0, NSEC_PER_MSEC], so
> writeback_rate.rate can be possible to be 0. And in bch_next_delay() if
> d->rate is zero, the write back I/O will be delayed to now +
> NSEC_PER_SEC. When there is no regular I/O it works well, but when there
> is regular I/O, this longer delay may cause more dirty data piled in
> cache device, and the PD controller cannot generate a stable writeback rate.
> This is not an expected behavior for the writeback rate PD controller.
> 
> Another method to fix might be,
> 1) define a sysfs to define writeback_rate with max/dynamic option.
> 2) dynamic writeback_rate as default
> 3) when max is set, in __update_writeback_rate() assign NSEC_PER_MSEC to
> writeback_rate.rate
> 4) in bch_writeback_thread(), if no writeback I/O on fly, and dirty data
> does not reach dc->writeback_percent, schedule out.
> 5) if writeback is necessary then do it in max rate and finish it as
> soon as possible, to save laptop energy.
> 
> The above method might be helpful for energy saving as well (perform dirty
> data write back in batch), and does not change the default PD controller
> behavior.
> 
> Just for your reference. Or if you are too busy to look at it, I can try
> to compose a patch for review.

Hi Coly,

Did this go anywhere?  I think the 1-key/sec fix is a good idea and your
suggestion will help out mobile users.



--
Eric Wheeler


> 
> Coly
> 
> > 
> > Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
> > ---
> >  drivers/md/bcache/util.c      |  9 ++++++++-
> >  drivers/md/bcache/writeback.c | 11 +++++++----
> >  2 files changed, 15 insertions(+), 5 deletions(-)
> > 
> > diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
> > index 8c3a938..49dcf09 100644
> > --- a/drivers/md/bcache/util.c
> > +++ b/drivers/md/bcache/util.c
> > @@ -210,7 +210,14 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
> >  {
> >  	uint64_t now = local_clock();
> >  
> > -	d->next += div_u64(done * NSEC_PER_SEC, d->rate);
> > +	/*
> > +	  if d->rate is zero, write the left dirty data
> > +	  at the speed of one key per second
> > +	*/
> > +	if(!d->rate)
> > +		d->next = now + NSEC_PER_SEC;
> > +	else
> > +		d->next += div_u64(done * NSEC_PER_SEC, d->rate);
> >  
> >  	if (time_before64(now + NSEC_PER_SEC, d->next))
> >  		d->next = now + NSEC_PER_SEC;
> > diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
> > index 25289e4..4104eaa 100644
> > --- a/drivers/md/bcache/writeback.c
> > +++ b/drivers/md/bcache/writeback.c
> > @@ -16,6 +16,8 @@
> >  #include <linux/sched/clock.h>
> >  #include <trace/events/bcache.h>
> >  
> > +#define WRITE_BACK_WAIT_CYCLE		10 * HZ
> > +
> >  /* Rate limiting */
> >  
> >  static void __update_writeback_rate(struct cached_dev *dc)
> > @@ -55,13 +57,14 @@ static void __update_writeback_rate(struct cached_dev *dc)
> >  
> >  	/* Don't increase writeback rate if the device isn't keeping up */
> >  	if (change > 0 &&
> > +	    dc->writeback_rate.rate >0 &&
> >  	    time_after64(local_clock(),
> >  			 dc->writeback_rate.next + NSEC_PER_MSEC))
> >  		change = 0;
> >  
> >  	dc->writeback_rate.rate =
> >  		clamp_t(int64_t, (int64_t) dc->writeback_rate.rate + change,
> > -			1, NSEC_PER_MSEC);
> > +			0, NSEC_PER_MSEC);
> >  
> >  	dc->writeback_rate_proportional = proportional;
> >  	dc->writeback_rate_derivative = derivative;
> > @@ -420,15 +423,15 @@ static int bch_writeback_thread(void *arg)
> >  	while (!kthread_should_stop()) {
> >  		down_write(&dc->writeback_lock);
> >  		if (!atomic_read(&dc->has_dirty) ||
> > -		    (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) &&
> > -		     !dc->writeback_running)) {
> > +		    ((!dc->writeback_rate.rate || !dc->writeback_running) &&
> > +		      !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))) {
> >  			up_write(&dc->writeback_lock);
> >  			set_current_state(TASK_INTERRUPTIBLE);
> >  
> >  			if (kthread_should_stop())
> >  				return 0;
> >  
> > -			schedule();
> > +			schedule_timeout(WRITE_BACK_WAIT_CYCLE);
> >  			continue;
> >  		}
> >  
> > 
> 
> 
> -- 
> Coly Li
> --
> To unsubscribe from this list: send the line "unsubscribe linux-bcache" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 15/19] bcache: fix issue of writeback rate at minimum 1 key per second
  2017-10-27 19:07           ` Eric Wheeler
@ 2017-10-27 19:09             ` Eric Wheeler
  2017-10-28  8:58               ` Coly Li
  0 siblings, 1 reply; 120+ messages in thread
From: Eric Wheeler @ 2017-10-27 19:09 UTC (permalink / raw)
  To: Coly Li; +Cc: Tang Junhui, Michael Lyle, linux-block, linux-bcache, hch, axboe

[-- Attachment #1: Type: TEXT/PLAIN, Size: 8249 bytes --]


[+cc Michael Lyle]

On Fri, 27 Oct 2017, Eric Wheeler wrote:

> On Sun, 16 Jul 2017, Coly Li wrote:
> 
> > On 2017/7/1 上午4:43, bcache@lists.ewheeler.net wrote:
> > > From: Tang Junhui <tang.junhui@zte.com.cn>
> > > 
> > > When there is not enough dirty data in writeback cache,
> > > writeback rate is at minimum 1 key per second
> > > until all dirty data is cleaned; this is inefficient,
> > > and also causes waste of energy;
> > 
> > Hi Junhui and Eric,
> > 
> > What: /sys/block/<disk>/bcache/writeback_percent
> > Description:
> >       For backing devices: If nonzero, writeback from cache to
> >       backing device only takes place when more than this percentage
> >       of the cache is used, allowing more write coalescing to take
> >       place and reducing total number of writes sent to the backing
> >       device. Integer between 0 and 40.
> > 
> > I see above text from Documentation/ABI/testing/sysfs-block-bcache (I
> > know this document is quite old), it seems if "not enough" means dirty
> > data percentage is less than writeback_percent, bcache should not
> > perform writeback I/O. But in __update_writeback_rate(),
> > writeback_rate.rate is clamped in [1, NSEC_PER_MSEC]. It seems in PD
> > controller code of __update_writeback_rate(), writeback_percent is only
> > used to calculate dirty target number, its another functionality as
> > writeback threshold is not handled here.
> > 
> > > 
> > > in this patch, When there is not enough dirty data,
> > > let the writeback rate to be 0, and writeback re-schedule
> > > in bch_writeback_thread() periodically with schedule_timeout(),
> > > the behaviors are as follows :
> > > 
> > > 1) If no dirty data have been read into dc->writeback_keys,
> > > goto step 2), otherwise keep writing these dirty data to
> > > back-end device at 1 key per second, until all these dirty data
> > > write over, then goto step 2).
> > > 
> > > 2) Loop in bch_writeback_thread() to check if there is enough
> > > dirty data for writeback. If there is not enough dirty data for
> > > writing, then sleep 10 seconds, otherwise, write dirty data to
> > > back-end device.
> > 
> > Bcache uses a Proportion-Differentiation Controller to control writeback
> > rate. When dirty data is far from target, writeback rate is higher; when
> > dirty data is close to target, writeback rate is slower. The advantage
> > of PD controller here is, when regular I/O and writeback I/O happens in
> > same time,
> > - When there are a lot of dirty data, writeback I/O can have more chance
> > to write them back to cached device, which in turns has positive impact
> > to regular I/O.
> > - When dirty data is decreased and close to target dirty number, less
> > writeback I/O can help regular I/O has better throughput and latency.
> > 
> > The root cause of 1 key per second is, the PD controller is designed for
> > better I/O performance, not less energy consumption. When the existing
> > dirty data gets closed to target dirty number, the PD controller chooses
> > to use longer writeback time to make a better regular I/O performance.
> > If it is designed for less energy consumption, it should keep the
> > writeback rate in a high level and finish writing back all dirty data as
> > soon as possible.
> > 
> > This patch may introduce an unexpected behavior of dirty data writeback
> > throughput, when regular write I/O and writeback I/O happen in same
> > time. In this case, dirty data number may shake up and down around
> > target dirty number, it is possible that change (the variable in
> > __update_writeback_rate()) is a minus value, and the result of
> > dc->writeback_rate.rate + change happens to be 0. This patch changes the
> > clamp range of writeback_rate.rate to [0, NSEC_PER_MSEC], so
> > writeback_rate.rate can be possible to be 0. And in bch_next_delay() if
> > d->rate is zero, the write back I/O will be delayed to now +
> > NSEC_PER_SEC. When there is no regular I/O it works well, but when there
> > is regular I/O, this longer delay may cause more dirty data piled in
> > cache device, and the PD controller cannot generate a stable writeback rate.
> > This is not an expected behavior for the writeback rate PD controller.
> > 
> > Another method to fix might be,
> > 1) define a sysfs to define writeback_rate with max/dynamic option.
> > 2) dynamic writeback_rate as default
> > 3) when max is set, in __update_writeback_rate() assign NSEC_PER_MSEC to
> > writeback_rate.rate
> > 4) in bch_writeback_thread(), if no writeback I/O on fly, and dirty data
> > does not reach dc->writeback_percent, schedule out.
> > 5) if writeback is necessary then do it in max rate and finish it as
> > soon as possible, to save laptop energy.
> > 
> > The above method might be helpful for energy saving as well (perform dirty
> > data write back in batch), and does not change the default PD controller
> > behavior.
> > 
> > Just for your reference. Or if you are too busy to look at it, I can try
> > to compose a patch for review.
> 
> Hi Coly,
> 
> Did this go anywhere?  I think the 1-key/sec fix is a good idea and your
> suggestion will help out mobile users.
> 
> 
> 
> --
> Eric Wheeler
> 
> 
> > 
> > Coly
> > 
> > > 
> > > Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
> > > ---
> > >  drivers/md/bcache/util.c      |  9 ++++++++-
> > >  drivers/md/bcache/writeback.c | 11 +++++++----
> > >  2 files changed, 15 insertions(+), 5 deletions(-)
> > > 
> > > diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
> > > index 8c3a938..49dcf09 100644
> > > --- a/drivers/md/bcache/util.c
> > > +++ b/drivers/md/bcache/util.c
> > > @@ -210,7 +210,14 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
> > >  {
> > >  	uint64_t now = local_clock();
> > >  
> > > -	d->next += div_u64(done * NSEC_PER_SEC, d->rate);
> > > +	/*
> > > +	  if d->rate is zero, write the left dirty data
> > > +	  at the speed of one key per second
> > > +	*/
> > > +	if(!d->rate)
> > > +		d->next = now + NSEC_PER_SEC;
> > > +	else
> > > +		d->next += div_u64(done * NSEC_PER_SEC, d->rate);
> > >  
> > >  	if (time_before64(now + NSEC_PER_SEC, d->next))
> > >  		d->next = now + NSEC_PER_SEC;
> > > diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
> > > index 25289e4..4104eaa 100644
> > > --- a/drivers/md/bcache/writeback.c
> > > +++ b/drivers/md/bcache/writeback.c
> > > @@ -16,6 +16,8 @@
> > >  #include <linux/sched/clock.h>
> > >  #include <trace/events/bcache.h>
> > >  
> > > +#define WRITE_BACK_WAIT_CYCLE		10 * HZ
> > > +
> > >  /* Rate limiting */
> > >  
> > >  static void __update_writeback_rate(struct cached_dev *dc)
> > > @@ -55,13 +57,14 @@ static void __update_writeback_rate(struct cached_dev *dc)
> > >  
> > >  	/* Don't increase writeback rate if the device isn't keeping up */
> > >  	if (change > 0 &&
> > > +	    dc->writeback_rate.rate >0 &&
> > >  	    time_after64(local_clock(),
> > >  			 dc->writeback_rate.next + NSEC_PER_MSEC))
> > >  		change = 0;
> > >  
> > >  	dc->writeback_rate.rate =
> > >  		clamp_t(int64_t, (int64_t) dc->writeback_rate.rate + change,
> > > -			1, NSEC_PER_MSEC);
> > > +			0, NSEC_PER_MSEC);
> > >  
> > >  	dc->writeback_rate_proportional = proportional;
> > >  	dc->writeback_rate_derivative = derivative;
> > > @@ -420,15 +423,15 @@ static int bch_writeback_thread(void *arg)
> > >  	while (!kthread_should_stop()) {
> > >  		down_write(&dc->writeback_lock);
> > >  		if (!atomic_read(&dc->has_dirty) ||
> > > -		    (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) &&
> > > -		     !dc->writeback_running)) {
> > > +		    ((!dc->writeback_rate.rate || !dc->writeback_running) &&
> > > +		      !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))) {
> > >  			up_write(&dc->writeback_lock);
> > >  			set_current_state(TASK_INTERRUPTIBLE);
> > >  
> > >  			if (kthread_should_stop())
> > >  				return 0;
> > >  
> > > -			schedule();
> > > +			schedule_timeout(WRITE_BACK_WAIT_CYCLE);
> > >  			continue;
> > >  		}
> > >  
> > > 
> > 
> > 
> > -- 
> > Coly Li
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-bcache" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > 

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 12/19] bcache: update bucket_in_use periodically
  2017-07-13  4:27                 ` Coly Li
@ 2017-10-27 19:11                   ` Eric Wheeler
  2017-10-27 19:45                     ` Eric Wheeler
  0 siblings, 1 reply; 120+ messages in thread
From: Eric Wheeler @ 2017-10-27 19:11 UTC (permalink / raw)
  To: Coly Li; +Cc: Michael Lyle, tang.junhui, linux-bcache, linux-block

[-- Attachment #1: Type: TEXT/PLAIN, Size: 7141 bytes --]

On Thu, 13 Jul 2017, Coly Li wrote:

> On 2017/7/13 下午12:13, Eric Wheeler wrote:
> > On Tue, 11 Jul 2017, Coly Li wrote:
> > 
> >> On 2017/7/11 下午1:39, tang.junhui@zte.com.cn wrote:
> >>> Compared to bucket depletion, which results in a hang,
> >>> it is worthwhile to consume a little time to update bucket_in_use.
> >>> If you have any better solution, please show to us,
> >>> We should solve it as soon as possible, not wait for it forever.
> >>>
> >>
> >> I also test this patch on a cache device with 4x3.8TB size, all buckets
> >> iteration takes around 40-50ms. If the iteration needs to hold
> >> bucket_lock of cache set, it is very probably to introduce a huge I/O
> >> latency in period of every 30 seconds.
> >>
> >> For database people, this is not good news.
> > 
> > 
> > Hi Tang,
> >    
> > I'm waiting to queue this patch pending your response to Coly.  
> > 
> > Please send a v2 when you're ready.
> 
> 
> Eric,
> 
> I guess Tang is working on the I/O hang issue during back ground garbage
> collection running. From the discussion in another email thread, it seems a
> regular I/O request gets hung for 10+ seconds in some cases. Maybe that
> issue is more urgent than this one.
> 
> In my personal opinion, updating bucket_in_use is for triggering garbage
> collection. If the number of buckets in use is not updated in time, garbage
> collection won't start because the stale bucket_in_use is still beyond
> CUTOFF_WRITEBACK_SYNC.
> 
> We may maintain an atomic counter per-cache set for dirty buckets, and
> update it at some locations when allocating or reclaiming bucket. This
> counter is unnecessary to be very accurate, just accurate enough for
> should_writeback() working correctly.
> 
> I am also looking at it for a better solution as well.

Hi Coli & Tang,

Have either of you had a chance to come up with a solution to this?

--
Eric Wheeler

> 
> Coly
> 
> 
> >>
> >> Coly
> >>
> >>
> >>>
> >>>
> >>>
> >>> 发件人:         Coly Li <i@coly.li>
> >>> 收件人:         linux-block@vger.kernel.org, Tang Junhui
> >>> <tang.junhui@zte.com.cn>,
> >>> 抄送:        bcache@lists.ewheeler.net, linux-bcache@vger.kernel.org,
> >>> hch@infradead.org, axboe@kernel.dk
> >>> 日期:         2017/07/11 13:06
> >>> 主题:        Re: [PATCH 12/19] bcache: update bucket_in_use periodically
> >>> 发件人:        linux-bcache-owner@vger.kernel.org
> >>> ------------------------------------------------------------------------
> >>>
> >>>
> >>>
> >>> On 2017/7/1 上午4:43, bcache@lists.ewheeler.net wrote:
> >>>> From: Tang Junhui <tang.junhui@zte.com.cn>
> >>>>
> >>>> bucket_in_use is updated in gc thread which triggered by invalidating or
> >>>> writing sectors_to_gc dirty data, It's been too long, Therefore, when we
> >>>> use it to compare with the threshold, it is often not timely, which leads
> >>>> to inaccurate judgment and often results in bucket depletion.
> >>>>
> >>>> Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
> >>>> ---
> >>>>  drivers/md/bcache/btree.c | 29 +++++++++++++++++++++++++++--
> >>>>  1 file changed, 27 insertions(+), 2 deletions(-)
> >>>>
> >>>> diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
> >>>> index 866dcf7..77aa20b 100644
> >>>> --- a/drivers/md/bcache/btree.c
> >>>> +++ b/drivers/md/bcache/btree.c
> >>>> @@ -90,6 +90,8 @@
> >>>>  #define MAX_NEED_GC                                  64
> >>>>  #define MAX_SAVE_PRIO                                  72
> >>>>  
> >>>> +#define GC_THREAD_TIMEOUT_MS                 (30 * 1000)
> >>>> +
> >>>>  #define PTR_DIRTY_BIT                                  (((uint64_t) 1
> >>> << 36))
> >>>>  
> >>>>  #define PTR_HASH(c, k)                                              
> >>>                                                                         \
> >>>> @@ -1760,6 +1762,23 @@ static void bch_btree_gc(struct cache_set *c)
> >>>>                   bch_moving_gc(c);
> >>>>  }
> >>>>  
> >>>> +void bch_update_bucket_in_use(struct cache_set *c)
> >>>> +{
> >>>> +                 struct cache *ca;
> >>>> +                 struct bucket *b;
> >>>> +                 unsigned i;
> >>>> +                 size_t available = 0;
> >>>> +
> >>>> +                 for_each_cache(ca, c, i) {
> >>>> +                                  for_each_bucket(b, ca) {
> >>>> +                                                   if (!GC_MARK(b) ||
> >>> GC_MARK(b) == GC_MARK_RECLAIMABLE)
> >>>> +                                                                  
> >>>  available++;
> >>>> +                                  }
> >>>> +                 }
> >>>> +
> >>>
> >>> bucket_lock of cache set should be held before accessing buckets.
> >>>
> >>>
> >>>> +                 c->gc_stats.in_use = (c->nbuckets - available) * 100
> >>> / c->nbuckets;
> >>>> +}
> >>>> +
> >>>>  static bool gc_should_run(struct cache_set *c)
> >>>>  {
> >>>>                   struct cache *ca;
> >>>> @@ -1778,10 +1797,16 @@ static bool gc_should_run(struct cache_set *c)
> >>>>  static int bch_gc_thread(void *arg)
> >>>>  {
> >>>>                   struct cache_set *c = arg;
> >>>> +                 long  ret;
> >>>> +                 unsigned long timeout =
> >>> msecs_to_jiffies(GC_THREAD_TIMEOUT_MS);
> >>>>  
> >>>>                   while (1) {
> >>>> -                                  wait_event_interruptible(c->gc_wait,
> >>>> -                                                    
> >>>  kthread_should_stop() || gc_should_run(c));
> >>>> +                                  ret =
> >>> wait_event_interruptible_timeout(c->gc_wait,
> >>>> +                                                    
> >>>  kthread_should_stop() || gc_should_run(c), timeout);
> >>>> +                                  if (!ret) {
> >>>> +                                                  
> >>> bch_update_bucket_in_use(c);
> >>>> +                                                   continue;
> >>>
> >>> A continue here will ignore status returned from kthread_should_stop(),
> >>> which might not be expected behavior.
> >>>
> >>>
> >>>> +                                  }
> >>>>  
> >>>>                                    if (kthread_should_stop())
> >>>>                                                     break;
> >>>>
> >>>
> >>> Iterating all buckets from the cache set requires bucket_lock to be
> >>> held. Waiting for bucket_lock may take quite a long time for either
> >>> bucket allocating code or bch_gc_thread(). What I concern is, this patch
> >>> may introduce bucket allocation delay in period of GC_THREAD_TIMEOUT_MS.
> >>>
> >>> We need to find out a way to avoid such a performance regression.
> >>>
> >>> -- 
> >>> Coly Li
> >>> --
> >>> To unsubscribe from this list: send the line "unsubscribe linux-bcache" in
> >>> the body of a message to majordomo@vger.kernel.org
> >>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> >>>
> >>>
> >>
> >>
> >> -- 
> >> Coly Li
> 
> 
> -- 
> Coly Li
> --
> To unsubscribe from this list: send the line "unsubscribe linux-bcache" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 11/19] bcache: Subtract dirty sectors of thin flash from cache_sectors in calculating writeback rate
  2017-07-13  4:15               ` Coly Li
@ 2017-10-27 19:12                 ` Eric Wheeler
  0 siblings, 0 replies; 120+ messages in thread
From: Eric Wheeler @ 2017-10-27 19:12 UTC (permalink / raw)
  To: Coly Li; +Cc: Michael Lyle, tang.junhui, linux-bcache, linux-block, stable

[-- Attachment #1: Type: TEXT/PLAIN, Size: 5660 bytes --]

On Thu, 13 Jul 2017, Coly Li wrote:

> On 2017/7/13 下午12:12, Eric Wheeler wrote:
> > On Tue, 11 Jul 2017, tang.junhui@zte.com.cn wrote:
> > 
> >>> Based on the above implementation, non-dirty space from flash only
> >>> bcache device will mislead writeback rate calculation too. So I suggest
> >>> to subtract all buckets size from all flash only bcache devices. Then it
> >>> might be something like,
> >>
> >> what is non-dirty space from flash only bcache device?
> >> Where is non-dirty space from flash only bcache device?
> > 
> > Hi Tang, Coly:
> >    
> > Was there more discussion on this thread, or is the patch good to go?  
> > 
> > Please send your ack if you're happy with it so I can queue it up.
> 
> I discussed with Tang offline, this patch is correct. But the patch
> commit log should be improved. Now I help to work on it, should be done
> quite soon.

Has an updated commit log been made?  I've not seen this in the commit 
stream yet.

--
Eric Wheeler



> 
> Coly
> 
> >>
> >>
> >> 发件人:         Coly Li <i@coly.li>
> >> 收件人:         Tang Junhui <tang.junhui@zte.com.cn>,
> >> 抄送:        bcache@lists.ewheeler.net, linux-block@vger.kernel.org, linux-bcache@vger.kernel.org,
> >> hch@infradead.org, axboe@kernel.dk, stable@vger.kernel.org
> >> 日期:         2017/07/11 02:11
> >> 主题:        Re: [PATCH 11/19] bcache: Subtract dirty sectors of thin flash from cache_sectors in calculating
> >> writeback rate
> >> 发件人:        linux-bcache-owner@vger.kernel.org
> >>
> >> _________________________________________________________________________________________________________________
> >>
> >>
> >>
> >> On 2017/7/1 上午4:43, bcache@lists.ewheeler.net wrote:
> >>> From: Tang Junhui <tang.junhui@zte.com.cn>
> >>>
> >>> Since dirty sectors of thin flash cannot be used to cache data for backend
> >>> device, so we should subtract it in calculating writeback rate.
> >>>
> >>
> >> I see you want to get ride of the noise of flash only cache device for
> >> writeback rate calculation. It makes sense, because flash only cache
> >> device won't have write back happen at all.
> >>
> >>
> >>> Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
> >>> Cc: stable@vger.kernel.org
> >>> ---
> >>>  drivers/md/bcache/writeback.c |  2 +-
> >>>  drivers/md/bcache/writeback.h | 19 +++++++++++++++++++
> >>>  2 files changed, 20 insertions(+), 1 deletion(-)
> >>>
> >>> diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
> >>> index 4ac8b13..25289e4 100644
> >>> --- a/drivers/md/bcache/writeback.c
> >>> +++ b/drivers/md/bcache/writeback.c
> >>> @@ -21,7 +21,7 @@
> >>>  static void __update_writeback_rate(struct cached_dev *dc)
> >>>  {
> >>>                   struct cache_set *c = dc->disk.c;
> >>> -                 uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size;
> >>> +                 uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size -
> >> bcache_flash_devs_sectors_dirty(c);
> >>
> >> See flash_dev_run(), the flash volume is created per struct
> >> bcache_device of a cache set. That means, all data allocated for the
> >> flash volume will be from a flash only bcache device. Regular dirty data
> >> won't mixed allocating with flash volume dirty data on identical struct
> >> bcache device.
> >>
> >> Based on the above implementation, non-dirty space from flash only
> >> bcache device will mislead writeback rate calculation too. So I suggest
> >> to subtract all buckets size from all flash only bcache devices. Then it
> >> might be something like,
> >>
> >> uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size -
> >>                                                   bcache_flash_devs_nbuckets(c);
> >>
> >>
> >>
> >> Just FYI. Thanks.
> >>
> >> Coly
> >>
> >>>                   uint64_t cache_dirty_target =
> >>>                                    div_u64(cache_sectors * dc->writeback_percent, 100);
> >>>  
> >>> diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
> >>> index c2ab4b4..24ff589 100644
> >>> --- a/drivers/md/bcache/writeback.h
> >>> +++ b/drivers/md/bcache/writeback.h
> >>> @@ -14,6 +14,25 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
> >>>                   return ret;
> >>>  }
> >>>  
> >>> +static inline uint64_t  bcache_flash_devs_sectors_dirty(struct cache_set *c)
> >>> +{
> >>> +                 uint64_t i, ret = 0;
> >>> +
> >>> +                 mutex_lock(&bch_register_lock);
> >>> +
> >>> +                 for (i = 0; i < c->nr_uuids; i++) {
> >>> +                                  struct bcache_device *d = c->devices[i];
> >>> +
> >>> +                                  if (!d || !UUID_FLASH_ONLY(&c->uuids[i]))
> >>> +                                                   continue;
> >>> +                    ret += bcache_dev_sectors_dirty(d);
> >>> +                 }
> >>> +
> >>> +                 mutex_unlock(&bch_register_lock);
> >>> +
> >>> +                 return ret;
> >>> +}
> >>> +
> >>>  static inline unsigned offset_to_stripe(struct bcache_device *d,
> >>>                                                                                       uint64_t offset)
> >>>  {
> >>>
> >> --
> >> To unsubscribe from this list: send the line "unsubscribe linux-bcache" in
> >> the body of a message to majordomo@vger.kernel.org
> >> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> >>
> >>
> >>
> 
> 
> -- 
> Coly Li
> --
> To unsubscribe from this list: send the line "unsubscribe linux-bcache" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 04/19] bcache: fix wrong cache_misses statistics
  2017-07-13  4:09           ` Eric Wheeler
@ 2017-10-27 19:14             ` Eric Wheeler
  0 siblings, 0 replies; 120+ messages in thread
From: Eric Wheeler @ 2017-10-27 19:14 UTC (permalink / raw)
  To: Tang Junhui; +Cc: Michael Lyle, Coly Li, linux-block, linux-bcache

[-- Attachment #1: Type: TEXT/PLAIN, Size: 1507 bytes --]

On Thu, 13 Jul 2017, Eric Wheeler wrote:

> On Sun, 2 Jul 2017, Coly Li wrote:
> 
> > On 2017/7/1 上午4:42, bcache@lists.ewheeler.net wrote:
> > > From: Tang Junhui <tang.junhui@zte.com.cn>
> > > 
> > > Some missed IOs are not counted into cache_misses; this patch fixes this
> > > issue.
> > 
> > Could you please explain more about,
> > - which kind of missed I/O are not counted
> > - where cache_missed is located
> > 
> > This will help the patch to be more understandable.
> 
> Hi Tang,
> 
> I'm waiting to queue this patch pending your response to Coly.  Can you 
> update the message send a v2?

Hi Tang,

Can you do an updated message and send this in so we can get the cache miss 
metrics corrected?
 
--
Eric Wheeler

> 
> 
> 
> > 
> > > 
> > > Signed-off-by: tang.junhui <tang.junhui@zte.com.cn>
> > > Reviewed-by: Eric Wheeler <bcache@linux.ewheeler.net>
> > > Cc: stable@vger.kernel.org
> > 
> > [snip]
> > 
> > > @@ -758,7 +760,7 @@ static void cached_dev_read_done_bh(struct closure *cl)
> > >  	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
> > >  
> > >  	bch_mark_cache_accounting(s->iop.c, s->d,
> > > -				  !s->cache_miss, s->iop.bypass);
> > > +				  !s->cache_missed, s->iop.bypass);
> > >  	trace_bcache_read(s->orig_bio, !s->cache_miss, s->iop.bypass);
> > 
> > 
> > Should the above line be changed to,
> > 	trace_bcache_read(s->orig_bio, !s->cache_missed, s->iop.bypass);
> > as well ?
> > 
> > 
> > [snip]
> > 
> > Thanks.
> > 
> > -- 
> > Coly Li
> > 

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 12/19] bcache: update bucket_in_use periodically
  2017-10-27 19:11                   ` Eric Wheeler
@ 2017-10-27 19:45                     ` Eric Wheeler
  0 siblings, 0 replies; 120+ messages in thread
From: Eric Wheeler @ 2017-10-27 19:45 UTC (permalink / raw)
  To: tang.junhui; +Cc: Coly Li, Michael Lyle, linux-bcache, linux-block

[-- Attachment #1: Type: TEXT/PLAIN, Size: 7703 bytes --]

On Fri, 27 Oct 2017, Eric Wheeler wrote:

> On Thu, 13 Jul 2017, Coly Li wrote:
> 
> > On 2017/7/13 下午12:13, Eric Wheeler wrote:
> > > On Tue, 11 Jul 2017, Coly Li wrote:
> > > 
> > >> On 2017/7/11 下午1:39, tang.junhui@zte.com.cn wrote:
> > >>> Compared to bucket depletion, which results in hanging dead,
> > >>> it is worth consuming a little time to update the bucket_in_use.
> > >>> If you have any better solution, please show to us,
> > >>> We should solve it as soon as possible, not wait for it forever.
> > >>>
> > >>
> > >> I also test this patch on a cache device with 4x3.8TB size, all buckets
> > >> iteration takes around 40-50ms. If the iteration needs to hold
> > >> bucket_lock of cache set, it is very probably to introduce a huge I/O
> > >> latency in period of every 30 seconds.
> > >>
> > >> For database people, this is not good news.
> > > 
> > > 
> > > Hi Tang,
> > >    
> > > I'm waiting to queue this patch pending your response to Coly.  
> > > 
> > > Please send a v2 when you're ready.
> > 
> > 
> > Eric,
> > 
> > I guess Tang is working on the I/O hang issue during back ground garbage
> > collection running. From discussion from other email thread, it seems a
> > regular I/O request gets hung for 10+ second in some cases. Maybe that
> > issue is more urgent than this one.
> > 
> > From my personal opinion, updating bucket_in_use is for acting garbage
> > collection. If number of bucket in use is not updated in time, garbage
> > collection won't start due to old bucket_in_use still beyond
> > CUTOFF_WRITEBACK_SYNC.
> > 
> > We may maintain an atomic counter per-cache set for dirty buckets, and
> > update it at some locations when allocating or reclaiming bucket. This
> > counter is unnecessary to be very accurate, just accurate enough for
> > should_writeback() working correctly.
> > 
> > I am also looking at it for a better solution as well.
> 
> Hi Coli & Tang,
> 
> Have either of you had a chance to come up with a solution to this?

Nevermind, I just saw the patch sent on 10/24.  Thanks for your work on 
that!  I'll see if we can try it out.

--
Eric Wheeler



> 
> --
> Eric Wheeler
> 
> > 
> > Coly
> > 
> > 
> > >>
> > >> Coly
> > >>
> > >>
> > >>>
> > >>>
> > >>>
> > >>> 发件人:         Coly Li <i@coly.li>
> > >>> 收件人:         linux-block@vger.kernel.org, Tang Junhui
> > >>> <tang.junhui@zte.com.cn>,
> > >>> 抄送:        bcache@lists.ewheeler.net, linux-bcache@vger.kernel.org,
> > >>> hch@infradead.org, axboe@kernel.dk
> > >>> 日期:         2017/07/11 13:06
> > >>> 主题:        Re: [PATCH 12/19] bcache: update bucket_in_use periodically
> > >>> 发件人:        linux-bcache-owner@vger.kernel.org
> > >>> ------------------------------------------------------------------------
> > >>>
> > >>>
> > >>>
> > >>> On 2017/7/1 上午4:43, bcache@lists.ewheeler.net wrote:
> > >>>> From: Tang Junhui <tang.junhui@zte.com.cn>
> > >>>>
> > >>>> bucket_in_use is updated in gc thread which triggered by invalidating or
> > >>>> writing sectors_to_gc dirty data, It's been too long, Therefore, when we
> > >>>> use it to compare with the threshold, it is often not timely, which leads
> > >>>> to inaccurate judgment and often results in bucket depletion.
> > >>>>
> > >>>> Signed-off-by: Tang Junhui <tang.junhui@zte.com.cn>
> > >>>> ---
> > >>>>  drivers/md/bcache/btree.c | 29 +++++++++++++++++++++++++++--
> > >>>>  1 file changed, 27 insertions(+), 2 deletions(-)
> > >>>>
> > >>>> diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
> > >>>> index 866dcf7..77aa20b 100644
> > >>>> --- a/drivers/md/bcache/btree.c
> > >>>> +++ b/drivers/md/bcache/btree.c
> > >>>> @@ -90,6 +90,8 @@
> > >>>>  #define MAX_NEED_GC                                  64
> > >>>>  #define MAX_SAVE_PRIO                                  72
> > >>>>  
> > >>>> +#define GC_THREAD_TIMEOUT_MS                 (30 * 1000)
> > >>>> +
> > >>>>  #define PTR_DIRTY_BIT                                  (((uint64_t) 1
> > >>> << 36))
> > >>>>  
> > >>>>  #define PTR_HASH(c, k)                                              
> > >>>                                                                         \
> > >>>> @@ -1760,6 +1762,23 @@ static void bch_btree_gc(struct cache_set *c)
> > >>>>                   bch_moving_gc(c);
> > >>>>  }
> > >>>>  
> > >>>> +void bch_update_bucket_in_use(struct cache_set *c)
> > >>>> +{
> > >>>> +                 struct cache *ca;
> > >>>> +                 struct bucket *b;
> > >>>> +                 unsigned i;
> > >>>> +                 size_t available = 0;
> > >>>> +
> > >>>> +                 for_each_cache(ca, c, i) {
> > >>>> +                                  for_each_bucket(b, ca) {
> > >>>> +                                                   if (!GC_MARK(b) ||
> > >>> GC_MARK(b) == GC_MARK_RECLAIMABLE)
> > >>>> +                                                                  
> > >>>  available++;
> > >>>> +                                  }
> > >>>> +                 }
> > >>>> +
> > >>>
> > >>> bucket_lock of cache set should be held before accessing buckets.
> > >>>
> > >>>
> > >>>> +                 c->gc_stats.in_use = (c->nbuckets - available) * 100
> > >>> / c->nbuckets;
> > >>>> +}
> > >>>> +
> > >>>>  static bool gc_should_run(struct cache_set *c)
> > >>>>  {
> > >>>>                   struct cache *ca;
> > >>>> @@ -1778,10 +1797,16 @@ static bool gc_should_run(struct cache_set *c)
> > >>>>  static int bch_gc_thread(void *arg)
> > >>>>  {
> > >>>>                   struct cache_set *c = arg;
> > >>>> +                 long  ret;
> > >>>> +                 unsigned long timeout =
> > >>> msecs_to_jiffies(GC_THREAD_TIMEOUT_MS);
> > >>>>  
> > >>>>                   while (1) {
> > >>>> -                                  wait_event_interruptible(c->gc_wait,
> > >>>> -                                                    
> > >>>  kthread_should_stop() || gc_should_run(c));
> > >>>> +                                  ret =
> > >>> wait_event_interruptible_timeout(c->gc_wait,
> > >>>> +                                                    
> > >>>  kthread_should_stop() || gc_should_run(c), timeout);
> > >>>> +                                  if (!ret) {
> > >>>> +                                                  
> > >>> bch_update_bucket_in_use(c);
> > >>>> +                                                   continue;
> > >>>
> > >>> A continue here will ignore status returned from kthread_should_stop(),
> > >>> which might not be expected behavior.
> > >>>
> > >>>
> > >>>> +                                  }
> > >>>>  
> > >>>>                                    if (kthread_should_stop())
> > >>>>                                                     break;
> > >>>>
> > >>>
> > >>> Iterating all buckets from the cache set requires bucket_lock to be
> > >>> held. Waiting for bucket_lock may take quite a long time for either
> > >>> bucket allocating code or bch_gc_thread(). What I concern is, this patch
> > >>> may introduce bucket allocation delay in period of GC_THREAD_TIMEOUT_MS.
> > >>>
> > >>> We need to find out a way to avoid such a performance regression.
> > >>>
> > >>> -- 
> > >>> Coly Li
> > >>> --
> > >>> To unsubscribe from this list: send the line "unsubscribe linux-bcache" in
> > >>> the body of a message to majordomo@vger.kernel.org
> > >>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > >>>
> > >>>
> > >>
> > >>
> > >> -- 
> > >> Coly Li
> > 
> > 
> > -- 
> > Coly Li
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-bcache" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > 

^ permalink raw reply	[flat|nested] 120+ messages in thread

* Re: [PATCH 15/19] bcache: fix issue of writeback rate at minimum 1 key per second
  2017-10-27 19:09             ` Eric Wheeler
@ 2017-10-28  8:58               ` Coly Li
  0 siblings, 0 replies; 120+ messages in thread
From: Coly Li @ 2017-10-28  8:58 UTC (permalink / raw)
  To: Eric Wheeler
  Cc: Tang Junhui, Michael Lyle, linux-block, linux-bcache, hch, axboe

On 2017/10/28 上午3:09, Eric Wheeler wrote:
> 
> [+cc Michael Lyle]
> 
> On Fri, 27 Oct 2017, Eric Wheeler wrote:
> 
>> On Sun, 16 Jul 2017, Coly Li wrote:
>>
>>> On 2017/7/1 上午4:43, bcache@lists.ewheeler.net wrote:
>>>> From: Tang Junhui <tang.junhui@zte.com.cn>
>>>>
>>>> When there is not enough dirty data in writeback cache,
>>>> writeback rate is at minimum 1 key per second
> >>>> until all dirty data is cleaned, which is inefficient,
>>>> and also causes waste of energy;
>>>
>>> Hi Junhui and Eric,
>>>
>>> What: /sys/block/<disk>/bcache/writeback_percent
>>> Description:
>>>       For backing devices: If nonzero, writeback from cache to
>>>       backing device only takes place when more than this percentage
>>>       of the cache is used, allowing more write coalescing to take
>>>       place and reducing total number of writes sent to the backing
>>>       device. Integer between 0 and 40.
>>>
>>> I see above text from Documentation/ABI/testing/sysfs-block-bcache (I
>>> know this document is quite old), it seems if "not enough" means dirty
>>> data percentage is less then writback_percent, bcache should not
>>> performance writeback I/O. But in __update_writeback_rate(),
>>> writeback_rate.rate is clamped in [1, NSEC_PER_MSEC]. It seems in PD
>>> controller code of __update_writeback_rate(), writeback_percent is only
>>> used to calculate dirty target number, its another functionality as
>>> writeback threshold is not handled here.
>>>
>>>>
>>>> in this patch, When there is not enough dirty data,
>>>> let the writeback rate to be 0, and writeback re-schedule
>>>> in bch_writeback_thread() periodically with schedule_timeout(),
>>>> the behaviors are as follows :
>>>>
>>>> 1) If no dirty data have been read into dc->writeback_keys,
>>>> goto step 2), otherwise keep writing these dirty data to
>>>> back-end device at 1 key per second, until all these dirty data
>>>> write over, then goto step 2).
>>>>
>>>> 2) Loop in bch_writeback_thread() to check if there is enough
>>>> dirty data for writeback. if there is not enough diry data for
>>>> writing, then sleep 10 seconds, otherwise, write dirty data to
>>>> back-end device.
>>>
>>> Bcache uses a Proportion-Differentiation Controller to control writeback
>>> rate. When dirty data is far from target, writeback rate is higher; when
>>> dirty data is close to target, writeback rate is slower. The advantage
>>> of PD controller here is, when regular I/O and writeback I/O happens in
>>> same time,
>>> - When there are a lot of dirty data, writeback I/O can have more chance
>>> to write them back to cached device, which in turns has positive impact
>>> to regular I/O.
>>> - When dirty data is decreased and close to target dirty number, less
>>> writeback I/O can help regular I/O has better throughput and latency.
>>>
>>> The root cause of 1 key per second is, the PD controller is designed for
>>> better I/O performance, not less energy consumption. When the existing
>>> dirty data gets closed to target dirty number, the PD controller chooses
>>> to use longer writeback time to make a better regular I/O performance.
>>> If it is designed for less energy consumption, it should keep the
>>> writeback rate in a high level and finish writing back all dirty data as
>>> soon as possible.
>>>
>>> This patch may introduce an unexpected behavior of dirty data writeback
>>> throughput, when regular write I/O and writeback I/O happen in same
>>> time. In this case, dirty data number may shake up and down around
>>> target dirty number, it is possible that change (the variable in
>>> __update_writeback_rate()) is a minus value, and the result of
>>> dc->writeback_rate.rate + change happens to be 0. This patch changes the
>>> clamp range of writeback_rate.rate to [0, NSEC_PER_MSEC], so
>>> writeback_rate.rate can be possible to be 0. And in bch_next_delay() if
>>> d->rate is zero, the write back I/O will be delayed to now +
>>> NSEC_PER_SEC. When there is no regular I/O it works well, but when there
>>> is regular I/O, this longer delay may cause more dirty data piled in
> >>> cache device, and PD controller cannot generate a stable writeback rate.
>>> This is not an expected behavior for the writeback rate PD controller.
>>>
>>> Another method to fix might be,
>>> 1) define a sysfs to define writeback_rate with max/dynamic option.
>>> 2) dynamic writeback_rate as default
>>> 3) when max is set, in __update_writeback_rate() assign NSEC_PER_MSEC to
>>> writeback_rate.rate
>>> 4) in bch_writeback_thread(), if no writeback I/O on fly, and dirty data
>>> does not reach dc->writeback_percent, schedule out.
>>> 5) if writeback is necessary then do it in max rate and finish it as
>>> soon as possible, to save laptop energy.
>>>
>>> The above method might be helpful to energy save as well (perform dirty
> >>> data write back in batch), and does not change default PD controller
>>> behavior.
>>>
>>> Just for your reference. Or if you are too busy to look at it, I can try
>>> to compose a patch for review.
>>
> >> Hi Coly,
>>
> >> Did this go anywhere?  I think the 1-key/sec fix is a good idea and your 
>> suggestion will help out mobile users.
>>

Hi Eric,

Michael is working on writeback improvement currently. He proposes some
patches to improve writeback efficiency from a little bit different
view, and after some quite deep discussion I feel some of his ideas are
promising. e.g. writeback more keys if backing device is idle.

Currently it seems a better writeback performance results more lock
contention in between with front end I/O. This is why Junhui posts a
realy time buckets in use counting patch. This is a start to reduce lock
contention in bcache tree writebac/gc/key insert.

I just feel this is a serial, continuous effort to improve writeback
efficiency. The 1-key/sec fix might be one of them, let's
improve-and-test :-)

Thanks.

Coly Li


-- 
Coly Li

^ permalink raw reply	[flat|nested] 120+ messages in thread

end of thread, other threads:[~2017-10-28  8:58 UTC | newest]

Thread overview: 120+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-05-25 19:10 [PULL] bcache updates based on git.kernel.dk/linux-block:for-next Eric Wheeler
2017-06-28 23:06 ` [PULL] bcache fixes and updates for-4.13 Eric Wheeler
2017-06-28 23:06   ` Eric Wheeler
2017-06-29 13:45   ` Christoph Hellwig
2016-10-11 19:04     ` [PATCH 07/19] bcache: introduce bcache sysfs entries for ioprio-based bypass/writeback hints Eric Wheeler
2016-10-11 19:04     ` Eric Wheeler
2016-10-11 19:08     ` [PATCH 08/19] bcache: documentation for sysfs entries describing bcache cache hinting Eric Wheeler
2016-10-11 19:08     ` Eric Wheeler
2016-10-20  0:36     ` [PATCH 09/19] bcache: update bio->bi_opf bypass/writeback REQ_ flag hints Eric Wheeler
2016-10-20  0:36     ` Eric Wheeler
2017-05-09 19:03     ` [PATCH 01/19] bcache: Fix leak of bdev reference Jan Kara
2017-05-09 19:03     ` Jan Kara
2017-05-09 19:05     ` [PATCH 02/19] bcache: fix sequential large write IO bypass Tang Junhui
2017-05-09 19:05     ` Tang Junhui
2017-05-09 19:07     ` [PATCH 03/19] bcache: do not subtract sectors_to_gc for bypassed IO Tang Junhui
2017-05-09 19:07     ` Tang Junhui
2017-05-09 19:14     ` [PATCH 05/19] bcache: fix calling ida_simple_remove() with incorrect minor Tang Junhui
2017-05-09 19:14     ` Tang Junhui
2017-06-01  8:48     ` [PATCH 10/19] bcache: initialize stripe_sectors_dirty correctly for thin flash device Tang Junhui
2017-06-01  8:48     ` Tang Junhui
2017-06-12 21:18     ` [PATCH 14/19] bcache: Correct return value for sysfs attach errors Tony Asleson
2017-06-12 21:18     ` Tony Asleson
2017-06-28  0:30     ` [PATCH 13/19] bcache: delete redundant calling set_gc_sectors() Tang Junhui
2017-06-28  0:30     ` Tang Junhui
2017-06-28  0:37     ` [PATCH 16/19] bcache: increase the number of open buckets Tang Junhui
2017-06-28  0:37     ` Tang Junhui
2017-06-28  0:41     ` [PATCH 17/19] bcache: fix for gc and write-back race Tang Junhui
2017-06-28  0:41     ` Tang Junhui
2017-06-28 11:47     ` [PATCH 18/19] bcache: silence static checker warning Dan Carpenter
2017-06-28 11:47     ` Dan Carpenter
2017-06-28 11:48     ` [PATCH 19/19] bcache: Update continue_at() documentation Dan Carpenter
2017-06-28 11:48     ` Dan Carpenter
2017-06-29 16:19     ` [PULL] bcache fixes and updates for-4.13 Coly Li
2017-06-29 22:12     ` Eric Wheeler
2017-06-29 22:12       ` Eric Wheeler
2017-06-29 22:25       ` Eric Wheeler
2017-06-29 23:28         ` Nick Alcock
2017-06-30 20:42     ` [PATCH 01/19] bcache: Fix leak of bdev reference bcache
2017-06-30 20:42       ` [PATCH 02/19] bcache: fix sequential large write IO bypass bcache
2017-07-05 18:25         ` Christoph Hellwig
2017-06-30 20:42       ` [PATCH 03/19] bcache: do not subtract sectors_to_gc for bypassed IO bcache
2017-07-01 17:26         ` Coly Li
2017-07-05 18:25         ` Christoph Hellwig
2017-06-30 20:42       ` [PATCH 04/19] bcache: fix wrong cache_misses statistics bcache
2017-07-01 17:58         ` Coly Li
2017-07-13  4:09           ` Eric Wheeler
2017-10-27 19:14             ` Eric Wheeler
2017-06-30 20:42       ` [PATCH 05/19] bcache: fix calling ida_simple_remove() with incorrect minor bcache
2017-07-05 18:26         ` Christoph Hellwig
2017-07-06  6:21           ` tang.junhui
2017-06-30 20:42       ` [PATCH 06/19] bcache: explicitly destory mutex while exiting bcache
2017-07-01 18:43         ` Coly Li
2017-07-05 11:58           ` Liang Chen
2017-07-05 11:58             ` Liang Chen
2017-07-11  7:22             ` Coly Li
2017-07-05 18:27         ` Christoph Hellwig
2017-07-06  1:56           ` Liang Chen
2017-06-30 20:42       ` [PATCH 07/19] bcache: introduce bcache sysfs entries for ioprio-based bypass/writeback hints bcache
2017-07-05 18:47         ` Christoph Hellwig
2017-07-05 21:49           ` Eric Wheeler
2017-06-30 20:42       ` [PATCH 08/19] bcache: documentation for sysfs entries describing bcache cache hinting bcache
2017-07-05 18:27         ` Christoph Hellwig
2017-06-30 20:42       ` [PATCH 09/19] bcache: update bio->bi_opf bypass/writeback REQ_ flag hints bcache
2017-07-01 18:49         ` Coly Li
2017-07-01 19:39           ` Eric Wheeler
2017-07-02  6:51             ` Coly Li
2017-07-03 22:51               ` [PATCH 09/19 v2] " bcache
2017-07-04  4:08                 ` Coly Li
2017-07-05 18:48                 ` Christoph Hellwig
2017-07-06  7:35                   ` Coly Li
2017-07-06 15:24                     ` Christoph Hellwig
2017-07-11  3:48                       ` Coly Li
2017-07-12  9:18                         ` Coly Li
2017-06-30 20:42       ` [PATCH 10/19] bcache: initialize stripe_sectors_dirty correctly for thin flash device bcache
2017-07-01 18:52         ` Coly Li
2017-07-13  4:10           ` Eric Wheeler
2017-06-30 20:43       ` [PATCH 11/19] bcache: Subtract dirty sectors of thin flash from cache_sectors in calculating writeback rate bcache
2017-07-10 18:11         ` Coly Li
     [not found]           ` <OF92BDA950.86AA00FA-ON4825815A.001F33D9-4825815A.001F5C89@zte.com.cn>
2017-07-13  4:12             ` Eric Wheeler
2017-07-13  4:15               ` Coly Li
2017-10-27 19:12                 ` Eric Wheeler
2017-06-30 20:43       ` [PATCH 12/19] bcache: update bucket_in_use periodically bcache
2017-07-11  5:05         ` Coly Li
     [not found]           ` <OF5C19A8FA.5FF48E0C-ON4825815A.001E6DB1-4825815A.001F14F2@zte.com.cn>
2017-07-11  7:20             ` Coly Li
2017-07-11 13:06             ` Coly Li
2017-07-13  4:13               ` Eric Wheeler
2017-07-13  4:27                 ` Coly Li
2017-10-27 19:11                   ` Eric Wheeler
2017-10-27 19:45                     ` Eric Wheeler
2017-06-30 20:43       ` [PATCH 13/19] bcache: delete redundant calling set_gc_sectors() bcache
2017-07-13  3:41         ` Eric Wheeler
2017-06-30 20:43       ` [PATCH 14/19] bcache: Correct return value for sysfs attach errors bcache
2017-06-30 20:43       ` [PATCH 15/19] bcache: fix issue of writeback rate at minimum 1 key per second bcache
2017-07-16 10:04         ` Coly Li
2017-10-27 19:07           ` Eric Wheeler
2017-10-27 19:09             ` Eric Wheeler
2017-10-28  8:58               ` Coly Li
2017-06-30 20:43       ` [PATCH 16/19] bcache: increase the number of open buckets bcache
2017-06-30 20:43         ` bcache
2017-07-13  9:56         ` Coly Li
2017-06-30 20:43       ` [PATCH 17/19] bcache: fix for gc and write-back race bcache
2017-08-03 16:20         ` Coly Li
2017-06-30 20:43       ` [PATCH 18/19] bcache: silence static checker warning bcache
2017-07-13  9:44         ` Coly Li
2017-06-30 20:43       ` [PATCH 19/19] bcache: Update continue_at() documentation bcache
2017-07-05 18:48         ` Christoph Hellwig
2017-07-08 18:12         ` Coly Li
2017-07-01 16:55       ` [PATCH 01/19] bcache: Fix leak of bdev reference Coly Li
2017-07-05 18:24       ` Christoph Hellwig
2017-09-04 17:30         ` Coly Li
2017-09-05  6:43           ` Christoph Hellwig
2017-09-05  6:55             ` Coly Li
2017-09-06  5:25             ` Coly Li
2017-07-14 11:40 ` [PULL] bcache updates based on git.kernel.dk/linux-block:for-next Eddie Chapman
2017-07-14 15:07   ` Coly Li
2017-07-14 17:33     ` Eddie Chapman
     [not found]       ` <OF92BA0158.87BDF9E3-ON4825815E.000736BF-4825815E.000833F7@zte.com.cn>
2017-07-18 18:24         ` Eddie Chapman
2017-07-18 18:31           ` Eddie Chapman
2017-07-18 20:06             ` Greg KH
2017-07-18 20:36               ` Eddie Chapman

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.