All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] bcache: introduce ioprio-based bypass/writeback hints
@ 2016-09-26 23:17 Eric Wheeler
  2016-09-27  1:43 ` Pavel Goran
  2016-09-28 19:26 ` Kai Krakow
  0 siblings, 2 replies; 11+ messages in thread
From: Eric Wheeler @ 2016-09-26 23:17 UTC (permalink / raw)


Add support to bcache hinting functions and sysfs to hint by the ioprio of
'current' which can be configured with `ionice`.

Cache hinting is configurable by writing 'class,level' pairs to sysfs.
These are the defaults:
	echo 2,7 > /sys/block/bcache0/bcache/ioprio_bypass
	echo 2,0 > /sys/block/bcache0/bcache/ioprio_writeback

(-p) IO Class	 (-n) Class level	Action
-----------------------------------------------------
(1) Realtime	  0-7				Writeback
(2) Best-effort     0 				Writeback
(2) Best-effort   1-6				Original bcache logic
(2) Best-effort     7				Bypass cache
(3) Idle          n/a				Bypass cache

See `man ionice` for more ioprio detail.

Signed-off-by: Eric Wheeler <bcache@linux.ewheeler.net>
---
 drivers/md/bcache/bcache.h    |  3 +++
 drivers/md/bcache/request.c   | 13 +++++++++++++
 drivers/md/bcache/sysfs.c     | 45 +++++++++++++++++++++++++++++++++++++++++++
 drivers/md/bcache/writeback.c |  3 +++
 drivers/md/bcache/writeback.h | 13 +++++++++++++
 5 files changed, 77 insertions(+)

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 6b420a5..bf945fa 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -367,6 +367,9 @@ struct cached_dev {
 	unsigned		writeback_rate_update_seconds;
 	unsigned		writeback_rate_d_term;
 	unsigned		writeback_rate_p_term_inverse;
+
+	unsigned short		ioprio_writeback;
+	unsigned short		ioprio_bypass;
 };
 
 enum alloc_reserve {
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 4b177fe..0c0a2f6 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -375,6 +375,7 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
 	unsigned sectors, congested = bch_get_congested(c);
 	struct task_struct *task = current;
 	struct io *i;
+	struct io_context *ioc;
 
 	if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
 	    c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
@@ -386,6 +387,18 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
 	     op_is_write(bio_op(bio))))
 		goto skip;
 
+	/* If process ioprio is lower-or-equal to dc->ioprio_bypass, then
+	 * hint for bypass. Note that a lower-priority IO class+value
+	 * has a greater numeric value. */
+	ioc = get_task_io_context(current, GFP_ATOMIC, NUMA_NO_NODE);
+	if (ioc
+		&& ioprio_valid(ioc->ioprio)
+		&& ioprio_valid(dc->ioprio_writeback)
+		&& ioc->ioprio >= dc->ioprio_bypass) {
+		put_io_context(ioc);
+		return true;
+	}
+
 	if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) ||
 	    bio_sectors(bio) & (c->sb.block_size - 1)) {
 		pr_debug("skipping unaligned io");
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index b3ff57d..83ac010 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -106,6 +106,9 @@ rw_attribute(btree_shrinker_disabled);
 rw_attribute(copy_gc_enabled);
 rw_attribute(size);
 
+rw_attribute(ioprio_writeback);
+rw_attribute(ioprio_bypass);
+
 SHOW(__bch_cached_dev)
 {
 	struct cached_dev *dc = container_of(kobj, struct cached_dev,
@@ -182,6 +185,17 @@ SHOW(__bch_cached_dev)
 		return strlen(buf);
 	}
 
+	if (attr == &sysfs_ioprio_bypass)
+		return snprintf(buf, PAGE_SIZE-1, "%d,%ld\n",
+			IOPRIO_PRIO_CLASS(dc->ioprio_bypass),
+			IOPRIO_PRIO_DATA(dc->ioprio_bypass));
+
+	if (attr == &sysfs_ioprio_writeback)
+		return snprintf(buf, PAGE_SIZE-1, "%d,%ld\n",
+			IOPRIO_PRIO_CLASS(dc->ioprio_writeback),
+			IOPRIO_PRIO_DATA(dc->ioprio_writeback));
+
+
 #undef var
 	return 0;
 }
@@ -194,6 +208,8 @@ STORE(__cached_dev)
 	unsigned v = size;
 	struct cache_set *c;
 	struct kobj_uevent_env *env;
+	unsigned ioprio_class = 0; /* invalid initial ioprio values */
+	unsigned ioprio_value = IOPRIO_BE_NR+1;
 
 #define d_strtoul(var)		sysfs_strtoul(var, dc->var)
 #define d_strtoul_nonzero(var)	sysfs_strtoul_clamp(var, dc->var, 1, INT_MAX)
@@ -282,6 +298,33 @@ STORE(__cached_dev)
 	if (attr == &sysfs_stop)
 		bcache_device_stop(&dc->disk);
 
+	if (attr == &sysfs_ioprio_writeback) {
+		if (sscanf(buf, "%u,%u", &ioprio_class, &ioprio_value) != 2
+			|| ioprio_class > IOPRIO_CLASS_IDLE
+			|| ioprio_class < IOPRIO_CLASS_RT
+			|| ioprio_value >= IOPRIO_BE_NR)
+			pr_err("bcache: ioprio_writeback invalid ioprio class: %u data value: %u\n",
+				ioprio_class, ioprio_value);
+		else if (ioprio_class == IOPRIO_CLASS_IDLE)
+			dc->ioprio_writeback = IOPRIO_PRIO_VALUE(ioprio_class, 0);
+		else
+			dc->ioprio_writeback = IOPRIO_PRIO_VALUE(ioprio_class, ioprio_value);
+	}
+
+	if (attr == &sysfs_ioprio_bypass)
+	{
+		if (sscanf(buf, "%u,%u", &ioprio_class, &ioprio_value) != 2
+			|| ioprio_class > IOPRIO_CLASS_IDLE
+			|| ioprio_class < IOPRIO_CLASS_RT
+			|| ioprio_value >= IOPRIO_BE_NR)
+			pr_err("bcache: ioprio_bypass invalid ioprio class: %u data value: %u\n",
+				ioprio_class, ioprio_value);
+		else if (ioprio_class == IOPRIO_CLASS_IDLE)
+			dc->ioprio_writeback = IOPRIO_PRIO_VALUE(ioprio_class, 0);
+		else
+			dc->ioprio_writeback = IOPRIO_PRIO_VALUE(ioprio_class, ioprio_value);
+	}
+
 	return size;
 }
 
@@ -334,6 +377,8 @@ static struct attribute *bch_cached_dev_files[] = {
 	&sysfs_verify,
 	&sysfs_bypass_torture_test,
 #endif
+	&sysfs_ioprio_bypass,
+	&sysfs_ioprio_writeback,
 	NULL
 };
 KTYPE(bch_cached_dev);
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index d9fd2a6..9772988 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -514,6 +514,9 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
 	dc->writeback_rate_d_term	= 30;
 	dc->writeback_rate_p_term_inverse = 6000;
 
+	dc->ioprio_writeback = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
+	dc->ioprio_bypass    = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_BE_NR-1);
+
 	INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
 }
 
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 301eaf5..a21fd87 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -43,6 +43,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
 				    unsigned cache_mode, bool would_skip)
 {
 	unsigned in_use = dc->disk.c->gc_stats.in_use;
+	struct io_context *ioc;
 
 	if (cache_mode != CACHE_MODE_WRITEBACK ||
 	    test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
@@ -57,6 +58,18 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
 	if (would_skip)
 		return false;
 
+	/* If process ioprio is higher-or-equal to dc->ioprio_writeback, then
+	 * hint for writeback. Note that a higher-priority IO class+value
+	 * has a lesser numeric value. */
+	ioc = get_task_io_context(current, GFP_ATOMIC, NUMA_NO_NODE);
+	if (ioc
+		&& ioprio_valid(ioc->ioprio)
+		&& ioprio_valid(dc->ioprio_writeback)
+		&& ioc->ioprio <= dc->ioprio_writeback) {
+		put_io_context(ioc);
+		return true;
+	}
+
 	return bio->bi_opf & REQ_SYNC ||
 		in_use <= CUTOFF_WRITEBACK;
 }
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/2] bcache: introduce ioprio-based bypass/writeback hints
  2016-09-26 23:17 [PATCH 1/2] bcache: introduce ioprio-based bypass/writeback hints Eric Wheeler
@ 2016-09-27  1:43 ` Pavel Goran
  2016-09-27  2:43   ` Kai Krakow
  2016-09-28 19:26 ` Kai Krakow
  1 sibling, 1 reply; 11+ messages in thread
From: Pavel Goran @ 2016-09-27  1:43 UTC (permalink / raw)
  To: Eric Wheeler

Hello Eric,

Tuesday, September 27, 2016, 6:17:22 AM, you wrote:

> Add support to bcache hinting functions and sysfs to hint by the ioprio of
> 'current' which can be configured with `ionice`.

> Cache hinting is configurable by writing 'class,level' pairs to sysfs.
> These are the defaults:
>         echo 2,7 > /sys/block/bcache0/bcache/ioprio_bypass
>         echo 2,0 > /sys/block/bcache0/bcache/ioprio_writeback

> (-p) IO Class    (-n) Class level       Action
> -----------------------------------------------------
> (1) Realtime      0-7                           Writeback
> (2) Best-effort     0                           Writeback
> (2) Best-effort   1-6                           Original bcache logic
> (2) Best-effort     7                           Bypass cache
> (3) Idle          n/a                           Bypass cache

Not sure it's a good idea, at all. If I set cache policy to, say,
write-through, then I expect write-back to never happen, regardless of what
userspace does with IO priority.

Similarly, using low IO priority (idle or best effort-7) should not make IO
*slow regardless of any other IO load* (which would happen if cache is
completely bypassed).

Right now, it looks to me as inappropriate mixing of different concepts.
Unless I fail to understand something.

Pavel Goran
  

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/2] bcache: introduce ioprio-based bypass/writeback hints
  2016-09-27  1:43 ` Pavel Goran
@ 2016-09-27  2:43   ` Kai Krakow
  2016-09-28 18:21     ` Eric Wheeler
  0 siblings, 1 reply; 11+ messages in thread
From: Kai Krakow @ 2016-09-27  2:43 UTC (permalink / raw)
  To: linux-bcache

Am Tue, 27 Sep 2016 08:43:09 +0700
schrieb Pavel Goran <via-bcache@pvgoran.name>:

> Hello Eric,
> 
> Tuesday, September 27, 2016, 6:17:22 AM, you wrote:
> 
> > Add support to bcache hinting functions and sysfs to hint by the
> > ioprio of 'current' which can be configured with `ionice`.  
> 
> > Cache hinting is configurable by writing 'class,level' pairs to
> > sysfs. These are the defaults:
> >         echo 2,7 > /sys/block/bcache0/bcache/ioprio_bypass
> >         echo 2,0 > /sys/block/bcache0/bcache/ioprio_writeback  
> 
> > (-p) IO Class    (-n) Class level       Action
> > -----------------------------------------------------
> > (1) Realtime      0-7                           Writeback
> > (2) Best-effort     0                           Writeback
> > (2) Best-effort   1-6                           Original bcache
> > logic (2) Best-effort     7                           Bypass cache
> > (3) Idle          n/a                           Bypass cache  
> 
> Not sure it's a good idea, at all. If I set cache policy to, say,
> write-through, then I expect write-back to never happen, regardless
> of what userspace does with IO priority.
> 
> Similarly, using low IO priority (idle or best effort-7) should not
> make IO *slow regardless of any other IO load* (which would happen if
> cache is completely bypassed).
> 
> Right now, it looks to me as inappropriate mixing of different
> concepts. Unless I fail to understand something.

I'm also uncomfortable with mixing concepts but I like the idea of
this. In the end, "idle" means that you don't care about the
performance of the process anyways.

I think the kernel already supports cache hinting in the sense of
"don't pollute my cache with these file operations". It should maybe
better hook in with that but I think this was already discussed here.
The outcome was that mixing different cache levels is also not a good
idea.

Currently, sticking to using "ioprio_bypass" would probably the best we
can easily get. I strongly object to enable "ioprio_writeback" to
bypass whatever policy was assigned to the device.

So, +1 for ioprio_bypass, but NACK for ioprio_writeback...

Maybe it would make sense to have another IO class like "bulk" which
could work like "idle" but also bypasses writing to caches and thus
stops flushing important data out of cache (and as a side-effect also
reduces flash wearing for bulk operations).


-- 
Regards,
Kai

Replies to list-only preferred.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/2] bcache: introduce ioprio-based bypass/writeback hints
  2016-09-27  2:43   ` Kai Krakow
@ 2016-09-28 18:21     ` Eric Wheeler
  2016-09-29  6:57       ` Re[2]: " Pavel Goran
  0 siblings, 1 reply; 11+ messages in thread
From: Eric Wheeler @ 2016-09-28 18:21 UTC (permalink / raw)
  To: Kai Krakow; +Cc: linux-bcache, Pavel Goran

On Tue, 27 Sep 2016, Kai Krakow wrote:

> Am Tue, 27 Sep 2016 08:43:09 +0700
> schrieb Pavel Goran <via-bcache@pvgoran.name>:
> 
> > Hello Eric,
> > 
> > Tuesday, September 27, 2016, 6:17:22 AM, you wrote:
> > 
> > > Add support to bcache hinting functions and sysfs to hint by the
> > > ioprio of 'current' which can be configured with `ionice`.  
> > 
> > > Cache hinting is configurable by writing 'class,level' pairs to
> > > sysfs. These are the defaults:
> > >         echo 2,7 > /sys/block/bcache0/bcache/ioprio_bypass
> > >         echo 2,0 > /sys/block/bcache0/bcache/ioprio_writeback  
> > 
> > > (-p) IO Class    (-n) Class level       Action
> > > -----------------------------------------------------
> > > (1) Realtime      0-7                           Writeback
> > > (2) Best-effort     0                           Writeback
> > > (2) Best-effort   1-6                           Original bcache
> > > logic (2) Best-effort     7                           Bypass cache
> > > (3) Idle          n/a                           Bypass cache  
> > 
> > Not sure it's a good idea, at all. If I set cache policy to, say,
> > write-through, then I expect write-back to never happen, regardless
> > of what userspace does with IO priority.

It respects policy first of course.  You can see the top of 
should_writeback() in the patch as context:
	if (cache_mode != CACHE_MODE_WRITEBACK || [...]

> > Similarly, using low IO priority (idle or best effort-7) should not
> > make IO *slow regardless of any other IO load* (which would happen if
> > cache is completely bypassed).

Perhaps I should explain my motivation for writing such a patch:

We run 10's of terabytes through bcache in writeback mode each week for 
backup verifies.  Since most verifies read snapshots, the pattern appears 
random so bcache pollutes the cache with data that it will never read 
again. Thus, we really needed an ioprio_bypass implementation to save on 
SSD wearout costs.

We have VMs that do a lot of writing, but for which IO latency is not an 
issue. Thus, ioprio_bypasss reduces the erase cycle load on the SSDs for 
those processes as well. There are a few VMs that benefit from 100% 
caching, thus ioprio_writeback can help with performance.

Unnecessary cache reads/writes cause two problems:

1. Performance degrades because the cache is polluted with unnecessary 
   data.

2. It costs real money when SSDs wear out. I have no idea how many we've 
   purchased because of the cost of caching that provides performance we 
   need.


To address your question, though, about making IO slow regardless of other 
IO:

I tested, and even if a read is in an IO class flagged to be bypass (lower 
priority than ioprio_bypass), it will read from the cache if it's already 
hot. In addition, that bypass-flagged read for a block that was in the 
cache does not get demoted---it stays in the cache for processes that need 
it.

If not already in cache, it will read from the backing device and will not 
promote into (pollute) your cache. Having idle IOs bypass the cache can 
increase performance everywhere else since you probably don't care about 
the performance of idle IOs.

> In the end, "idle" means that you don't care about the performance of 
> the process anyways.

Exactly. If an IO is idle, then chances are you've made it idle because 
you care more about other tasks' IO performance. If idle did not bypass 
the cache, then it would pollute the cache and your more important cache 
would have less room for hot data.

> I think the kernel already supports cache hinting in the sense of
> "don't pollute my cache with these file operations". It should maybe
> better hook in with that but I think this was already discussed here.
> The outcome was that mixing different cache levels is also not a good
> idea.

I believe you are referring to the page cache, and yes it supports hinting 
through fadvise and madvise. I've actually investigated this and others 
have already tried. The best route for plumbing is by fiddling page table 
bits to store ioprio bits, but everything thinks that is a bit of a hack. 

See this thread where others have tried and been rejected: 
	https://lkml.org/lkml/2014/10/29/698

As it turns out, fadvise and madvise are too granular for most cases. 
Usually system administrators are the ones making choices about cache 
hierarchy and process priority, whereas, software developers are generally 
implementing fadvise/madvise within their userspace code. The only way to 
push fadvise/madvise down from the outside of a process call is to hook 
the system call with something ugly like LD_PRELOAD; similar 
implementations have been made like libeatmydata.so to turn off 
synchronous writes on existing software without patching and recompiling.

For those of us using direct IO (O_DIRECT), page cache hints are useless 
because the memory cache is being bypassed. This is our use case. We wish 
to avoid the page cache altogether, but influence the block layer cache 
with IO priorities.


> Currently, sticking to using "ioprio_bypass" would probably the best we
> can easily get. I strongly object to enable "ioprio_writeback" to
> bypass whatever policy was assigned to the device.
>
> So, +1 for ioprio_bypass, but NACK for ioprio_writeback...
> 

As above, the policy is respected. This ioprio implementation will not 
override the cache policy and only affects writes if the cache mode is in 
writeback.


> Maybe it would make sense to have another IO class like "bulk" which
> could work like "idle" but also bypasses writing to caches and thus
> stops flushing important data out of cache (and as a side-effect also
> reduces flash wearing for bulk operations).

If you really want idle IO that can writeback, then I can set the default 
for ioprio_writeback and/or ioprio_bypass to be disabled until a user 
explicitly sets them.

Presently, they are configured the way that I would use them in our 
infrastructure. For us, most IO priorities are in the best-effort class 
(-c 2), and levels 1-6 give us plenty of room for organizing priorities. 

Very few processes are low latency (-c 1), but those that are need to 
complete as soon as possible and are thus hinted to always writeback. It 
is also nice to have a non-realtime ioprio of 2,0 for things that need to 
writeback but do not need the throughput limiting time slicing that 
happens in the realtime class in order to provide low latency guarantees.

(One of our systems has only a 13% hit ratio because of pollution. I look 
forward to seeing what this metric becomes after tuning the ioprio 
parameters.)

The block layer is being refactored right now and core changes are not 
being accepted until the legacy scheduling code has been removed or 
deprecated in favor of the blk-mq code. Adding a fourth IO class of "bulk" 
would require modifications to the existing IO schedulers that would 
certainly be rejected upstream.

For now, adding configurable tunables to bcache is our best option to 
provide a feature that has the capacity to increase performance and the 
longevity of flash devices.

--
Eric Wheeler


> 
> 
> -- 
> Regards,
> Kai
> 
> Replies to list-only preferred.
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-bcache" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/2] bcache: introduce ioprio-based bypass/writeback hints
  2016-09-26 23:17 [PATCH 1/2] bcache: introduce ioprio-based bypass/writeback hints Eric Wheeler
  2016-09-27  1:43 ` Pavel Goran
@ 2016-09-28 19:26 ` Kai Krakow
  2016-09-28 21:38   ` Eric Wheeler
  1 sibling, 1 reply; 11+ messages in thread
From: Kai Krakow @ 2016-09-28 19:26 UTC (permalink / raw)
  To: linux-bcache

Am Mon, 26 Sep 2016 16:17:22 -0700
schrieb Eric Wheeler <git@linux.ewheeler.net>:

> Add support to bcache hinting functions and sysfs to hint by the
> ioprio of 'current' which can be configured with `ionice`.
> 
> Cache hinting is configurable by writing 'class,level' pairs to sysfs.
> These are the defaults:
> 	echo 2,7 > /sys/block/bcache0/bcache/ioprio_bypass
> 	echo 2,0 > /sys/block/bcache0/bcache/ioprio_writeback
> 
> (-p) IO Class	 (-n) Class level	Action
> -----------------------------------------------------
> (1) Realtime	  0-7				Writeback
> (2) Best-effort     0 				Writeback
> (2) Best-effort   1-6				Original bcache
> logic (2) Best-effort     7				Bypass
> cache (3) Idle          n/a				Bypass
> cache
> 
> See `man ionice` for more ioprio detail.

This patch fails to apply on 4.7.5... Is it possible to adapt for that
version?

> Signed-off-by: Eric Wheeler <bcache@linux.ewheeler.net>
> ---
>  drivers/md/bcache/bcache.h    |  3 +++
>  drivers/md/bcache/request.c   | 13 +++++++++++++
>  drivers/md/bcache/sysfs.c     | 45
> +++++++++++++++++++++++++++++++++++++++++++
> drivers/md/bcache/writeback.c |  3 +++ drivers/md/bcache/writeback.h
> | 13 +++++++++++++ 5 files changed, 77 insertions(+)
> 
> diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
> index 6b420a5..bf945fa 100644
> --- a/drivers/md/bcache/bcache.h
> +++ b/drivers/md/bcache/bcache.h
> @@ -367,6 +367,9 @@ struct cached_dev {
>  	unsigned		writeback_rate_update_seconds;
>  	unsigned		writeback_rate_d_term;
>  	unsigned		writeback_rate_p_term_inverse;
> +
> +	unsigned short		ioprio_writeback;
> +	unsigned short		ioprio_bypass;
>  };
>  
>  enum alloc_reserve {
> diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
> index 4b177fe..0c0a2f6 100644
> --- a/drivers/md/bcache/request.c
> +++ b/drivers/md/bcache/request.c
> @@ -375,6 +375,7 @@ static bool check_should_bypass(struct cached_dev
> *dc, struct bio *bio) unsigned sectors, congested =
> bch_get_congested(c); struct task_struct *task = current;
>  	struct io *i;
> +	struct io_context *ioc;
>  
>  	if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
>  	    c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
> @@ -386,6 +387,18 @@ static bool check_should_bypass(struct
> cached_dev *dc, struct bio *bio) op_is_write(bio_op(bio))))
>  		goto skip;
>  
> +	/* If process ioprio is lower-or-equal to dc->ioprio_bypass,
> then
> +	 * hint for bypass. Note that a lower-priority IO class+value
> +	 * has a greater numeric value. */
> +	ioc = get_task_io_context(current, GFP_ATOMIC, NUMA_NO_NODE);
> +	if (ioc
> +		&& ioprio_valid(ioc->ioprio)
> +		&& ioprio_valid(dc->ioprio_writeback)
> +		&& ioc->ioprio >= dc->ioprio_bypass) {
> +		put_io_context(ioc);
> +		return true;
> +	}
> +
>  	if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) ||
>  	    bio_sectors(bio) & (c->sb.block_size - 1)) {
>  		pr_debug("skipping unaligned io");
> diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
> index b3ff57d..83ac010 100644
> --- a/drivers/md/bcache/sysfs.c
> +++ b/drivers/md/bcache/sysfs.c
> @@ -106,6 +106,9 @@ rw_attribute(btree_shrinker_disabled);
>  rw_attribute(copy_gc_enabled);
>  rw_attribute(size);
>  
> +rw_attribute(ioprio_writeback);
> +rw_attribute(ioprio_bypass);
> +
>  SHOW(__bch_cached_dev)
>  {
>  	struct cached_dev *dc = container_of(kobj, struct cached_dev,
> @@ -182,6 +185,17 @@ SHOW(__bch_cached_dev)
>  		return strlen(buf);
>  	}
>  
> +	if (attr == &sysfs_ioprio_bypass)
> +		return snprintf(buf, PAGE_SIZE-1, "%d,%ld\n",
> +			IOPRIO_PRIO_CLASS(dc->ioprio_bypass),
> +			IOPRIO_PRIO_DATA(dc->ioprio_bypass));
> +
> +	if (attr == &sysfs_ioprio_writeback)
> +		return snprintf(buf, PAGE_SIZE-1, "%d,%ld\n",
> +			IOPRIO_PRIO_CLASS(dc->ioprio_writeback),
> +			IOPRIO_PRIO_DATA(dc->ioprio_writeback));
> +
> +
>  #undef var
>  	return 0;
>  }
> @@ -194,6 +208,8 @@ STORE(__cached_dev)
>  	unsigned v = size;
>  	struct cache_set *c;
>  	struct kobj_uevent_env *env;
> +	unsigned ioprio_class = 0; /* invalid initial ioprio values
> */
> +	unsigned ioprio_value = IOPRIO_BE_NR+1;
>  
>  #define d_strtoul(var)		sysfs_strtoul(var, dc->var)
>  #define d_strtoul_nonzero(var)	sysfs_strtoul_clamp(var,
> dc->var, 1, INT_MAX) @@ -282,6 +298,33 @@ STORE(__cached_dev)
>  	if (attr == &sysfs_stop)
>  		bcache_device_stop(&dc->disk);
>  
> +	if (attr == &sysfs_ioprio_writeback) {
> +		if (sscanf(buf, "%u,%u", &ioprio_class,
> &ioprio_value) != 2
> +			|| ioprio_class > IOPRIO_CLASS_IDLE
> +			|| ioprio_class < IOPRIO_CLASS_RT
> +			|| ioprio_value >= IOPRIO_BE_NR)
> +			pr_err("bcache: ioprio_writeback invalid
> ioprio class: %u data value: %u\n",
> +				ioprio_class, ioprio_value);
> +		else if (ioprio_class == IOPRIO_CLASS_IDLE)
> +			dc->ioprio_writeback =
> IOPRIO_PRIO_VALUE(ioprio_class, 0);
> +		else
> +			dc->ioprio_writeback =
> IOPRIO_PRIO_VALUE(ioprio_class, ioprio_value);
> +	}
> +
> +	if (attr == &sysfs_ioprio_bypass)
> +	{
> +		if (sscanf(buf, "%u,%u", &ioprio_class,
> &ioprio_value) != 2
> +			|| ioprio_class > IOPRIO_CLASS_IDLE
> +			|| ioprio_class < IOPRIO_CLASS_RT
> +			|| ioprio_value >= IOPRIO_BE_NR)
> +			pr_err("bcache: ioprio_bypass invalid ioprio
> class: %u data value: %u\n",
> +				ioprio_class, ioprio_value);
> +		else if (ioprio_class == IOPRIO_CLASS_IDLE)
> +			dc->ioprio_writeback =
> IOPRIO_PRIO_VALUE(ioprio_class, 0);
> +		else
> +			dc->ioprio_writeback =
> IOPRIO_PRIO_VALUE(ioprio_class, ioprio_value);
> +	}
> +
>  	return size;
>  }
>  
> @@ -334,6 +377,8 @@ static struct attribute *bch_cached_dev_files[] =
> { &sysfs_verify,
>  	&sysfs_bypass_torture_test,
>  #endif
> +	&sysfs_ioprio_bypass,
> +	&sysfs_ioprio_writeback,
>  	NULL
>  };
>  KTYPE(bch_cached_dev);
> diff --git a/drivers/md/bcache/writeback.c
> b/drivers/md/bcache/writeback.c index d9fd2a6..9772988 100644
> --- a/drivers/md/bcache/writeback.c
> +++ b/drivers/md/bcache/writeback.c
> @@ -514,6 +514,9 @@ void bch_cached_dev_writeback_init(struct
> cached_dev *dc) dc->writeback_rate_d_term	= 30;
>  	dc->writeback_rate_p_term_inverse = 6000;
>  
> +	dc->ioprio_writeback = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
> +	dc->ioprio_bypass    = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE,
> IOPRIO_BE_NR-1); +
>  	INIT_DELAYED_WORK(&dc->writeback_rate_update,
> update_writeback_rate); }
>  
> diff --git a/drivers/md/bcache/writeback.h
> b/drivers/md/bcache/writeback.h index 301eaf5..a21fd87 100644
> --- a/drivers/md/bcache/writeback.h
> +++ b/drivers/md/bcache/writeback.h
> @@ -43,6 +43,7 @@ static inline bool should_writeback(struct
> cached_dev *dc, struct bio *bio, unsigned cache_mode, bool would_skip)
>  {
>  	unsigned in_use = dc->disk.c->gc_stats.in_use;
> +	struct io_context *ioc;
>  
>  	if (cache_mode != CACHE_MODE_WRITEBACK ||
>  	    test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
> @@ -57,6 +58,18 @@ static inline bool should_writeback(struct
> cached_dev *dc, struct bio *bio, if (would_skip)
>  		return false;
>  
> +	/* If process ioprio is higher-or-equal to
> dc->ioprio_writeback, then
> +	 * hint for writeback. Note that a higher-priority IO
> class+value
> +	 * has a lesser numeric value. */
> +	ioc = get_task_io_context(current, GFP_ATOMIC, NUMA_NO_NODE);
> +	if (ioc
> +		&& ioprio_valid(ioc->ioprio)
> +		&& ioprio_valid(dc->ioprio_writeback)
> +		&& ioc->ioprio <= dc->ioprio_writeback) {
> +		put_io_context(ioc);
> +		return true;
> +	}
> +
>  	return bio->bi_opf & REQ_SYNC ||
>  		in_use <= CUTOFF_WRITEBACK;
>  }



-- 
Regards,
Kai

Replies to list-only preferred.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/2] bcache: introduce ioprio-based bypass/writeback hints
  2016-09-28 19:26 ` Kai Krakow
@ 2016-09-28 21:38   ` Eric Wheeler
  2016-09-29  2:19     ` Kai Krakow
  0 siblings, 1 reply; 11+ messages in thread
From: Eric Wheeler @ 2016-09-28 21:38 UTC (permalink / raw)
  To: Kai Krakow; +Cc: linux-bcache

On Wed, 28 Sep 2016, Kai Krakow wrote:

> Am Mon, 26 Sep 2016 16:17:22 -0700
> schrieb Eric Wheeler <git@linux.ewheeler.net>:
> 
> > Add support to bcache hinting functions and sysfs to hint by the
> > ioprio of 'current' which can be configured with `ionice`.
> > 
> > Cache hinting is configurable by writing 'class,level' pairs to sysfs.
> > These are the defaults:
> > 	echo 2,7 > /sys/block/bcache0/bcache/ioprio_bypass
> > 	echo 2,0 > /sys/block/bcache0/bcache/ioprio_writeback
> > 
> > (-p) IO Class	 (-n) Class level	Action
> > -----------------------------------------------------
> > (1) Realtime	  0-7				Writeback
> > (2) Best-effort     0 				Writeback
> > (2) Best-effort   1-6				Original bcache
> > logic (2) Best-effort     7				Bypass
> > cache (3) Idle          n/a				Bypass
> > cache
> > 
> > See `man ionice` for more ioprio detail.
> 
> This patch fails to apply on 4.7.5... Is it possible to adapt for that
> version?
>

Try this one. It's my 4.1 backport which applies cleanly to 4.7, too.
 
commit 03d581518c28b9361cfde85922919baa1e430c54
Author: Eric Wheeler <git@linux.ewheeler.net>
Date:   Mon Sep 26 16:17:22 2016 -0700

    bcache: introduce ioprio-based bypass/writeback hints
    
    Add support to bcache hinting functions and sysfs to hint by the ioprio of
    'current' which can be configured with `ionice`.
    
    Cache hinting is configurable by writing 'class,level' pairs to sysfs.
    These are the defaults:
    	echo 2,7 > /sys/block/bcache0/bcache/ioprio_bypass
    	echo 2,0 > /sys/block/bcache0/bcache/ioprio_writeback
    
    (-c) IO Class	 (-n) Class level	Action
    -----------------------------------------------------
    (1) Realtime	  0-7				Writeback
    (2) Best-effort     0 				Writeback
    (2) Best-effort   1-6				Original bcache logic
    (2) Best-effort     7				Bypass cache
    (3) Idle          n/a				Bypass cache
    
    See `man ionice` for more ioprio detail.
    
    Signed-off-by: Eric Wheeler <bcache@linux.ewheeler.net>
    
    Conflicts:
    	drivers/md/bcache/writeback.h

diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 04f7bc2..ff1026f 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -382,6 +382,9 @@ struct cached_dev {
 	unsigned		writeback_rate_update_seconds;
 	unsigned		writeback_rate_d_term;
 	unsigned		writeback_rate_p_term_inverse;
+
+	unsigned short		ioprio_writeback;
+	unsigned short		ioprio_bypass;
 };
 
 enum alloc_reserve {
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index ab43fad..7b36da3 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -369,6 +369,7 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
 	unsigned sectors, congested = bch_get_congested(c);
 	struct task_struct *task = current;
 	struct io *i;
+	struct io_context *ioc;
 
 	if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
 	    c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
@@ -380,6 +381,18 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
 	     (bio->bi_rw & REQ_WRITE)))
 		goto skip;
 
+	/* If process ioprio is lower-or-equal to dc->ioprio_bypass, then
+	 * hint for bypass. Note that a lower-priority IO class+value
+	 * has a greater numeric value. */
+	ioc = get_task_io_context(current, GFP_ATOMIC, NUMA_NO_NODE);
+	if (ioc
+		&& ioprio_valid(ioc->ioprio)
+		&& ioprio_valid(dc->ioprio_writeback)
+		&& ioc->ioprio >= dc->ioprio_bypass) {
+		put_io_context(ioc);
+		return true;
+	}
+
 	if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) ||
 	    bio_sectors(bio) & (c->sb.block_size - 1)) {
 		pr_debug("skipping unaligned io");
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index b3ff57d..83ac010 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -106,6 +106,9 @@ rw_attribute(btree_shrinker_disabled);
 rw_attribute(copy_gc_enabled);
 rw_attribute(size);
 
+rw_attribute(ioprio_writeback);
+rw_attribute(ioprio_bypass);
+
 SHOW(__bch_cached_dev)
 {
 	struct cached_dev *dc = container_of(kobj, struct cached_dev,
@@ -182,6 +185,17 @@ SHOW(__bch_cached_dev)
 		return strlen(buf);
 	}
 
+	if (attr == &sysfs_ioprio_bypass)
+		return snprintf(buf, PAGE_SIZE-1, "%d,%ld\n",
+			IOPRIO_PRIO_CLASS(dc->ioprio_bypass),
+			IOPRIO_PRIO_DATA(dc->ioprio_bypass));
+
+	if (attr == &sysfs_ioprio_writeback)
+		return snprintf(buf, PAGE_SIZE-1, "%d,%ld\n",
+			IOPRIO_PRIO_CLASS(dc->ioprio_writeback),
+			IOPRIO_PRIO_DATA(dc->ioprio_writeback));
+
+
 #undef var
 	return 0;
 }
@@ -194,6 +208,8 @@ STORE(__cached_dev)
 	unsigned v = size;
 	struct cache_set *c;
 	struct kobj_uevent_env *env;
+	unsigned ioprio_class = 0; /* invalid initial ioprio values */
+	unsigned ioprio_value = IOPRIO_BE_NR+1;
 
 #define d_strtoul(var)		sysfs_strtoul(var, dc->var)
 #define d_strtoul_nonzero(var)	sysfs_strtoul_clamp(var, dc->var, 1, INT_MAX)
@@ -282,6 +298,33 @@ STORE(__cached_dev)
 	if (attr == &sysfs_stop)
 		bcache_device_stop(&dc->disk);
 
+	if (attr == &sysfs_ioprio_writeback) {
+		if (sscanf(buf, "%u,%u", &ioprio_class, &ioprio_value) != 2
+			|| ioprio_class > IOPRIO_CLASS_IDLE
+			|| ioprio_class < IOPRIO_CLASS_RT
+			|| ioprio_value >= IOPRIO_BE_NR)
+			pr_err("bcache: ioprio_writeback invalid ioprio class: %u data value: %u\n",
+				ioprio_class, ioprio_value);
+		else if (ioprio_class == IOPRIO_CLASS_IDLE)
+			dc->ioprio_writeback = IOPRIO_PRIO_VALUE(ioprio_class, 0);
+		else
+			dc->ioprio_writeback = IOPRIO_PRIO_VALUE(ioprio_class, ioprio_value);
+	}
+
+	if (attr == &sysfs_ioprio_bypass)
+	{
+		if (sscanf(buf, "%u,%u", &ioprio_class, &ioprio_value) != 2
+			|| ioprio_class > IOPRIO_CLASS_IDLE
+			|| ioprio_class < IOPRIO_CLASS_RT
+			|| ioprio_value >= IOPRIO_BE_NR)
+			pr_err("bcache: ioprio_bypass invalid ioprio class: %u data value: %u\n",
+				ioprio_class, ioprio_value);
+		else if (ioprio_class == IOPRIO_CLASS_IDLE)
+			dc->ioprio_writeback = IOPRIO_PRIO_VALUE(ioprio_class, 0);
+		else
+			dc->ioprio_writeback = IOPRIO_PRIO_VALUE(ioprio_class, ioprio_value);
+	}
+
 	return size;
 }
 
@@ -334,6 +377,8 @@ static struct attribute *bch_cached_dev_files[] = {
 	&sysfs_verify,
 	&sysfs_bypass_torture_test,
 #endif
+	&sysfs_ioprio_bypass,
+	&sysfs_ioprio_writeback,
 	NULL
 };
 KTYPE(bch_cached_dev);
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index f1986bc..cc2c7ae 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -494,6 +494,9 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
 	dc->writeback_rate_d_term	= 30;
 	dc->writeback_rate_p_term_inverse = 6000;
 
+	dc->ioprio_writeback = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
+	dc->ioprio_bypass    = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_BE_NR-1);
+
 	INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
 }
 
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 0a9dab1..defd168 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -43,6 +43,7 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
 				    unsigned cache_mode, bool would_skip)
 {
 	unsigned in_use = dc->disk.c->gc_stats.in_use;
+	struct io_context *ioc;
 
 	if (cache_mode != CACHE_MODE_WRITEBACK ||
 	    test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
@@ -57,6 +58,18 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio,
 	if (would_skip)
 		return false;
 
+	/* If process ioprio is higher-or-equal to dc->ioprio_writeback, then
+	 * hint for writeback. Note that a higher-priority IO class+value
+	 * has a lesser numeric value. */
+	ioc = get_task_io_context(current, GFP_ATOMIC, NUMA_NO_NODE);
+	if (ioc
+		&& ioprio_valid(ioc->ioprio)
+		&& ioprio_valid(dc->ioprio_writeback)
+		&& ioc->ioprio <= dc->ioprio_writeback) {
+		put_io_context(ioc);
+		return true;
+	}
+
 	return bio->bi_rw & REQ_SYNC ||
 		in_use <= CUTOFF_WRITEBACK;
 }

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/2] bcache: introduce ioprio-based bypass/writeback hints
  2016-09-28 21:38   ` Eric Wheeler
@ 2016-09-29  2:19     ` Kai Krakow
  0 siblings, 0 replies; 11+ messages in thread
From: Kai Krakow @ 2016-09-29  2:19 UTC (permalink / raw)
  To: linux-bcache

Am Wed, 28 Sep 2016 14:38:37 -0700 (PDT)
schrieb Eric Wheeler <bcache@lists.ewheeler.net>:

> On Wed, 28 Sep 2016, Kai Krakow wrote:
> 
> > Am Mon, 26 Sep 2016 16:17:22 -0700
> > schrieb Eric Wheeler <git@linux.ewheeler.net>:
> >   
> > > Add support to bcache hinting functions and sysfs to hint by the
> > > ioprio of 'current' which can be configured with `ionice`.
> > > 
> > > Cache hinting is configurable by writing 'class,level' pairs to
> > > sysfs. These are the defaults:
> > > 	echo 2,7 > /sys/block/bcache0/bcache/ioprio_bypass
> > > 	echo 2,0 > /sys/block/bcache0/bcache/ioprio_writeback
> > > 
> > > (-p) IO Class	 (-n) Class level	Action
> > > -----------------------------------------------------
> > > (1) Realtime	  0-7				Writeback
> > > (2) Best-effort     0 				Writeback
> > > (2) Best-effort   1-6				Original
> > > bcache logic (2) Best-effort     7
> > > Bypass cache (3) Idle          n/a
> > > Bypass cache
> > > 
> > > See `man ionice` for more ioprio detail.  
> > 
> > This patch fails to apply on 4.7.5... Is it possible to adapt for
> > that version?
> >  
> 
> Try this one. It's my 4.1 backport which applies cleanly to 4.7, too.

Thanks, this applies.

I wonder if this also has an effect if I only use deadline scheduler? I
think only cfq uses scheduling hints but even with deadline, processes
know about their io priorities. So, bcache should still be able to use
this even with deadline?


> commit 03d581518c28b9361cfde85922919baa1e430c54
> Author: Eric Wheeler <git@linux.ewheeler.net>
> Date:   Mon Sep 26 16:17:22 2016 -0700
> 
>     bcache: introduce ioprio-based bypass/writeback hints
>     
>     Add support to bcache hinting functions and sysfs to hint by the
> ioprio of 'current' which can be configured with `ionice`.
>     
>     Cache hinting is configurable by writing 'class,level' pairs to
> sysfs. These are the defaults:
>     	echo 2,7 > /sys/block/bcache0/bcache/ioprio_bypass
>     	echo 2,0 > /sys/block/bcache0/bcache/ioprio_writeback
>     
>     (-c) IO Class	 (-n) Class level	Action
>     -----------------------------------------------------
>     (1) Realtime	  0-7				Writeback
>     (2) Best-effort     0 				Writeback
>     (2) Best-effort   1-6				Original
> bcache logic (2) Best-effort     7
> Bypass cache (3) Idle          n/a
> Bypass cache 
>     See `man ionice` for more ioprio detail.
>     
>     Signed-off-by: Eric Wheeler <bcache@linux.ewheeler.net>
>     
>     Conflicts:
>     	drivers/md/bcache/writeback.h
> 
> diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
> index 04f7bc2..ff1026f 100644
> --- a/drivers/md/bcache/bcache.h
> +++ b/drivers/md/bcache/bcache.h
> @@ -382,6 +382,9 @@ struct cached_dev {
>  	unsigned		writeback_rate_update_seconds;
>  	unsigned		writeback_rate_d_term;
>  	unsigned		writeback_rate_p_term_inverse;
> +
> +	unsigned short		ioprio_writeback;
> +	unsigned short		ioprio_bypass;
>  };
>  
>  enum alloc_reserve {
> diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
> index ab43fad..7b36da3 100644
> --- a/drivers/md/bcache/request.c
> +++ b/drivers/md/bcache/request.c
> @@ -369,6 +369,7 @@ static bool check_should_bypass(struct cached_dev
> *dc, struct bio *bio) unsigned sectors, congested =
> bch_get_congested(c); struct task_struct *task = current;
>  	struct io *i;
> +	struct io_context *ioc;
>  
>  	if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
>  	    c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
> @@ -380,6 +381,18 @@ static bool check_should_bypass(struct
> cached_dev *dc, struct bio *bio) (bio->bi_rw & REQ_WRITE)))
>  		goto skip;
>  
> +	/* If process ioprio is lower-or-equal to dc->ioprio_bypass,
> then
> +	 * hint for bypass. Note that a lower-priority IO class+value
> +	 * has a greater numeric value. */
> +	ioc = get_task_io_context(current, GFP_ATOMIC, NUMA_NO_NODE);
> +	if (ioc
> +		&& ioprio_valid(ioc->ioprio)
> +		&& ioprio_valid(dc->ioprio_writeback)
> +		&& ioc->ioprio >= dc->ioprio_bypass) {
> +		put_io_context(ioc);
> +		return true;
> +	}
> +
>  	if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) ||
>  	    bio_sectors(bio) & (c->sb.block_size - 1)) {
>  		pr_debug("skipping unaligned io");
> diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
> index b3ff57d..83ac010 100644
> --- a/drivers/md/bcache/sysfs.c
> +++ b/drivers/md/bcache/sysfs.c
> @@ -106,6 +106,9 @@ rw_attribute(btree_shrinker_disabled);
>  rw_attribute(copy_gc_enabled);
>  rw_attribute(size);
>  
> +rw_attribute(ioprio_writeback);
> +rw_attribute(ioprio_bypass);
> +
>  SHOW(__bch_cached_dev)
>  {
>  	struct cached_dev *dc = container_of(kobj, struct cached_dev,
> @@ -182,6 +185,17 @@ SHOW(__bch_cached_dev)
>  		return strlen(buf);
>  	}
>  
> +	if (attr == &sysfs_ioprio_bypass)
> +		return snprintf(buf, PAGE_SIZE-1, "%d,%ld\n",
> +			IOPRIO_PRIO_CLASS(dc->ioprio_bypass),
> +			IOPRIO_PRIO_DATA(dc->ioprio_bypass));
> +
> +	if (attr == &sysfs_ioprio_writeback)
> +		return snprintf(buf, PAGE_SIZE-1, "%d,%ld\n",
> +			IOPRIO_PRIO_CLASS(dc->ioprio_writeback),
> +			IOPRIO_PRIO_DATA(dc->ioprio_writeback));
> +
> +
>  #undef var
>  	return 0;
>  }
> @@ -194,6 +208,8 @@ STORE(__cached_dev)
>  	unsigned v = size;
>  	struct cache_set *c;
>  	struct kobj_uevent_env *env;
> +	unsigned ioprio_class = 0; /* invalid initial ioprio values
> */
> +	unsigned ioprio_value = IOPRIO_BE_NR+1;
>  
>  #define d_strtoul(var)		sysfs_strtoul(var, dc->var)
>  #define d_strtoul_nonzero(var)	sysfs_strtoul_clamp(var,
> dc->var, 1, INT_MAX) @@ -282,6 +298,33 @@ STORE(__cached_dev)
>  	if (attr == &sysfs_stop)
>  		bcache_device_stop(&dc->disk);
>  
> +	if (attr == &sysfs_ioprio_writeback) {
> +		if (sscanf(buf, "%u,%u", &ioprio_class,
> &ioprio_value) != 2
> +			|| ioprio_class > IOPRIO_CLASS_IDLE
> +			|| ioprio_class < IOPRIO_CLASS_RT
> +			|| ioprio_value >= IOPRIO_BE_NR)
> +			pr_err("bcache: ioprio_writeback invalid
> ioprio class: %u data value: %u\n",
> +				ioprio_class, ioprio_value);
> +		else if (ioprio_class == IOPRIO_CLASS_IDLE)
> +			dc->ioprio_writeback =
> IOPRIO_PRIO_VALUE(ioprio_class, 0);
> +		else
> +			dc->ioprio_writeback =
> IOPRIO_PRIO_VALUE(ioprio_class, ioprio_value);
> +	}
> +
> +	if (attr == &sysfs_ioprio_bypass)
> +	{
> +		if (sscanf(buf, "%u,%u", &ioprio_class,
> &ioprio_value) != 2
> +			|| ioprio_class > IOPRIO_CLASS_IDLE
> +			|| ioprio_class < IOPRIO_CLASS_RT
> +			|| ioprio_value >= IOPRIO_BE_NR)
> +			pr_err("bcache: ioprio_bypass invalid ioprio
> class: %u data value: %u\n",
> +				ioprio_class, ioprio_value);
> +		else if (ioprio_class == IOPRIO_CLASS_IDLE)
> +			dc->ioprio_writeback =
> IOPRIO_PRIO_VALUE(ioprio_class, 0);
> +		else
> +			dc->ioprio_writeback =
> IOPRIO_PRIO_VALUE(ioprio_class, ioprio_value);
> +	}
> +
>  	return size;
>  }
>  
> @@ -334,6 +377,8 @@ static struct attribute *bch_cached_dev_files[] =
> { &sysfs_verify,
>  	&sysfs_bypass_torture_test,
>  #endif
> +	&sysfs_ioprio_bypass,
> +	&sysfs_ioprio_writeback,
>  	NULL
>  };
>  KTYPE(bch_cached_dev);
> diff --git a/drivers/md/bcache/writeback.c
> b/drivers/md/bcache/writeback.c index f1986bc..cc2c7ae 100644
> --- a/drivers/md/bcache/writeback.c
> +++ b/drivers/md/bcache/writeback.c
> @@ -494,6 +494,9 @@ void bch_cached_dev_writeback_init(struct
> cached_dev *dc) dc->writeback_rate_d_term	= 30;
>  	dc->writeback_rate_p_term_inverse = 6000;
>  
> +	dc->ioprio_writeback = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
> +	dc->ioprio_bypass    = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE,
> IOPRIO_BE_NR-1); +
>  	INIT_DELAYED_WORK(&dc->writeback_rate_update,
> update_writeback_rate); }
>  
> diff --git a/drivers/md/bcache/writeback.h
> b/drivers/md/bcache/writeback.h index 0a9dab1..defd168 100644
> --- a/drivers/md/bcache/writeback.h
> +++ b/drivers/md/bcache/writeback.h
> @@ -43,6 +43,7 @@ static inline bool should_writeback(struct
> cached_dev *dc, struct bio *bio, unsigned cache_mode, bool would_skip)
>  {
>  	unsigned in_use = dc->disk.c->gc_stats.in_use;
> +	struct io_context *ioc;
>  
>  	if (cache_mode != CACHE_MODE_WRITEBACK ||
>  	    test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
> @@ -57,6 +58,18 @@ static inline bool should_writeback(struct
> cached_dev *dc, struct bio *bio, if (would_skip)
>  		return false;
>  
> +	/* If process ioprio is higher-or-equal to
> dc->ioprio_writeback, then
> +	 * hint for writeback. Note that a higher-priority IO
> class+value
> +	 * has a lesser numeric value. */
> +	ioc = get_task_io_context(current, GFP_ATOMIC, NUMA_NO_NODE);
> +	if (ioc
> +		&& ioprio_valid(ioc->ioprio)
> +		&& ioprio_valid(dc->ioprio_writeback)
> +		&& ioc->ioprio <= dc->ioprio_writeback) {
> +		put_io_context(ioc);
> +		return true;
> +	}
> +
>  	return bio->bi_rw & REQ_SYNC ||
>  		in_use <= CUTOFF_WRITEBACK;
>  }
> 



-- 
Regards,
Kai

Replies to list-only preferred.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re[2]: [PATCH 1/2] bcache: introduce ioprio-based bypass/writeback hints
  2016-09-28 18:21     ` Eric Wheeler
@ 2016-09-29  6:57       ` Pavel Goran
  2016-09-29 18:31         ` Eric Wheeler
  2016-09-29 18:34         ` Kai Krakow
  0 siblings, 2 replies; 11+ messages in thread
From: Pavel Goran @ 2016-09-29  6:57 UTC (permalink / raw)
  To: Eric Wheeler

Hello Eric,

Thursday, September 29, 2016, 1:21:53 AM, you wrote:

> It respects policy first of course.  You can see the top of
> should_writeback() in the patch as context:
>         if (cache_mode != CACHE_MODE_WRITEBACK || [...]

So the priority-dependent logic is only applied when cache mode is
"writeback", right? Then, what does "Original bcache logic" from the table in
the original patch description mean? That is, how does it differ from
"Writeback" from the same table?

Does "Writeback" there mean that bcache would write data to the cache
*always*, even in case of linear write, or in other situations where the
original writeback logic would bypass cache? If so, the table should perhaps
be adjusted to say "Always writeback" instead of "Writeback", and "Original
writeback logic" instead of "Original bcache logic".

> If not already in cache, it will read from the backing device and will not
> promote into (pollute) your cache. Having idle IOs bypass the cache can 
> increase performance everywhere else since you probably don't care about 
> the performance of idle IOs.

Ok, this sounds convincing.

Ideally this would be handled by some kind of priority semantics within the
cache, so that "high-priority" data would always replace "low-priority" data,
and "low-priority" data would only replace "high-priority" data if it's really
old. It's entirely feasible to implement within bcache, but probably it
wouldn't be easy, and might even mean changing cache data format.

Your SSD wearout problem would (again, ideally) be handled by some kind of
custom process attribute (no idea if the kernel even has this kind of concept,
probably not) that would signal bcache to bypass caching for IO that
originates from this process. The additional "bulk" priority class proposed by
Kai would also be a good solution.

However, given that these two approaches are supposedly unavailable, using
existing IO priorities looks like an acceptable solution.

> If you really want idle IO that can writeback, then I can set the default
> for ioprio_writeback and/or ioprio_bypass to be disabled until a user 
> explicitly sets them.

Are you talking about bcache options available via sysfs? It would be good to
have control over the priority-dependent logic in this way (or it's already
implemented?). Not sure what the defaults should be, though. :) It would be
even better to have the possibility to save these flags to the superblock, if
it's something that can be easily implemented.

Pavel Goran
  

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re[2]: [PATCH 1/2] bcache: introduce ioprio-based bypass/writeback hints
  2016-09-29  6:57       ` Re[2]: " Pavel Goran
@ 2016-09-29 18:31         ` Eric Wheeler
  2016-09-29 18:34         ` Kai Krakow
  1 sibling, 0 replies; 11+ messages in thread
From: Eric Wheeler @ 2016-09-29 18:31 UTC (permalink / raw)
  To: Pavel Goran; +Cc: linux-bcache

> Hello Eric,
> 
> Thursday, September 29, 2016, 1:21:53 AM, you wrote:
> 
> > It respects policy first of course.  You can see the top of
> > should_writeback() in the patch as context:
> >         if (cache_mode != CACHE_MODE_WRITEBACK || [...]
> 
> So the priority-dependent logic is only applied when cache mode is
> "writeback", right? 

Correct. It might be better to say that the new priority-dependent logic 
_for writes_ is only applied when cache mode is "writeback" via the new 
syfs entry ioprio_writeback.

For reads, ioprio_bypass is hinted in both writeback and writethrough 
modes.

> Then, what does "Original bcache logic" from the table in the original 
> patch description mean? That is, how does it differ from "Writeback" 
> from the same table?

There are two hints in `struct data_insert_op`:
	iop.bypass, iop.writeback

bcache has an existing function `should_writeback()` which is a hint on 
the cached_dev_write() path.  I've added the writeback portion of the 
ioprio logic there, so if it doesn't match the ioprio set in 
ioprio_writeback, it continues as if my code hadn't been added.

> Does "Writeback" there mean that bcache would write data to the cache
> *always*, even in case of linear write, or in other situations where the
> original writeback logic would bypass cache? If so, the table should perhaps
> be adjusted to say "Always writeback" instead of "Writeback", and "Original
> writeback logic" instead of "Original bcache logic".

Similarly, the function `should_bypass()` sets the iop.bypass hint.  This 
is where the logic as to whether or not the write looks "sequential" is 
held.

The IO is then passed to cached_dev_write() which checks 
should_writeback().  If should_writeback() returns true, it overrides the 
bypass flag.  Thus, iop.bypass is weaker than iop.writeback.  Bcache makes 
other decisions which may override bypass, too, like overlapping of dirty 
data and perhaps other reasons.  

Keep in mind that SSDs are still better at seq-writes than random writes 
because of erase block rewrites handled by the SSD's FTL.  For realtime 
applications 100% writeback might still be preferred, even if the IOs are 
sequential.  This will wear your SSD faster, of course, so set your 
io priorities appropriately.

Pseudo-code call-stack:
	cached_dev_make_request():
	   s->iop.bypass = check_should_bypass()
	   cached_dev_write():
	         if (should_writeback()) {
                    s->iop.bypass = false;
                    s->iop.writeback = true;
                 }
		 submit-based-on-hint( s )

> > If not already in cache, it will read from the backing device and will not
> > promote into (pollute) your cache. Having idle IOs bypass the cache can 
> > increase performance everywhere else since you probably don't care about 
> > the performance of idle IOs.
> 
> Ok, this sounds convincing.
> 
> Ideally this would be handled by some kind of priority semantics within the
> cache, so that "high-priority" data would always replace "low-priority" data,
> and "low-priority" data would only replace "high-priority" data if it's really
> old. It's entirely feasible to implement within bcache, but probably it
> wouldn't be easy, and might even mean changing cache data format.
> 
> Your SSD wearout problem would (again, ideally) be handled by some kind of
> custom process attribute (no idea if the kernel even has this kind of concept,
> probably not) that would signal bcache to bypass caching for IO that
> originates from this process. The additional "bulk" priority class proposed by
> Kai would also be a good solution.

Some SSDs have a SMART attribute for LBAs written---which is a good idea 
to monitor if your SSD supports it---but its not consistent enough to 
build a framework around.  It varies from mfg to mfg.
 
> However, given that these two approaches are supposedly unavailable, using
> existing IO priorities looks like an acceptable solution.

Agreed.  

> > If you really want idle IO that can writeback, then I can set the default
> > for ioprio_writeback and/or ioprio_bypass to be disabled until a user 
> > explicitly sets them.
> 
> Are you talking about bcache options available via sysfs? It would be good to
> have control over the priority-dependent logic in this way (or it's already
> implemented?). 

This patch. It just adds sysfs files to configure ioprio_bypass and 
ioprio_writeback with flags that hook into the existing 
should_bypass()/should_writeback() functions.  

I'm planning a v2 patch to update documentation and allow 0,0 to disable 
either or both features.

> Not sure what the defaults should be, though. :) 

Does anyone else have a suggestion on best defaults? As currently written, 
it is like so:
	writeback for higher priority than `ionice -c2 -n0`
	bypass    for lower  priority than `ionice -c2 -n7`

Users who do not use `ionice` are unaffected by this default since 
unprioritized pids are either "ioprio_invalid()" or have an ioprio of '-c2 -n4'.

> It would be even better to have the possibility to save these flags to 
> the superblock, if it's something that can be easily implemented.

Thats definitely a neat idea.  It would be great to put other flags like 
sequential_cutoff, writeback_*, and perhaps a few other sysfs 
configurations into the superblock so I needn't set them at boot time.  
A superblock patch for another time :)

--
Eric Wheeler

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/2] bcache: introduce ioprio-based bypass/writeback hints
  2016-09-29  6:57       ` Re[2]: " Pavel Goran
  2016-09-29 18:31         ` Eric Wheeler
@ 2016-09-29 18:34         ` Kai Krakow
  2016-09-29 22:50           ` Eric Wheeler
  1 sibling, 1 reply; 11+ messages in thread
From: Kai Krakow @ 2016-09-29 18:34 UTC (permalink / raw)
  To: linux-bcache

Am Thu, 29 Sep 2016 13:57:00 +0700
schrieb Pavel Goran <via-bcache@pvgoran.name>:

> Hello Eric,
> 
> Thursday, September 29, 2016, 1:21:53 AM, you wrote:
> 
> > It respects policy first of course.  You can see the top of
> > should_writeback() in the patch as context:
> >         if (cache_mode != CACHE_MODE_WRITEBACK || [...]  
> 
> So the priority-dependent logic is only applied when cache mode is
> "writeback", right? Then, what does "Original bcache logic" from the
> table in the original patch description mean? That is, how does it
> differ from "Writeback" from the same table?

I understand that when bcache heuristics decide to skip writeback
caching and write directly to the backing device, this would override
it and still force writeback.

OTOH, if you don't use the writeback policy, this will simply not apply.

> Does "Writeback" there mean that bcache would write data to the cache
> *always*, even in case of linear write, or in other situations where
> the original writeback logic would bypass cache?

That's what I understood now.

> If so, the table
> should perhaps be adjusted to say "Always writeback" instead of
> "Writeback", and "Original writeback logic" instead of "Original
> bcache logic".

Makes sense.

> > If not already in cache, it will read from the backing device and
> > will not promote into (pollute) your cache. Having idle IOs bypass
> > the cache can increase performance everywhere else since you
> > probably don't care about the performance of idle IOs.  
> 
> Ok, this sounds convincing.

And it seems to be true, my real-workload tests today showed
consistently higher hit-rates.

> Ideally this would be handled by some kind of priority semantics
> within the cache, so that "high-priority" data would always replace
> "low-priority" data, and "low-priority" data would only replace
> "high-priority" data if it's really old. It's entirely feasible to
> implement within bcache, but probably it wouldn't be easy, and might
> even mean changing cache data format.

I don't think it's worth to add such a management overhead... Maybe one
better stacks multiple cache layers with different policies.

> Your SSD wearout problem would (again, ideally) be handled by some
> kind of custom process attribute (no idea if the kernel even has this
> kind of concept, probably not) that would signal bcache to bypass
> caching for IO that originates from this process. The additional
> "bulk" priority class proposed by Kai would also be a good solution.

That was my thinking behind suggesting a bulk class... To reduce
wearout. I've killed my first SSD within 12 months due to this and
using bcache. Well, I didn't actually kill it, I replaced it when the
lifetime prognosis reached 97%. It was spec'ed for 85 TB data written.
And the usual 5 GB per day pattern just didn't apply due to untuned
btrfs maintenance running in the background (scrubbing,
defragmentation, balancing) via cron.

> However, given that these two approaches are supposedly unavailable,
> using existing IO priorities looks like an acceptable solution.

ACK.

> > If you really want idle IO that can writeback, then I can set the
> > default for ioprio_writeback and/or ioprio_bypass to be disabled
> > until a user explicitly sets them.  
> 
> Are you talking about bcache options available via sysfs? It would be
> good to have control over the priority-dependent logic in this way
> (or it's already implemented?). Not sure what the defaults should be,
> though. :) It would be even better to have the possibility to save
> these flags to the superblock, if it's something that can be easily
> implemented.

I think saving to superblock would be a cool feature. I think the
defaults are very sane. And there should be a flip-switch to turn
on/off this login, default being off probably. Just to fulfill
principle of least surprise. I'd probably always turn it on at the first
opportunity. :-)

It probably allows me to turn off writearound and instead use writeback
for my backup storage.

Still, I need to know if this logic works with the deadline scheduler
(which is not supposed to adhere to IO priorities, but still I guess
bcache will still see them).


-- 
Regards,
Kai

Replies to list-only preferred.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/2] bcache: introduce ioprio-based bypass/writeback hints
  2016-09-29 18:34         ` Kai Krakow
@ 2016-09-29 22:50           ` Eric Wheeler
  0 siblings, 0 replies; 11+ messages in thread
From: Eric Wheeler @ 2016-09-29 22:50 UTC (permalink / raw)
  To: Kai Krakow; +Cc: linux-bcache

On Thu, 29 Sep 2016, Kai Krakow wrote:
> Am Thu, 29 Sep 2016 13:57:00 +0700
> schrieb Pavel Goran <via-bcache@pvgoran.name>:
> 
> > Hello Eric,
> > 
> > Thursday, September 29, 2016, 1:21:53 AM, you wrote:
> > 
> > > It respects policy first of course.  You can see the top of
> > > should_writeback() in the patch as context:
> > >         if (cache_mode != CACHE_MODE_WRITEBACK || [...]  
> > 
> > So the priority-dependent logic is only applied when cache mode is
> > "writeback", right? Then, what does "Original bcache logic" from the
> > table in the original patch description mean? That is, how does it
> > differ from "Writeback" from the same table?
> 
> I understand that when bcache heuristics decide to skip writeback
> caching and write directly to the backing device, this would override
> it and still force writeback.
> 
> OTOH, if you don't use the writeback policy, this will simply not apply.

Correct.


> > Does "Writeback" there mean that bcache would write data to the cache
> > *always*, even in case of linear write, or in other situations where
> > the original writeback logic would bypass cache?
> 
> That's what I understood now.

I'm certainly setting the should_writeback hint, but I don't know that 
this necessarily means always. While I've not seen one (nor have I 
looked), there could be an exception to the should_writeback hint that 
would cause a bypass anyway.


> > If so, the table
> > should perhaps be adjusted to say "Always writeback" instead of
> > "Writeback", and "Original writeback logic" instead of "Original
> > bcache logic".
> 
> Makes sense.
> 
> > > If not already in cache, it will read from the backing device and
> > > will not promote into (pollute) your cache. Having idle IOs bypass
> > > the cache can increase performance everywhere else since you
> > > probably don't care about the performance of idle IOs.  
> > 
> > Ok, this sounds convincing.
> 
> And it seems to be true, my real-workload tests today showed
> consistently higher hit-rates.

Yay!


> > Your SSD wearout problem would (again, ideally) be handled by some
> > kind of custom process attribute (no idea if the kernel even has this
> > kind of concept, probably not) that would signal bcache to bypass
> > caching for IO that originates from this process. The additional
> > "bulk" priority class proposed by Kai would also be a good solution.
> 
> That was my thinking behind suggesting a bulk class... To reduce
> wearout. I've killed my first SSD within 12 months due to this and
> using bcache. Well, I didn't actually kill it, I replaced it when the
> lifetime prognosis reached 97%. It was spec'ed for 85 TB data written.
> And the usual 5 GB per day pattern just didn't apply due to untuned
> btrfs maintenance running in the background (scrubbing,
> defragmentation, balancing) via cron.

fwiw, I set my btrfs-transaction PID to high priority and set the cleaner 
threads to low priority. I suppose there's the possibility of a 
priority-inversion deadlock, but I've not run into one.


> > However, given that these two approaches are supposedly unavailable,
> > using existing IO priorities looks like an acceptable solution.
> 
> ACK.
> 
> > > If you really want idle IO that can writeback, then I can set the
> > > default for ioprio_writeback and/or ioprio_bypass to be disabled
> > > until a user explicitly sets them.  
> > 
> > Are you talking about bcache options available via sysfs? It would be
> > good to have control over the priority-dependent logic in this way
> > (or it's already implemented?). Not sure what the defaults should be,
> > though. :) It would be even better to have the possibility to save
> > these flags to the superblock, if it's something that can be easily
> > implemented.
> 
> I think saving to superblock would be a cool feature. I think the
> defaults are very sane. And there should be a flip-switch to turn
> on/off this login, default being off probably. Just to fulfill
> principle of least surprise. I'd probably always turn it on at the first
> opportunity. :-)

I think the default will be 0,0 for writeback and 2,7 for bypass. The best 
surprise that a user could have is better hit rates and less wearout.


> It probably allows me to turn off writearound and instead use writeback
> for my backup storage.

This was an unintended surprise, I'm glad to hear that too!


> Still, I need to know if this logic works with the deadline scheduler
> (which is not supposed to adhere to IO priorities, but still I guess
> bcache will still see them).

I've tested with noop, deadline, cfq, and bfq. They all work great in 
terms of bcache respecting the priority flags. Note that schedulers are 
only applied to the devices below bcache.

--
Eric Wheeler

> 
> 
> -- 
> Regards,
> Kai
> 
> Replies to list-only preferred.
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-bcache" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2016-09-29 22:50 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-09-26 23:17 [PATCH 1/2] bcache: introduce ioprio-based bypass/writeback hints Eric Wheeler
2016-09-27  1:43 ` Pavel Goran
2016-09-27  2:43   ` Kai Krakow
2016-09-28 18:21     ` Eric Wheeler
2016-09-29  6:57       ` Re[2]: " Pavel Goran
2016-09-29 18:31         ` Eric Wheeler
2016-09-29 18:34         ` Kai Krakow
2016-09-29 22:50           ` Eric Wheeler
2016-09-28 19:26 ` Kai Krakow
2016-09-28 21:38   ` Eric Wheeler
2016-09-29  2:19     ` Kai Krakow

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.