linux-block.vger.kernel.org archive mirror
* [PATCH] blk-mq: add random early detection I/O scheduler
@ 2017-04-01 19:55 Omar Sandoval
  2017-04-01 22:07 ` Jens Axboe
  0 siblings, 1 reply; 3+ messages in thread
From: Omar Sandoval @ 2017-04-01 19:55 UTC (permalink / raw)
  To: Jens Axboe, linux-block, linux-kernel; +Cc: kernel-team

From: Omar Sandoval <osandov@fb.com>

This patch introduces a new I/O scheduler based on the classic random
early detection active queue management algorithm [1]. Random early
detection is one of the simplest and most studied AQM algorithms for
networking, but until now, it hasn't been applied to disk I/O
scheduling.

When applied to network routers, RED probabilistically either marks
packets with ECN or drops them, depending on the configuration. When
dealing with disk I/O, POSIX does not have any mechanism with which to
notify the caller that the disk is congested, so we instead only provide
the latter strategy. Included in this patch is a minor change to the
blk-mq code to support this.
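
For readers less familiar with RED, the following is a minimal,
standalone userspace sketch of the linear drop-probability ramp that
the scheduler below applies per hardware queue. The function name is
ours, and rand() merely stands in for the kernel's prandom_u32():

----
/*
 * Illustration only (not part of the patch): the classic RED drop
 * decision. Below min_thresh, never drop; at or above max_thresh,
 * always drop; in between, drop with a probability that rises
 * linearly with the queue length.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

static bool red_should_drop(unsigned int queue_length,
			    unsigned int min_thresh,
			    unsigned int max_thresh)
{
	uint32_t drop_prob;

	if (queue_length <= min_thresh)
		return false;
	if (queue_length >= max_thresh)
		return true;

	/* Scale the linear ramp onto the full 32-bit range. */
	drop_prob = (UINT32_MAX / (max_thresh - min_thresh)) *
		    (queue_length - min_thresh);

	/*
	 * rand() is only a stand-in for prandom_u32(); its range is
	 * narrower than a full 32 bits, which is fine for illustration.
	 */
	return (uint32_t)rand() <= drop_prob;
}
----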

Performance results are extremely promising. This scheduling technique
does not require any cross-hardware queue data sharing, as limits are
applied on a per-hardware queue basis, making RED highly scalable.
Additionally, with RED, I/O latencies on a heavily loaded device can be
even better than those on a completely idle device, as demonstrated by
this fio job:

----
[global]
filename=/dev/sda
direct=1
runtime=10s
time_based
group_reporting

[idle_reader]
rate_iops=1000
ioengine=sync
rw=randread

[contended_reader]
stonewall
numjobs=4
ioengine=libaio
iodepth=1024
rw=randread
----

1: http://www.icir.org/floyd/papers/red/red.html

Signed-off-by: Omar Sandoval <osandov@fb.com>
---
 block/Kconfig.iosched |   6 ++
 block/Makefile        |   1 +
 block/blk-mq.c        |   2 +
 block/red-iosched.c   | 191 ++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 200 insertions(+)
 create mode 100644 block/red-iosched.c

diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 58fc8684788d..e8bdd144ec9f 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -69,6 +69,12 @@ config MQ_IOSCHED_DEADLINE
 	---help---
 	  MQ version of the deadline IO scheduler.
 
+config MQ_IOSCHED_RED
+	tristate "Random early detection I/O scheduler"
+	default y
+	---help---
+	  Block I/O adaptation of the RED active queue management algorithm.
+
 endmenu
 
 endif
diff --git a/block/Makefile b/block/Makefile
index 081bb680789b..607ee6e27901 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
 obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o
 obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
 obj-$(CONFIG_MQ_IOSCHED_DEADLINE)	+= mq-deadline.o
+obj-$(CONFIG_MQ_IOSCHED_RED)	+= red-iosched.o
 
 obj-$(CONFIG_BLOCK_COMPAT)	+= compat_ioctl.o
 obj-$(CONFIG_BLK_CMDLINE_PARSER)	+= cmdline-parser.o
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 061fc2cc88d3..d7792ca0432c 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1542,6 +1542,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data);
 	if (unlikely(!rq)) {
 		__wbt_done(q->rq_wb, wb_acct);
+		bio_advance(bio, bio->bi_iter.bi_size);
+		bio_endio(bio);
 		return BLK_QC_T_NONE;
 	}
 
diff --git a/block/red-iosched.c b/block/red-iosched.c
new file mode 100644
index 000000000000..862158a02e95
--- /dev/null
+++ b/block/red-iosched.c
@@ -0,0 +1,191 @@
+/*
+ * Random early detection I/O scheduler.
+ *
+ * Copyright (C) 2017 Facebook
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
+#include <linux/elevator.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/sbitmap.h>
+
+#include "blk.h"
+#include "blk-mq.h"
+#include "blk-mq-sched.h"
+#include "blk-mq-tag.h"
+
+enum {
+	RED_DEFAULT_MIN_THRESH = 16,
+	RED_DEFAULT_MAX_THRESH = 256,
+	RED_MAX_MAX_THRESH = 256,
+};
+
+struct red_queue_data {
+	struct request_queue *q;
+	unsigned int min_thresh, max_thresh;
+};
+
+static int red_init_sched(struct request_queue *q, struct elevator_type *e)
+{
+	struct red_queue_data *rqd;
+	struct elevator_queue *eq;
+
+	eq = elevator_alloc(q, e);
+	if (!eq)
+		return -ENOMEM;
+
+	rqd = kmalloc_node(sizeof(*rqd), GFP_KERNEL, q->node);
+	if (!rqd) {
+		kobject_put(&eq->kobj);
+		return -ENOMEM;
+	}
+	rqd->min_thresh = RED_DEFAULT_MIN_THRESH;
+	rqd->max_thresh = RED_DEFAULT_MAX_THRESH;
+
+	eq->elevator_data = rqd;
+	q->elevator = eq;
+
+	return 0;
+}
+
+static void red_exit_sched(struct elevator_queue *e)
+{
+	struct red_queue_data *rqd = e->elevator_data;
+
+	kfree(rqd);
+}
+
+static struct request *red_get_request(struct request_queue *q,
+				       unsigned int op,
+				       struct blk_mq_alloc_data *data)
+{
+	struct red_queue_data *rqd = q->elevator->elevator_data;
+	unsigned int queue_length;
+	u32 drop_prob;
+
+	queue_length = sbitmap_weight(&data->hctx->sched_tags->bitmap_tags.sb);
+	if (queue_length <= rqd->min_thresh)
+		goto enqueue;
+	else if (queue_length >= rqd->max_thresh)
+		goto drop;
+
+	drop_prob = (U32_MAX / (rqd->max_thresh - rqd->min_thresh) *
+		     (queue_length - rqd->min_thresh));
+
+	if (prandom_u32() <= drop_prob)
+		goto drop;
+
+enqueue:
+	return __blk_mq_alloc_request(data, op);
+
+drop:
+	/*
+	 * Non-blocking callers will see -EWOULDBLOCK; blocking callers should
+	 * check the return value and retry.
+	 */
+	return NULL;
+}
+
+static ssize_t red_min_thresh_show(struct elevator_queue *e, char *page)
+{
+	struct red_queue_data *rqd = e->elevator_data;
+
+	return sprintf(page, "%u\n", rqd->min_thresh);
+}
+
+static ssize_t red_min_thresh_store(struct elevator_queue *e, const char *page,
+				    size_t count)
+{
+	struct red_queue_data *rqd = e->elevator_data;
+	unsigned int thresh;
+	int ret;
+
+	ret = kstrtouint(page, 10, &thresh);
+	if (ret)
+		return ret;
+
+	if (thresh >= rqd->max_thresh)
+		return -EINVAL;
+
+	rqd->min_thresh = thresh;
+
+	return count;
+}
+
+static ssize_t red_max_thresh_show(struct elevator_queue *e, char *page)
+{
+	struct red_queue_data *rqd = e->elevator_data;
+
+	return sprintf(page, "%u\n", rqd->max_thresh);
+}
+
+static ssize_t red_max_thresh_store(struct elevator_queue *e, const char *page,
+				    size_t count)
+{
+	struct red_queue_data *rqd = e->elevator_data;
+	unsigned int thresh;
+	int ret;
+
+	ret = kstrtouint(page, 10, &thresh);
+	if (ret)
+		return ret;
+
+	if (thresh <= rqd->min_thresh || thresh > RED_MAX_MAX_THRESH)
+		return -EINVAL;
+
+	rqd->max_thresh = thresh;
+
+	return count;
+}
+
+#define RED_THRESH_ATTR(which) __ATTR(which##_thresh, 0644, red_##which##_thresh_show, red_##which##_thresh_store)
+static struct elv_fs_entry red_sched_attrs[] = {
+	RED_THRESH_ATTR(min),
+	RED_THRESH_ATTR(max),
+	__ATTR_NULL
+};
+#undef RED_THRESH_ATTR
+
+static struct elevator_type red_sched = {
+	.ops.mq = {
+		.init_sched = red_init_sched,
+		.exit_sched = red_exit_sched,
+		.get_request = red_get_request,
+	},
+	.uses_mq = true,
+	.elevator_attrs = red_sched_attrs,
+	.elevator_name = "red",
+	.elevator_owner = THIS_MODULE,
+};
+
+static int __init red_init(void)
+{
+	return elv_register(&red_sched);
+}
+
+static void __exit red_exit(void)
+{
+	elv_unregister(&red_sched);
+}
+
+module_init(red_init);
+module_exit(red_exit);
+
+MODULE_AUTHOR("Omar Sandoval");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Random early detection I/O scheduler");
-- 
2.12.1


* Re: [PATCH] blk-mq: add random early detection I/O scheduler
  2017-04-01 19:55 [PATCH] blk-mq: add random early detection I/O scheduler Omar Sandoval
@ 2017-04-01 22:07 ` Jens Axboe
  2017-04-01 23:29   ` Bart Van Assche
  0 siblings, 1 reply; 3+ messages in thread
From: Jens Axboe @ 2017-04-01 22:07 UTC (permalink / raw)
  To: Omar Sandoval, linux-block, linux-kernel; +Cc: kernel-team

On 04/01/2017 01:55 PM, Omar Sandoval wrote:
> From: Omar Sandoval <osandov@fb.com>
> 
> This patch introduces a new I/O scheduler based on the classic random
> early detection active queue management algorithm [1]. Random early
> detection is one of the simplest and most studied AQM algorithms for
> networking, but until now, it hasn't been applied to disk I/O
> scheduling.
> 
> When applied to network routers, RED probabilistically either marks
> packets with ECN or drops them, depending on the configuration. When
> dealing with disk I/O, POSIX does not have any mechanism with which to
> notify the caller that the disk is congested, so we instead only provide
> the latter strategy. Included in this patch is a minor change to the
> blk-mq code to support this.

This is great work. If we combine this with a thin provisioning target,
we can even use this to save space on the backend. Better latencies,
AND lower disk utilization.

I'm tempted to just queue this up for this cycle and make it the default.

-- 
Jens Axboe


* Re: [PATCH] blk-mq: add random early detection I/O scheduler
  2017-04-01 22:07 ` Jens Axboe
@ 2017-04-01 23:29   ` Bart Van Assche
  0 siblings, 0 replies; 3+ messages in thread
From: Bart Van Assche @ 2017-04-01 23:29 UTC (permalink / raw)
  To: linux-kernel, osandov, linux-block, axboe; +Cc: kernel-team

On Sat, 2017-04-01 at 16:07 -0600, Jens Axboe wrote:
> On 04/01/2017 01:55 PM, Omar Sandoval wrote:
> > From: Omar Sandoval <osandov@fb.com>
> > 
> > This patch introduces a new I/O scheduler based on the classic random
> > early detection active queue management algorithm [1]. Random early
> > detection is one of the simplest and most studied AQM algorithms for
> > networking, but until now, it hasn't been applied to disk I/O
> > scheduling.
> > 
> > When applied to network routers, RED probabilistically either marks
> > packets with ECN or drops them, depending on the configuration. When
> > dealing with disk I/O, POSIX does not have any mechanism with which to
> > notify the caller that the disk is congested, so we instead only provide
> > the latter strategy. Included in this patch is a minor change to the
> > blk-mq code to support this.
> 
> This is great work. If we combine this with a thin provisioning target,
> we can even use this to save space on the backend. Better latencies,
> AND lower disk utilization.
> 
> I'm tempted to just queue this up for this cycle and make it the default.

Hello Jens,

Did you mean making this the default scheduler for SSDs only or for all types
of block devices? Our (Western Digital) experience is that any I/O scheduler
that limits the queue depth reduces throughput for at least data-center style
workloads when using hard disks. This is why Adam is working on improving I/O
priority support for the Linux block layer. That approach allows us to reduce
the latency of certain requests without significantly impacting average
latency and throughput.

Bart.
