* [PATCHSET v6] blk-mq scheduling framework
@ 2017-01-11 21:39 Jens Axboe
  2017-01-11 21:39 ` [PATCH 01/10] block: move existing elevator ops to union Jens Axboe
                   ` (13 more replies)
  0 siblings, 14 replies; 62+ messages in thread
From: Jens Axboe @ 2017-01-11 21:39 UTC (permalink / raw)
  To: linux-kernel, linux-block; +Cc: osandov, bart.vanassche

Another year, another posting of this patchset. The previous posting
was here:

https://www.spinics.net/lists/kernel/msg2406106.html

(yes, I've skipped v5; it was fixes on top of v4, not the rework).

I've reworked bits of this to get rid of the shadow requests, thanks
to Bart for the inspiration. The missing piece, for me, was the fact
that we already have the tags->rqs[] indirection array. I've done this
somewhat differently, though, by having the internal scheduler tag
map be allocated/torn down when an IO scheduler is attached or
detached. This also means that when we run without a scheduler, we
don't have to do double tag allocations; it works like before.
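
To make that concrete, here's roughly what the allocation side ends up
doing with this applied (a sketch based on blk_mq_sched_get_request()
in patch 8, trimmed down, not the literal code):

	if (q->elevator) {
		/* scheduler attached: allocate from hctx->sched_tags */
		data->flags |= BLK_MQ_REQ_INTERNAL;
		rq = __blk_mq_alloc_request(data, op);
	} else {
		/* no scheduler: one allocation from the driver tag map */
		rq = __blk_mq_alloc_request(data, op);
		if (rq) {
			rq->tag = rq->internal_tag;
			rq->internal_tag = -1;
		}
	}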

The patchset applies on top of 4.10-rc3, or can be pulled here:

git://git.kernel.dk/linux-block blk-mq-sched.6


 block/Kconfig.iosched    |   50 ++++
 block/Makefile           |    3 
 block/blk-core.c         |   19 -
 block/blk-exec.c         |    3 
 block/blk-flush.c        |   15 -
 block/blk-ioc.c          |   12 
 block/blk-merge.c        |    4 
 block/blk-mq-sched.c     |  354 +++++++++++++++++++++++++++++
 block/blk-mq-sched.h     |  157 ++++++++++++
 block/blk-mq-sysfs.c     |   13 +
 block/blk-mq-tag.c       |   58 ++--
 block/blk-mq-tag.h       |    4 
 block/blk-mq.c           |  413 +++++++++++++++++++---------------
 block/blk-mq.h           |   40 +++
 block/blk-tag.c          |    1 
 block/blk.h              |   26 +-
 block/cfq-iosched.c      |    2 
 block/deadline-iosched.c |    2 
 block/elevator.c         |  247 +++++++++++++++-----
 block/mq-deadline.c      |  569 +++++++++++++++++++++++++++++++++++++++++++++++
 block/noop-iosched.c     |    2 
 drivers/nvme/host/pci.c  |    1 
 include/linux/blk-mq.h   |    9 
 include/linux/blkdev.h   |    6 
 include/linux/elevator.h |   36 ++
 25 files changed, 1732 insertions(+), 314 deletions(-)

-- 
Jens Axboe

* [PATCH 01/10] block: move existing elevator ops to union
  2017-01-11 21:39 [PATCHSET v6] blk-mq scheduling framework Jens Axboe
@ 2017-01-11 21:39 ` Jens Axboe
  2017-01-12 10:15   ` Johannes Thumshirn
                     ` (2 more replies)
  2017-01-11 21:39 ` [PATCH 02/10] blk-mq: make mq_ops a const pointer Jens Axboe
                   ` (12 subsequent siblings)
  13 siblings, 3 replies; 62+ messages in thread
From: Jens Axboe @ 2017-01-11 21:39 UTC (permalink / raw)
  To: linux-kernel, linux-block; +Cc: osandov, bart.vanassche, Jens Axboe

Prep patch for adding MQ ops as well, since anonymous unions with
named (designated) initializers don't work on older compilers.
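
To illustrate with a made-up 'foo' single queue elevator (foo_merge
etc. are hypothetical hooks, not part of this patch):

	/*
	 * An anonymous union would have kept the old '.ops = { ... }'
	 * syntax working, but older gcc rejects designated initializers
	 * for anonymous union members. So the union member gets a name,
	 * and existing schedulers spell it out:
	 */
	static struct elevator_type iosched_foo = {
		.ops.sq = {
			.elevator_merge_fn	= foo_merge,
			.elevator_dispatch_fn	= foo_dispatch,
			.elevator_add_req_fn	= foo_add_request,
		},
		.elevator_name	= "foo",
		.elevator_owner	= THIS_MODULE,
	};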

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-ioc.c          |  8 +++----
 block/blk-merge.c        |  4 ++--
 block/blk.h              | 10 ++++----
 block/cfq-iosched.c      |  2 +-
 block/deadline-iosched.c |  2 +-
 block/elevator.c         | 60 ++++++++++++++++++++++++------------------------
 block/noop-iosched.c     |  2 +-
 include/linux/elevator.h |  4 +++-
 8 files changed, 47 insertions(+), 45 deletions(-)

diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 381cb50a673c..ab372092a57d 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -43,8 +43,8 @@ static void ioc_exit_icq(struct io_cq *icq)
 	if (icq->flags & ICQ_EXITED)
 		return;
 
-	if (et->ops.elevator_exit_icq_fn)
-		et->ops.elevator_exit_icq_fn(icq);
+	if (et->ops.sq.elevator_exit_icq_fn)
+		et->ops.sq.elevator_exit_icq_fn(icq);
 
 	icq->flags |= ICQ_EXITED;
 }
@@ -383,8 +383,8 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
 	if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) {
 		hlist_add_head(&icq->ioc_node, &ioc->icq_list);
 		list_add(&icq->q_node, &q->icq_list);
-		if (et->ops.elevator_init_icq_fn)
-			et->ops.elevator_init_icq_fn(icq);
+		if (et->ops.sq.elevator_init_icq_fn)
+			et->ops.sq.elevator_init_icq_fn(icq);
 	} else {
 		kmem_cache_free(et->icq_cache, icq);
 		icq = ioc_lookup_icq(ioc, q);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 182398cb1524..480570b691dc 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -763,8 +763,8 @@ int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->type->ops.elevator_allow_rq_merge_fn)
-		if (!e->type->ops.elevator_allow_rq_merge_fn(q, rq, next))
+	if (e->type->ops.sq.elevator_allow_rq_merge_fn)
+		if (!e->type->ops.sq.elevator_allow_rq_merge_fn(q, rq, next))
 			return 0;
 
 	return attempt_merge(q, rq, next);
diff --git a/block/blk.h b/block/blk.h
index 041185e5f129..f46c0ac8ae3d 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -167,7 +167,7 @@ static inline struct request *__elv_next_request(struct request_queue *q)
 			return NULL;
 		}
 		if (unlikely(blk_queue_bypass(q)) ||
-		    !q->elevator->type->ops.elevator_dispatch_fn(q, 0))
+		    !q->elevator->type->ops.sq.elevator_dispatch_fn(q, 0))
 			return NULL;
 	}
 }
@@ -176,16 +176,16 @@ static inline void elv_activate_rq(struct request_queue *q, struct request *rq)
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->type->ops.elevator_activate_req_fn)
-		e->type->ops.elevator_activate_req_fn(q, rq);
+	if (e->type->ops.sq.elevator_activate_req_fn)
+		e->type->ops.sq.elevator_activate_req_fn(q, rq);
 }
 
 static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq)
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->type->ops.elevator_deactivate_req_fn)
-		e->type->ops.elevator_deactivate_req_fn(q, rq);
+	if (e->type->ops.sq.elevator_deactivate_req_fn)
+		e->type->ops.sq.elevator_deactivate_req_fn(q, rq);
 }
 
 #ifdef CONFIG_FAIL_IO_TIMEOUT
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index c73a6fcaeb9d..37aeb20fa454 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -4837,7 +4837,7 @@ static struct elv_fs_entry cfq_attrs[] = {
 };
 
 static struct elevator_type iosched_cfq = {
-	.ops = {
+	.ops.sq = {
 		.elevator_merge_fn = 		cfq_merge,
 		.elevator_merged_fn =		cfq_merged_request,
 		.elevator_merge_req_fn =	cfq_merged_requests,
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index 55e0bb6d7da7..05fc0ea25a98 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -439,7 +439,7 @@ static struct elv_fs_entry deadline_attrs[] = {
 };
 
 static struct elevator_type iosched_deadline = {
-	.ops = {
+	.ops.sq = {
 		.elevator_merge_fn = 		deadline_merge,
 		.elevator_merged_fn =		deadline_merged_request,
 		.elevator_merge_req_fn =	deadline_merged_requests,
diff --git a/block/elevator.c b/block/elevator.c
index 40f0c04e5ad3..022a26830297 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -58,8 +58,8 @@ static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
 	struct request_queue *q = rq->q;
 	struct elevator_queue *e = q->elevator;
 
-	if (e->type->ops.elevator_allow_bio_merge_fn)
-		return e->type->ops.elevator_allow_bio_merge_fn(q, rq, bio);
+	if (e->type->ops.sq.elevator_allow_bio_merge_fn)
+		return e->type->ops.sq.elevator_allow_bio_merge_fn(q, rq, bio);
 
 	return 1;
 }
@@ -224,7 +224,7 @@ int elevator_init(struct request_queue *q, char *name)
 		}
 	}
 
-	err = e->ops.elevator_init_fn(q, e);
+	err = e->ops.sq.elevator_init_fn(q, e);
 	if (err)
 		elevator_put(e);
 	return err;
@@ -234,8 +234,8 @@ EXPORT_SYMBOL(elevator_init);
 void elevator_exit(struct elevator_queue *e)
 {
 	mutex_lock(&e->sysfs_lock);
-	if (e->type->ops.elevator_exit_fn)
-		e->type->ops.elevator_exit_fn(e);
+	if (e->type->ops.sq.elevator_exit_fn)
+		e->type->ops.sq.elevator_exit_fn(e);
 	mutex_unlock(&e->sysfs_lock);
 
 	kobject_put(&e->kobj);
@@ -443,8 +443,8 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
 		return ELEVATOR_BACK_MERGE;
 	}
 
-	if (e->type->ops.elevator_merge_fn)
-		return e->type->ops.elevator_merge_fn(q, req, bio);
+	if (e->type->ops.sq.elevator_merge_fn)
+		return e->type->ops.sq.elevator_merge_fn(q, req, bio);
 
 	return ELEVATOR_NO_MERGE;
 }
@@ -495,8 +495,8 @@ void elv_merged_request(struct request_queue *q, struct request *rq, int type)
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->type->ops.elevator_merged_fn)
-		e->type->ops.elevator_merged_fn(q, rq, type);
+	if (e->type->ops.sq.elevator_merged_fn)
+		e->type->ops.sq.elevator_merged_fn(q, rq, type);
 
 	if (type == ELEVATOR_BACK_MERGE)
 		elv_rqhash_reposition(q, rq);
@@ -510,8 +510,8 @@ void elv_merge_requests(struct request_queue *q, struct request *rq,
 	struct elevator_queue *e = q->elevator;
 	const int next_sorted = next->rq_flags & RQF_SORTED;
 
-	if (next_sorted && e->type->ops.elevator_merge_req_fn)
-		e->type->ops.elevator_merge_req_fn(q, rq, next);
+	if (next_sorted && e->type->ops.sq.elevator_merge_req_fn)
+		e->type->ops.sq.elevator_merge_req_fn(q, rq, next);
 
 	elv_rqhash_reposition(q, rq);
 
@@ -528,8 +528,8 @@ void elv_bio_merged(struct request_queue *q, struct request *rq,
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->type->ops.elevator_bio_merged_fn)
-		e->type->ops.elevator_bio_merged_fn(q, rq, bio);
+	if (e->type->ops.sq.elevator_bio_merged_fn)
+		e->type->ops.sq.elevator_bio_merged_fn(q, rq, bio);
 }
 
 #ifdef CONFIG_PM
@@ -578,7 +578,7 @@ void elv_drain_elevator(struct request_queue *q)
 
 	lockdep_assert_held(q->queue_lock);
 
-	while (q->elevator->type->ops.elevator_dispatch_fn(q, 1))
+	while (q->elevator->type->ops.sq.elevator_dispatch_fn(q, 1))
 		;
 	if (q->nr_sorted && printed++ < 10) {
 		printk(KERN_ERR "%s: forced dispatching is broken "
@@ -653,7 +653,7 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where)
 		 * rq cannot be accessed after calling
 		 * elevator_add_req_fn.
 		 */
-		q->elevator->type->ops.elevator_add_req_fn(q, rq);
+		q->elevator->type->ops.sq.elevator_add_req_fn(q, rq);
 		break;
 
 	case ELEVATOR_INSERT_FLUSH:
@@ -682,8 +682,8 @@ struct request *elv_latter_request(struct request_queue *q, struct request *rq)
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->type->ops.elevator_latter_req_fn)
-		return e->type->ops.elevator_latter_req_fn(q, rq);
+	if (e->type->ops.sq.elevator_latter_req_fn)
+		return e->type->ops.sq.elevator_latter_req_fn(q, rq);
 	return NULL;
 }
 
@@ -691,8 +691,8 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq)
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->type->ops.elevator_former_req_fn)
-		return e->type->ops.elevator_former_req_fn(q, rq);
+	if (e->type->ops.sq.elevator_former_req_fn)
+		return e->type->ops.sq.elevator_former_req_fn(q, rq);
 	return NULL;
 }
 
@@ -701,8 +701,8 @@ int elv_set_request(struct request_queue *q, struct request *rq,
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->type->ops.elevator_set_req_fn)
-		return e->type->ops.elevator_set_req_fn(q, rq, bio, gfp_mask);
+	if (e->type->ops.sq.elevator_set_req_fn)
+		return e->type->ops.sq.elevator_set_req_fn(q, rq, bio, gfp_mask);
 	return 0;
 }
 
@@ -710,16 +710,16 @@ void elv_put_request(struct request_queue *q, struct request *rq)
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->type->ops.elevator_put_req_fn)
-		e->type->ops.elevator_put_req_fn(rq);
+	if (e->type->ops.sq.elevator_put_req_fn)
+		e->type->ops.sq.elevator_put_req_fn(rq);
 }
 
 int elv_may_queue(struct request_queue *q, unsigned int op)
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->type->ops.elevator_may_queue_fn)
-		return e->type->ops.elevator_may_queue_fn(q, op);
+	if (e->type->ops.sq.elevator_may_queue_fn)
+		return e->type->ops.sq.elevator_may_queue_fn(q, op);
 
 	return ELV_MQUEUE_MAY;
 }
@@ -734,8 +734,8 @@ void elv_completed_request(struct request_queue *q, struct request *rq)
 	if (blk_account_rq(rq)) {
 		q->in_flight[rq_is_sync(rq)]--;
 		if ((rq->rq_flags & RQF_SORTED) &&
-		    e->type->ops.elevator_completed_req_fn)
-			e->type->ops.elevator_completed_req_fn(q, rq);
+		    e->type->ops.sq.elevator_completed_req_fn)
+			e->type->ops.sq.elevator_completed_req_fn(q, rq);
 	}
 }
 
@@ -803,8 +803,8 @@ int elv_register_queue(struct request_queue *q)
 		}
 		kobject_uevent(&e->kobj, KOBJ_ADD);
 		e->registered = 1;
-		if (e->type->ops.elevator_registered_fn)
-			e->type->ops.elevator_registered_fn(q);
+		if (e->type->ops.sq.elevator_registered_fn)
+			e->type->ops.sq.elevator_registered_fn(q);
 	}
 	return error;
 }
@@ -912,7 +912,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 	spin_unlock_irq(q->queue_lock);
 
 	/* allocate, init and register new elevator */
-	err = new_e->ops.elevator_init_fn(q, new_e);
+	err = new_e->ops.sq.elevator_init_fn(q, new_e);
 	if (err)
 		goto fail_init;
 
diff --git a/block/noop-iosched.c b/block/noop-iosched.c
index a163c487cf38..2d1b15d89b45 100644
--- a/block/noop-iosched.c
+++ b/block/noop-iosched.c
@@ -92,7 +92,7 @@ static void noop_exit_queue(struct elevator_queue *e)
 }
 
 static struct elevator_type elevator_noop = {
-	.ops = {
+	.ops.sq = {
 		.elevator_merge_req_fn		= noop_merged_requests,
 		.elevator_dispatch_fn		= noop_dispatch,
 		.elevator_add_req_fn		= noop_add_request,
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index b276e9ef0e0b..2a9e966eed03 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -94,7 +94,9 @@ struct elevator_type
 	struct kmem_cache *icq_cache;
 
 	/* fields provided by elevator implementation */
-	struct elevator_ops ops;
+	union {
+		struct elevator_ops sq;
+	} ops;
 	size_t icq_size;	/* see iocontext.h */
 	size_t icq_align;	/* ditto */
 	struct elv_fs_entry *elevator_attrs;
-- 
2.7.4

* [PATCH 02/10] blk-mq: make mq_ops a const pointer
  2017-01-11 21:39 [PATCHSET v6] blk-mq scheduling framework Jens Axboe
  2017-01-11 21:39 ` [PATCH 01/10] block: move existing elevator ops to union Jens Axboe
@ 2017-01-11 21:39 ` Jens Axboe
  2017-01-12 10:14   ` Johannes Thumshirn
  2017-01-13  8:16   ` Christoph Hellwig
  2017-01-11 21:39 ` [PATCH 03/10] block: move rq_ioc() to blk.h Jens Axboe
                   ` (11 subsequent siblings)
  13 siblings, 2 replies; 62+ messages in thread
From: Jens Axboe @ 2017-01-11 21:39 UTC (permalink / raw)
  To: linux-kernel, linux-block; +Cc: osandov, bart.vanassche, Jens Axboe

We never change it, so make that clear.
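
A side effect is that drivers can now put their ops table in rodata.
A hedged driver-side sketch (the 'foo' names are made up, not from
this series):

	static const struct blk_mq_ops foo_mq_ops = {
		.queue_rq	= foo_queue_rq,
		.init_hctx	= foo_init_hctx,
		.complete	= foo_complete,
	};

	/* in the probe path, now that tag_set->ops is a const pointer */
	foo->tag_set.ops = &foo_mq_ops;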

Signed-off-by: Jens Axboe <axboe@fb.com>
Reviewed-by: Bart Van Assche <bart.vanassche@sandisk.com>
---
 block/blk-mq.c         | 2 +-
 include/linux/blk-mq.h | 2 +-
 include/linux/blkdev.h | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index a8e67a155d04..79e1cb0f7b15 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -639,7 +639,7 @@ struct blk_mq_timeout_data {
 
 void blk_mq_rq_timed_out(struct request *req, bool reserved)
 {
-	struct blk_mq_ops *ops = req->q->mq_ops;
+	const struct blk_mq_ops *ops = req->q->mq_ops;
 	enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
 
 	/*
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 4a2ab5d99ff7..afc81d77e471 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -60,7 +60,7 @@ struct blk_mq_hw_ctx {
 
 struct blk_mq_tag_set {
 	unsigned int		*mq_map;
-	struct blk_mq_ops	*ops;
+	const struct blk_mq_ops	*ops;
 	unsigned int		nr_hw_queues;
 	unsigned int		queue_depth;	/* max hw supported */
 	unsigned int		reserved_tags;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 83695641bd5e..e19fc8e01fb2 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -407,7 +407,7 @@ struct request_queue {
 	dma_drain_needed_fn	*dma_drain_needed;
 	lld_busy_fn		*lld_busy_fn;
 
-	struct blk_mq_ops	*mq_ops;
+	const struct blk_mq_ops	*mq_ops;
 
 	unsigned int		*mq_map;
 
-- 
2.7.4

* [PATCH 03/10] block: move rq_ioc() to blk.h
  2017-01-11 21:39 [PATCHSET v6] blk-mq scheduling framework Jens Axboe
  2017-01-11 21:39 ` [PATCH 01/10] block: move existing elevator ops to union Jens Axboe
  2017-01-11 21:39 ` [PATCH 02/10] blk-mq: make mq_ops a const pointer Jens Axboe
@ 2017-01-11 21:39 ` Jens Axboe
  2017-01-12 10:14   ` Johannes Thumshirn
                     ` (2 more replies)
  2017-01-11 21:39 ` [PATCH 04/10] blk-mq: un-export blk_mq_free_hctx_request() Jens Axboe
                   ` (10 subsequent siblings)
  13 siblings, 3 replies; 62+ messages in thread
From: Jens Axboe @ 2017-01-11 21:39 UTC (permalink / raw)
  To: linux-kernel, linux-block; +Cc: osandov, bart.vanassche, Jens Axboe

We want to use it outside of blk-core.c.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-core.c | 16 ----------------
 block/blk.h      | 16 ++++++++++++++++
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 61ba08c58b64..92baea07acbc 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1040,22 +1040,6 @@ static bool blk_rq_should_init_elevator(struct bio *bio)
 }
 
 /**
- * rq_ioc - determine io_context for request allocation
- * @bio: request being allocated is for this bio (can be %NULL)
- *
- * Determine io_context to use for request allocation for @bio.  May return
- * %NULL if %current->io_context doesn't exist.
- */
-static struct io_context *rq_ioc(struct bio *bio)
-{
-#ifdef CONFIG_BLK_CGROUP
-	if (bio && bio->bi_ioc)
-		return bio->bi_ioc;
-#endif
-	return current->io_context;
-}
-
-/**
  * __get_request - get a free request
  * @rl: request list to allocate from
  * @op: operation and flags
diff --git a/block/blk.h b/block/blk.h
index f46c0ac8ae3d..9a716b5925a4 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -264,6 +264,22 @@ void ioc_clear_queue(struct request_queue *q);
 int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node);
 
 /**
+ * rq_ioc - determine io_context for request allocation
+ * @bio: request being allocated is for this bio (can be %NULL)
+ *
+ * Determine io_context to use for request allocation for @bio.  May return
+ * %NULL if %current->io_context doesn't exist.
+ */
+static inline struct io_context *rq_ioc(struct bio *bio)
+{
+#ifdef CONFIG_BLK_CGROUP
+	if (bio && bio->bi_ioc)
+		return bio->bi_ioc;
+#endif
+	return current->io_context;
+}
+
+/**
  * create_io_context - try to create task->io_context
  * @gfp_mask: allocation mask
  * @node: allocation node
-- 
2.7.4

* [PATCH 04/10] blk-mq: un-export blk_mq_free_hctx_request()
  2017-01-11 21:39 [PATCHSET v6] blk-mq scheduling framework Jens Axboe
                   ` (2 preceding siblings ...)
  2017-01-11 21:39 ` [PATCH 03/10] block: move rq_ioc() to blk.h Jens Axboe
@ 2017-01-11 21:39 ` Jens Axboe
  2017-01-12 10:13   ` Johannes Thumshirn
                     ` (2 more replies)
  2017-01-11 21:39 ` [PATCH 05/10] blk-mq: export some helpers we need to the scheduling framework Jens Axboe
                   ` (9 subsequent siblings)
  13 siblings, 3 replies; 62+ messages in thread
From: Jens Axboe @ 2017-01-11 21:39 UTC (permalink / raw)
  To: linux-kernel, linux-block; +Cc: osandov, bart.vanassche, Jens Axboe

It's only used in blk-mq, so kill it from the main exported header
and drop the symbol export as well.

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c         | 5 ++---
 include/linux/blk-mq.h | 1 -
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 79e1cb0f7b15..f49f6325b332 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -337,15 +337,14 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
 	blk_queue_exit(q);
 }
 
-void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
+static void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx,
+				     struct request *rq)
 {
 	struct blk_mq_ctx *ctx = rq->mq_ctx;
 
 	ctx->rq_completed[rq_is_sync(rq)]++;
 	__blk_mq_free_request(hctx, ctx, rq);
-
 }
-EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request);
 
 void blk_mq_free_request(struct request *rq)
 {
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index afc81d77e471..2686f9e7302a 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -181,7 +181,6 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
 
 void blk_mq_insert_request(struct request *, bool, bool, bool);
 void blk_mq_free_request(struct request *rq);
-void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *, struct request *rq);
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
 
 enum {
-- 
2.7.4

* [PATCH 05/10] blk-mq: export some helpers we need to the scheduling framework
  2017-01-11 21:39 [PATCHSET v6] blk-mq scheduling framework Jens Axboe
                   ` (3 preceding siblings ...)
  2017-01-11 21:39 ` [PATCH 04/10] blk-mq: un-export blk_mq_free_hctx_request() Jens Axboe
@ 2017-01-11 21:39 ` Jens Axboe
  2017-01-12 10:17   ` Johannes Thumshirn
                     ` (2 more replies)
  2017-01-11 21:39 ` [PATCH 06/10] blk-mq-tag: cleanup the normal/reserved tag allocation Jens Axboe
                   ` (8 subsequent siblings)
  13 siblings, 3 replies; 62+ messages in thread
From: Jens Axboe @ 2017-01-11 21:39 UTC (permalink / raw)
  To: linux-kernel, linux-block; +Cc: osandov, bart.vanassche, Jens Axboe

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 39 +++++++++++++++++++++------------------
 block/blk-mq.h | 25 +++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 18 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index f49f6325b332..9fc521755e22 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -167,8 +167,8 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
 }
 EXPORT_SYMBOL(blk_mq_can_queue);
 
-static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
-			       struct request *rq, unsigned int op)
+void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
+			struct request *rq, unsigned int op)
 {
 	INIT_LIST_HEAD(&rq->queuelist);
 	/* csd/requeue_work/fifo_time is initialized before use */
@@ -213,9 +213,10 @@ static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
 
 	ctx->rq_dispatched[op_is_sync(op)]++;
 }
+EXPORT_SYMBOL_GPL(blk_mq_rq_ctx_init);
 
-static struct request *
-__blk_mq_alloc_request(struct blk_mq_alloc_data *data, unsigned int op)
+struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data,
+				       unsigned int op)
 {
 	struct request *rq;
 	unsigned int tag;
@@ -236,6 +237,7 @@ __blk_mq_alloc_request(struct blk_mq_alloc_data *data, unsigned int op)
 
 	return NULL;
 }
+EXPORT_SYMBOL_GPL(__blk_mq_alloc_request);
 
 struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
 		unsigned int flags)
@@ -319,8 +321,8 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw,
 }
 EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
 
-static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
-				  struct blk_mq_ctx *ctx, struct request *rq)
+void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
+			   struct request *rq)
 {
 	const int tag = rq->tag;
 	struct request_queue *q = rq->q;
@@ -802,7 +804,7 @@ static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
  * Process software queues that have been marked busy, splicing them
  * to the for-dispatch
  */
-static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
+void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 {
 	struct flush_busy_ctx_data data = {
 		.hctx = hctx,
@@ -811,6 +813,7 @@ static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 
 	sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data);
 }
+EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);
 
 static inline unsigned int queued_to_index(unsigned int queued)
 {
@@ -921,7 +924,7 @@ static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx)
 	/*
 	 * Touch any software queue that has pending entries.
 	 */
-	flush_busy_ctxs(hctx, &rq_list);
+	blk_mq_flush_busy_ctxs(hctx, &rq_list);
 
 	/*
 	 * If we have previous entries on our dispatch list, grab them
@@ -1135,8 +1138,8 @@ static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
 		list_add_tail(&rq->queuelist, &ctx->rq_list);
 }
 
-static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
-				    struct request *rq, bool at_head)
+void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
+			     bool at_head)
 {
 	struct blk_mq_ctx *ctx = rq->mq_ctx;
 
@@ -1550,8 +1553,8 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
 	return cookie;
 }
 
-static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
-		struct blk_mq_tags *tags, unsigned int hctx_idx)
+void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
+			unsigned int hctx_idx)
 {
 	struct page *page;
 
@@ -1588,8 +1591,8 @@ static size_t order_to_size(unsigned int order)
 	return (size_t)PAGE_SIZE << order;
 }
 
-static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
-		unsigned int hctx_idx)
+struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
+				       unsigned int hctx_idx)
 {
 	struct blk_mq_tags *tags;
 	unsigned int i, j, entries_per_page, max_order = 4;
@@ -2279,10 +2282,10 @@ static int blk_mq_queue_reinit_dead(unsigned int cpu)
  * Now CPU1 is just onlined and a request is inserted into ctx1->rq_list
  * and set bit0 in pending bitmap as ctx1->index_hw is still zero.
  *
- * And then while running hw queue, flush_busy_ctxs() finds bit0 is set in
- * pending bitmap and tries to retrieve requests in hctx->ctxs[0]->rq_list.
- * But htx->ctxs[0] is a pointer to ctx0, so the request in ctx1->rq_list
- * is ignored.
+ * And then while running hw queue, blk_mq_flush_busy_ctxs() finds bit0 is set
+ * in pending bitmap and tries to retrieve requests in hctx->ctxs[0]->rq_list.
+ * But htx->ctxs[0] is a pointer to ctx0, so the request in ctx1->rq_list is
+ * ignored.
  */
 static int blk_mq_queue_reinit_prepare(unsigned int cpu)
 {
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 63e9116cddbd..e59f5ca520a2 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -32,6 +32,21 @@ void blk_mq_free_queue(struct request_queue *q);
 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
 void blk_mq_wake_waiters(struct request_queue *q);
 bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *, struct list_head *);
+void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
+
+/*
+ * Internal helpers for allocating/freeing the request map
+ */
+void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
+			unsigned int hctx_idx);
+struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
+					unsigned int hctx_idx);
+
+/*
+ * Internal helpers for request insertion into sw queues
+ */
+void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
+				bool at_head);
 
 /*
  * CPU hotplug helpers
@@ -103,6 +118,16 @@ static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data,
 	data->hctx = hctx;
 }
 
+/*
+ * Internal helpers for request allocation/init/free
+ */
+void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
+			struct request *rq, unsigned int op);
+void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
+				struct request *rq);
+struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data,
+					unsigned int op);
+
 static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx)
 {
 	return test_bit(BLK_MQ_S_STOPPED, &hctx->state);
-- 
2.7.4

* [PATCH 06/10] blk-mq-tag: cleanup the normal/reserved tag allocation
  2017-01-11 21:39 [PATCHSET v6] blk-mq scheduling framework Jens Axboe
                   ` (4 preceding siblings ...)
  2017-01-11 21:39 ` [PATCH 05/10] blk-mq: export some helpers we need to the scheduling framework Jens Axboe
@ 2017-01-11 21:39 ` Jens Axboe
  2017-01-12 21:22   ` Bart Van Assche
  2017-01-13  8:30   ` Christoph Hellwig
  2017-01-11 21:40 ` [PATCH 07/10] blk-mq: abstract out helpers for allocating/freeing tag maps Jens Axboe
                   ` (7 subsequent siblings)
  13 siblings, 2 replies; 62+ messages in thread
From: Jens Axboe @ 2017-01-11 21:39 UTC (permalink / raw)
  To: linux-kernel, linux-block; +Cc: osandov, bart.vanassche, Jens Axboe

This is in preparation for having another tag set available. Clean up
the parameters, and allow passing the tags explicitly to blk_mq_put_tag().
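
The point of threading the tags through explicitly is that the caller
decides which tag set a tag goes back to. With the scheduler tag map
added later in the series, a free could then plausibly look like this
(sketch only):

	/* driver tag, as today */
	blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);

	/* scheduler-owned tag, once hctx->sched_tags exists (patch 8) */
	blk_mq_put_tag(hctx, hctx->sched_tags, ctx, rq->internal_tag);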

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq-tag.c | 58 ++++++++++++++++++++++++++++++------------------------
 block/blk-mq-tag.h |  4 ++--
 block/blk-mq.c     |  2 +-
 block/blk-mq.h     |  5 +++++
 4 files changed, 40 insertions(+), 29 deletions(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index dcf5ce3ba4bf..0ff8df347bfb 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -97,25 +97,35 @@ static int __bt_get(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt)
 	return __sbitmap_queue_get(bt);
 }
 
-static int bt_get(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt,
-		  struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags)
+static struct sbitmap_queue *bt_from_data(struct blk_mq_alloc_data *data)
 {
+	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
+
+	if (data->flags & BLK_MQ_REQ_RESERVED)
+		return &tags->breserved_tags;
+
+	return &tags->bitmap_tags;
+}
+
+static int bt_get(struct blk_mq_alloc_data *data)
+{
+	struct sbitmap_queue *bt = bt_from_data(data);
 	struct sbq_wait_state *ws;
 	DEFINE_WAIT(wait);
 	int tag;
 
-	tag = __bt_get(hctx, bt);
+	tag = __bt_get(data->hctx, bt);
 	if (tag != -1)
 		return tag;
 
 	if (data->flags & BLK_MQ_REQ_NOWAIT)
 		return -1;
 
-	ws = bt_wait_ptr(bt, hctx);
+	ws = bt_wait_ptr(bt, data->hctx);
 	do {
 		prepare_to_wait(&ws->wait, &wait, TASK_UNINTERRUPTIBLE);
 
-		tag = __bt_get(hctx, bt);
+		tag = __bt_get(data->hctx, bt);
 		if (tag != -1)
 			break;
 
@@ -125,14 +135,14 @@ static int bt_get(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt,
 		 * some to complete. Note that hctx can be NULL here for
 		 * reserved tag allocation.
 		 */
-		if (hctx)
-			blk_mq_run_hw_queue(hctx, false);
+		if (data->hctx)
+			blk_mq_run_hw_queue(data->hctx, false);
 
 		/*
 		 * Retry tag allocation after running the hardware queue,
 		 * as running the queue may also have found completions.
 		 */
-		tag = __bt_get(hctx, bt);
+		tag = __bt_get(data->hctx, bt);
 		if (tag != -1)
 			break;
 
@@ -142,14 +152,10 @@ static int bt_get(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt,
 
 		data->ctx = blk_mq_get_ctx(data->q);
 		data->hctx = blk_mq_map_queue(data->q, data->ctx->cpu);
-		if (data->flags & BLK_MQ_REQ_RESERVED) {
-			bt = &data->hctx->tags->breserved_tags;
-		} else {
-			hctx = data->hctx;
-			bt = &hctx->tags->bitmap_tags;
-		}
+		bt = bt_from_data(data);
+
 		finish_wait(&ws->wait, &wait);
-		ws = bt_wait_ptr(bt, hctx);
+		ws = bt_wait_ptr(bt, data->hctx);
 	} while (1);
 
 	finish_wait(&ws->wait, &wait);
@@ -160,25 +166,27 @@ static unsigned int __blk_mq_get_tag(struct blk_mq_alloc_data *data)
 {
 	int tag;
 
-	tag = bt_get(data, &data->hctx->tags->bitmap_tags, data->hctx,
-		     data->hctx->tags);
-	if (tag >= 0)
-		return tag + data->hctx->tags->nr_reserved_tags;
+	tag = bt_get(data);
+	if (tag >= 0) {
+		struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
+
+		return tag + tags->nr_reserved_tags;
+	}
 
 	return BLK_MQ_TAG_FAIL;
 }
 
 static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_alloc_data *data)
 {
+	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
 	int tag;
 
-	if (unlikely(!data->hctx->tags->nr_reserved_tags)) {
+	if (unlikely(!tags->nr_reserved_tags)) {
 		WARN_ON_ONCE(1);
 		return BLK_MQ_TAG_FAIL;
 	}
 
-	tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL,
-		     data->hctx->tags);
+	tag = bt_get(data);
 	if (tag < 0)
 		return BLK_MQ_TAG_FAIL;
 
@@ -192,11 +200,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 	return __blk_mq_get_tag(data);
 }
 
-void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
-		    unsigned int tag)
+void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags,
+		    struct blk_mq_ctx *ctx, unsigned int tag)
 {
-	struct blk_mq_tags *tags = hctx->tags;
-
 	if (tag >= tags->nr_reserved_tags) {
 		const int real_tag = tag - tags->nr_reserved_tags;
 
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index d1662734dc53..923602dd3bfb 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -24,8 +24,8 @@ extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int r
 extern void blk_mq_free_tags(struct blk_mq_tags *tags);
 
 extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);
-extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
-			   unsigned int tag);
+extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags,
+			   struct blk_mq_ctx *ctx, unsigned int tag);
 extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags);
 extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page);
 extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9fc521755e22..6fab8e9c724f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -335,7 +335,7 @@ void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
 
 	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
 	clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
-	blk_mq_put_tag(hctx, ctx, tag);
+	blk_mq_put_tag(hctx, hctx->tags, ctx, tag);
 	blk_queue_exit(q);
 }
 
diff --git a/block/blk-mq.h b/block/blk-mq.h
index e59f5ca520a2..48b7771eb192 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -118,6 +118,11 @@ static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data,
 	data->hctx = hctx;
 }
 
+static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data)
+{
+	return data->hctx->tags;
+}
+
 /*
  * Internal helpers for request allocation/init/free
  */
-- 
2.7.4

* [PATCH 07/10] blk-mq: abstract out helpers for allocating/freeing tag maps
  2017-01-11 21:39 [PATCHSET v6] blk-mq scheduling framework Jens Axboe
                   ` (5 preceding siblings ...)
  2017-01-11 21:39 ` [PATCH 06/10] blk-mq-tag: cleanup the normal/reserved tag allocation Jens Axboe
@ 2017-01-11 21:40 ` Jens Axboe
  2017-01-12 21:29   ` Bart Van Assche
  2017-01-11 21:40 ` [PATCH 08/10] blk-mq-sched: add framework for MQ capable IO schedulers Jens Axboe
                   ` (6 subsequent siblings)
  13 siblings, 1 reply; 62+ messages in thread
From: Jens Axboe @ 2017-01-11 21:40 UTC (permalink / raw)
  To: linux-kernel, linux-block; +Cc: osandov, bart.vanassche, Jens Axboe

Prep patch for adding an extra tag map for scheduler requests.
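
With the tag map allocation split from the request allocation, setting
up the tags for one hardware queue becomes a two-step sequence; this is
essentially what the new __blk_mq_alloc_rq_map() helper below does:

	set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
						  set->queue_depth,
						  set->reserved_tags);
	if (!set->tags[hctx_idx])
		return false;

	if (blk_mq_alloc_rqs(set, set->tags[hctx_idx], hctx_idx,
			     set->queue_depth)) {
		blk_mq_free_rq_map(set->tags[hctx_idx]);
		set->tags[hctx_idx] = NULL;
		return false;
	}

	return true;

A scheduler can then run the same two helpers against its own tag map
with its own depth, instead of set->queue_depth.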

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 119 ++++++++++++++++++++++++++++++++++++---------------------
 block/blk-mq.h |  14 ++++---
 2 files changed, 84 insertions(+), 49 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 6fab8e9c724f..3b7a087e8bf0 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1553,8 +1553,8 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
 	return cookie;
 }
 
-void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
-			unsigned int hctx_idx)
+void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
+		     unsigned int hctx_idx)
 {
 	struct page *page;
 
@@ -1580,33 +1580,30 @@ void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 		kmemleak_free(page_address(page));
 		__free_pages(page, page->private);
 	}
+}
 
+void blk_mq_free_rq_map(struct blk_mq_tags *tags)
+{
 	kfree(tags->rqs);
+	tags->rqs = NULL;
 
 	blk_mq_free_tags(tags);
 }
 
-static size_t order_to_size(unsigned int order)
-{
-	return (size_t)PAGE_SIZE << order;
-}
-
-struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
-				       unsigned int hctx_idx)
+struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
+					unsigned int hctx_idx,
+					unsigned int nr_tags,
+					unsigned int reserved_tags)
 {
 	struct blk_mq_tags *tags;
-	unsigned int i, j, entries_per_page, max_order = 4;
-	size_t rq_size, left;
 
-	tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags,
+	tags = blk_mq_init_tags(nr_tags, reserved_tags,
 				set->numa_node,
 				BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
 	if (!tags)
 		return NULL;
 
-	INIT_LIST_HEAD(&tags->page_list);
-
-	tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *),
+	tags->rqs = kzalloc_node(nr_tags * sizeof(struct request *),
 				 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
 				 set->numa_node);
 	if (!tags->rqs) {
@@ -1614,15 +1611,31 @@ struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
 		return NULL;
 	}
 
+	return tags;
+}
+
+static size_t order_to_size(unsigned int order)
+{
+	return (size_t)PAGE_SIZE << order;
+}
+
+int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
+		     unsigned int hctx_idx, unsigned int depth)
+{
+	unsigned int i, j, entries_per_page, max_order = 4;
+	size_t rq_size, left;
+
+	INIT_LIST_HEAD(&tags->page_list);
+
 	/*
 	 * rq_size is the size of the request plus driver payload, rounded
 	 * to the cacheline size
 	 */
 	rq_size = round_up(sizeof(struct request) + set->cmd_size,
 				cache_line_size());
-	left = rq_size * set->queue_depth;
+	left = rq_size * depth;
 
-	for (i = 0; i < set->queue_depth; ) {
+	for (i = 0; i < depth; ) {
 		int this_order = max_order;
 		struct page *page;
 		int to_do;
@@ -1656,7 +1669,7 @@ struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
 		 */
 		kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO);
 		entries_per_page = order_to_size(this_order) / rq_size;
-		to_do = min(entries_per_page, set->queue_depth - i);
+		to_do = min(entries_per_page, depth - i);
 		left -= to_do * rq_size;
 		for (j = 0; j < to_do; j++) {
 			tags->rqs[i] = p;
@@ -1673,11 +1686,11 @@ struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
 			i++;
 		}
 	}
-	return tags;
+	return 0;
 
 fail:
-	blk_mq_free_rq_map(set, tags, hctx_idx);
-	return NULL;
+	blk_mq_free_rqs(set, tags, hctx_idx);
+	return -ENOMEM;
 }
 
 /*
@@ -1869,6 +1882,33 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
 	}
 }
 
+static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx)
+{
+	int ret = 0;
+
+	set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
+					set->queue_depth, set->reserved_tags);
+	if (!set->tags[hctx_idx])
+		return false;
+
+	ret = blk_mq_alloc_rqs(set, set->tags[hctx_idx], hctx_idx,
+				set->queue_depth);
+	if (!ret)
+		return true;
+
+	blk_mq_free_rq_map(set->tags[hctx_idx]);
+	set->tags[hctx_idx] = NULL;
+	return false;
+}
+
+static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
+					 unsigned int hctx_idx)
+{
+	blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
+	blk_mq_free_rq_map(set->tags[hctx_idx]);
+	set->tags[hctx_idx] = NULL;
+}
+
 static void blk_mq_map_swqueue(struct request_queue *q,
 			       const struct cpumask *online_mask)
 {
@@ -1897,17 +1937,15 @@ static void blk_mq_map_swqueue(struct request_queue *q,
 
 		hctx_idx = q->mq_map[i];
 		/* unmapped hw queue can be remapped after CPU topo changed */
-		if (!set->tags[hctx_idx]) {
-			set->tags[hctx_idx] = blk_mq_init_rq_map(set, hctx_idx);
-
+		if (!set->tags[hctx_idx] &&
+		    !__blk_mq_alloc_rq_map(set, hctx_idx)) {
 			/*
 			 * If tags initialization fail for some hctx,
 			 * that hctx won't be brought online.  In this
 			 * case, remap the current ctx to hctx[0] which
 			 * is guaranteed to always have tags allocated
 			 */
-			if (!set->tags[hctx_idx])
-				q->mq_map[i] = 0;
+			q->mq_map[i] = 0;
 		}
 
 		ctx = per_cpu_ptr(q->queue_ctx, i);
@@ -1930,10 +1968,9 @@ static void blk_mq_map_swqueue(struct request_queue *q,
 			 * fallback in case of a new remap fails
 			 * allocation
 			 */
-			if (i && set->tags[i]) {
-				blk_mq_free_rq_map(set, set->tags[i], i);
-				set->tags[i] = NULL;
-			}
+			if (i && set->tags[i])
+				blk_mq_free_map_and_requests(set, i);
+
 			hctx->tags = NULL;
 			continue;
 		}
@@ -2100,10 +2137,8 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
 		struct blk_mq_hw_ctx *hctx = hctxs[j];
 
 		if (hctx) {
-			if (hctx->tags) {
-				blk_mq_free_rq_map(set, hctx->tags, j);
-				set->tags[j] = NULL;
-			}
+			if (hctx->tags)
+				blk_mq_free_map_and_requests(set, j);
 			blk_mq_exit_hctx(q, set, hctx, j);
 			free_cpumask_var(hctx->cpumask);
 			kobject_put(&hctx->kobj);
@@ -2299,17 +2334,15 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
 {
 	int i;
 
-	for (i = 0; i < set->nr_hw_queues; i++) {
-		set->tags[i] = blk_mq_init_rq_map(set, i);
-		if (!set->tags[i])
+	for (i = 0; i < set->nr_hw_queues; i++)
+		if (!__blk_mq_alloc_rq_map(set, i))
 			goto out_unwind;
-	}
 
 	return 0;
 
 out_unwind:
 	while (--i >= 0)
-		blk_mq_free_rq_map(set, set->tags[i], i);
+		blk_mq_free_rq_map(set->tags[i]);
 
 	return -ENOMEM;
 }
@@ -2392,12 +2425,12 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
 	if (set->nr_hw_queues > nr_cpu_ids)
 		set->nr_hw_queues = nr_cpu_ids;
 
+	ret = -ENOMEM;
 	set->tags = kzalloc_node(nr_cpu_ids * sizeof(struct blk_mq_tags *),
 				 GFP_KERNEL, set->numa_node);
 	if (!set->tags)
 		return -ENOMEM;
 
-	ret = -ENOMEM;
 	set->mq_map = kzalloc_node(sizeof(*set->mq_map) * nr_cpu_ids,
 			GFP_KERNEL, set->numa_node);
 	if (!set->mq_map)
@@ -2433,10 +2466,8 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
 {
 	int i;
 
-	for (i = 0; i < nr_cpu_ids; i++) {
-		if (set->tags[i])
-			blk_mq_free_rq_map(set, set->tags[i], i);
-	}
+	for (i = 0; i < nr_cpu_ids; i++)
+		blk_mq_free_map_and_requests(set, i);
 
 	kfree(set->mq_map);
 	set->mq_map = NULL;
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 48b7771eb192..1b279b02d0f6 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -37,17 +37,21 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
 /*
  * Internal helpers for allocating/freeing the request map
  */
-void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
-			unsigned int hctx_idx);
-struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
-					unsigned int hctx_idx);
+void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
+		     unsigned int hctx_idx);
+void blk_mq_free_rq_map(struct blk_mq_tags *tags);
+struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
+					unsigned int hctx_idx,
+					unsigned int nr_tags,
+					unsigned int reserved_tags);
+int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
+		     unsigned int hctx_idx, unsigned int depth);
 
 /*
  * Internal helpers for request insertion into sw queues
  */
 void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 				bool at_head);
-
 /*
  * CPU hotplug helpers
  */
-- 
2.7.4

* [PATCH 08/10] blk-mq-sched: add framework for MQ capable IO schedulers
  2017-01-11 21:39 [PATCHSET v6] blk-mq scheduling framework Jens Axboe
                   ` (6 preceding siblings ...)
  2017-01-11 21:40 ` [PATCH 07/10] blk-mq: abstract out helpers for allocating/freeing tag maps Jens Axboe
@ 2017-01-11 21:40 ` Jens Axboe
  2017-01-12 21:45   ` Bart Van Assche
  2017-01-13 11:15   ` Hannes Reinecke
  2017-01-11 21:40 ` [PATCH 09/10] mq-deadline: add blk-mq adaptation of the deadline IO scheduler Jens Axboe
                   ` (5 subsequent siblings)
  13 siblings, 2 replies; 62+ messages in thread
From: Jens Axboe @ 2017-01-11 21:40 UTC (permalink / raw)
  To: linux-kernel, linux-block; +Cc: osandov, bart.vanassche, Jens Axboe

This adds a set of hooks that intercepts the blk-mq path of
allocating/inserting/issuing/completing requests, allowing
us to develop a scheduler within that framework.

We reuse the existing elevator scheduler API on the registration
side, but augment that with the scheduler flagging support for
the blk-mq interface, and with a separate set of ops hooks for MQ
devices.

We split driver and scheduler tags, so we can run the scheduling
independent of device queue depth.
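
For a rough picture of the registration side, a sketch only ('foo' and
its callbacks are made up here; mq-deadline in the next patch is the
real example):

	static struct elevator_type foo_mq_sched = {
		.ops.mq = {
			.get_request		= foo_get_request,
			.put_request		= foo_put_request,
			.dispatch_requests	= foo_dispatch_requests,
			.bio_merge		= foo_bio_merge,
		},
		.uses_mq	= true,
		.elevator_name	= "foo-mq",
		.elevator_owner	= THIS_MODULE,
	};

	static int __init foo_mq_init(void)
	{
		return elv_register(&foo_mq_sched);
	}
	module_init(foo_mq_init);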

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/Makefile           |   2 +-
 block/blk-core.c         |   3 +-
 block/blk-exec.c         |   3 +-
 block/blk-flush.c        |  15 ++-
 block/blk-ioc.c          |   8 +-
 block/blk-merge.c        |   2 +-
 block/blk-mq-sched.c     | 334 +++++++++++++++++++++++++++++++++++++++++++++++
 block/blk-mq-sched.h     | 155 ++++++++++++++++++++++
 block/blk-mq-sysfs.c     |  13 ++
 block/blk-mq.c           | 258 +++++++++++++++++++-----------------
 block/blk-mq.h           |   8 +-
 block/blk-tag.c          |   1 +
 block/elevator.c         | 201 +++++++++++++++++++++-------
 include/linux/blk-mq.h   |   5 +-
 include/linux/blkdev.h   |   4 +-
 include/linux/elevator.h |  32 +++++
 16 files changed, 865 insertions(+), 179 deletions(-)
 create mode 100644 block/blk-mq-sched.c
 create mode 100644 block/blk-mq-sched.h

diff --git a/block/Makefile b/block/Makefile
index a827f988c4e6..2eee9e1bb6db 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -6,7 +6,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
 			blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
 			blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
 			blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
-			blk-mq-sysfs.o blk-mq-cpumap.o ioctl.o \
+			blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
 			genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
 			badblocks.o partitions/
 
diff --git a/block/blk-core.c b/block/blk-core.c
index 92baea07acbc..ee3a6f340cb8 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -39,6 +39,7 @@
 
 #include "blk.h"
 #include "blk-mq.h"
+#include "blk-mq-sched.h"
 #include "blk-wbt.h"
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
@@ -2127,7 +2128,7 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
 	if (q->mq_ops) {
 		if (blk_queue_io_stat(q))
 			blk_account_io_start(rq, true);
-		blk_mq_insert_request(rq, false, true, false);
+		blk_mq_sched_insert_request(rq, false, true, false);
 		return 0;
 	}
 
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 3ecb00a6cf45..86656fdfa637 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -9,6 +9,7 @@
 #include <linux/sched/sysctl.h>
 
 #include "blk.h"
+#include "blk-mq-sched.h"
 
 /*
  * for max sense size
@@ -65,7 +66,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 	 * be reused after dying flag is set
 	 */
 	if (q->mq_ops) {
-		blk_mq_insert_request(rq, at_head, true, false);
+		blk_mq_sched_insert_request(rq, at_head, true, false);
 		return;
 	}
 
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 20b7c7a02f1c..29850d10b30a 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -74,6 +74,7 @@
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-tag.h"
+#include "blk-mq-sched.h"
 
 /* FLUSH/FUA sequences */
 enum {
@@ -169,6 +170,8 @@ static bool blk_flush_complete_seq(struct request *rq,
 	struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
 	bool queued = false, kicked;
 
+	BUG_ON(q->mq_ops && rq->tag < 0);
+
 	BUG_ON(rq->flush.seq & seq);
 	rq->flush.seq |= seq;
 
@@ -318,6 +321,8 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq)
 	if (q->mq_ops) {
 		struct blk_mq_hw_ctx *hctx;
 
+		BUG_ON(first_rq->tag < 0);
+
 		flush_rq->mq_ctx = first_rq->mq_ctx;
 		flush_rq->tag = first_rq->tag;
 		fq->orig_rq = first_rq;
@@ -451,11 +456,11 @@ void blk_insert_flush(struct request *rq)
 	 * processed directly without going through flush machinery.  Queue
 	 * for normal execution.
 	 */
-	if ((policy & REQ_FSEQ_DATA) &&
-	    !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
-		if (q->mq_ops) {
-			blk_mq_insert_request(rq, false, true, false);
-		} else
+	if (((policy & REQ_FSEQ_DATA) &&
+	     !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH)))) {
+		if (q->mq_ops)
+			blk_mq_sched_insert_request(rq, false, true, false);
+		else
 			list_add_tail(&rq->queuelist, &q->queue_head);
 		return;
 	}
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index ab372092a57d..fe186a9eade9 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -43,7 +43,9 @@ static void ioc_exit_icq(struct io_cq *icq)
 	if (icq->flags & ICQ_EXITED)
 		return;
 
-	if (et->ops.sq.elevator_exit_icq_fn)
+	if (et->uses_mq && et->ops.mq.exit_icq)
+		et->ops.mq.exit_icq(icq);
+	else if (!et->uses_mq && et->ops.sq.elevator_exit_icq_fn)
 		et->ops.sq.elevator_exit_icq_fn(icq);
 
 	icq->flags |= ICQ_EXITED;
@@ -383,7 +385,9 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
 	if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) {
 		hlist_add_head(&icq->ioc_node, &ioc->icq_list);
 		list_add(&icq->q_node, &q->icq_list);
-		if (et->ops.sq.elevator_init_icq_fn)
+		if (et->uses_mq && et->ops.mq.init_icq)
+			et->ops.mq.init_icq(icq);
+		else if (!et->uses_mq && et->ops.sq.elevator_init_icq_fn)
 			et->ops.sq.elevator_init_icq_fn(icq);
 	} else {
 		kmem_cache_free(et->icq_cache, icq);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 480570b691dc..6aa43dec5af4 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -763,7 +763,7 @@ int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->type->ops.sq.elevator_allow_rq_merge_fn)
+	if (!e->uses_mq && e->type->ops.sq.elevator_allow_rq_merge_fn)
 		if (!e->type->ops.sq.elevator_allow_rq_merge_fn(q, rq, next))
 			return 0;
 
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
new file mode 100644
index 000000000000..44cf30eb1589
--- /dev/null
+++ b/block/blk-mq-sched.c
@@ -0,0 +1,334 @@
+/*
+ * blk-mq scheduling framework
+ *
+ * Copyright (C) 2016 Jens Axboe
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/blk-mq.h>
+
+#include <trace/events/block.h>
+
+#include "blk.h"
+#include "blk-mq.h"
+#include "blk-mq-sched.h"
+#include "blk-mq-tag.h"
+#include "blk-wbt.h"
+
+void blk_mq_sched_free_hctx_data(struct request_queue *q,
+				 void (*exit)(struct blk_mq_hw_ctx *))
+{
+	struct blk_mq_hw_ctx *hctx;
+	int i;
+
+	queue_for_each_hw_ctx(q, hctx, i) {
+		if (exit)
+			exit(hctx);
+		kfree(hctx->sched_data);
+		hctx->sched_data = NULL;
+	}
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);
+
+int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size,
+				int (*init)(struct blk_mq_hw_ctx *),
+				void (*exit)(struct blk_mq_hw_ctx *))
+{
+	struct blk_mq_hw_ctx *hctx;
+	int ret;
+	int i;
+
+	queue_for_each_hw_ctx(q, hctx, i) {
+		hctx->sched_data = kmalloc_node(size, GFP_KERNEL, hctx->numa_node);
+		if (!hctx->sched_data) {
+			ret = -ENOMEM;
+			goto error;
+		}
+
+		if (init) {
+			ret = init(hctx);
+			if (ret) {
+				/*
+				 * We don't want to give exit() a partially
+				 * initialized sched_data. init() must clean up
+				 * if it fails.
+				 */
+				kfree(hctx->sched_data);
+				hctx->sched_data = NULL;
+				goto error;
+			}
+		}
+	}
+
+	return 0;
+error:
+	blk_mq_sched_free_hctx_data(q, exit);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_init_hctx_data);
+
+static void __blk_mq_sched_assign_ioc(struct request_queue *q,
+				      struct request *rq, struct io_context *ioc)
+{
+	struct io_cq *icq;
+
+	spin_lock_irq(q->queue_lock);
+	icq = ioc_lookup_icq(ioc, q);
+	spin_unlock_irq(q->queue_lock);
+
+	if (!icq) {
+		icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
+		if (!icq)
+			return;
+	}
+
+	rq->elv.icq = icq;
+	if (!blk_mq_sched_get_rq_priv(q, rq)) {
+		get_io_context(icq->ioc);
+		return;
+	}
+
+	rq->elv.icq = NULL;
+}
+
+static void blk_mq_sched_assign_ioc(struct request_queue *q,
+				    struct request *rq, struct bio *bio)
+{
+	struct io_context *ioc;
+
+	ioc = rq_ioc(bio);
+	if (ioc)
+		__blk_mq_sched_assign_ioc(q, rq, ioc);
+}
+
+struct request *blk_mq_sched_get_request(struct request_queue *q,
+					 struct bio *bio,
+					 unsigned int op,
+					 struct blk_mq_alloc_data *data)
+{
+	struct elevator_queue *e = q->elevator;
+	struct blk_mq_hw_ctx *hctx;
+	struct blk_mq_ctx *ctx;
+	struct request *rq;
+
+	blk_queue_enter_live(q);
+	ctx = blk_mq_get_ctx(q);
+	hctx = blk_mq_map_queue(q, ctx->cpu);
+
+	blk_mq_set_alloc_data(data, q, 0, ctx, hctx);
+
+	if (e) {
+		data->flags |= BLK_MQ_REQ_INTERNAL;
+		if (e->type->ops.mq.get_request)
+			rq = e->type->ops.mq.get_request(q, op, data);
+		else
+			rq = __blk_mq_alloc_request(data, op);
+	} else {
+		rq = __blk_mq_alloc_request(data, op);
+		if (rq) {
+			rq->tag = rq->internal_tag;
+			rq->internal_tag = -1;
+		}
+	}
+
+	if (rq) {
+		rq->elv.icq = NULL;
+		if (e && e->type->icq_cache)
+			blk_mq_sched_assign_ioc(q, rq, bio);
+		data->hctx->queued++;
+		return rq;
+	}
+
+	blk_queue_exit(q);
+	return NULL;
+}
+
+void blk_mq_sched_put_request(struct request *rq)
+{
+	struct request_queue *q = rq->q;
+	struct elevator_queue *e = q->elevator;
+
+	if (rq->rq_flags & RQF_ELVPRIV) {
+		blk_mq_sched_put_rq_priv(rq->q, rq);
+		if (rq->elv.icq) {
+			put_io_context(rq->elv.icq->ioc);
+			rq->elv.icq = NULL;
+		}
+	}
+
+	if (e && e->type->ops.mq.put_request)
+		e->type->ops.mq.put_request(rq);
+	else
+		blk_mq_finish_request(rq);
+}
+
+void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
+{
+	struct elevator_queue *e = hctx->queue->elevator;
+	LIST_HEAD(rq_list);
+
+	if (unlikely(blk_mq_hctx_stopped(hctx)))
+		return;
+
+	hctx->run++;
+
+	/*
+	 * If we have previous entries on our dispatch list, grab them first for
+	 * more fair dispatch.
+	 */
+	if (!list_empty_careful(&hctx->dispatch)) {
+		spin_lock(&hctx->lock);
+		if (!list_empty(&hctx->dispatch))
+			list_splice_init(&hctx->dispatch, &rq_list);
+		spin_unlock(&hctx->lock);
+	}
+
+	/*
+	 * Only ask the scheduler for requests, if we didn't have residual
+	 * requests from the dispatch list. This is to avoid the case where
+	 * we only ever dispatch a fraction of the requests available because
+	 * of low device queue depth. Once we pull requests out of the IO
+	 * scheduler, we can no longer merge or sort them. So it's best to
+	 * leave them there for as long as we can. Mark the hw queue as
+	 * needing a restart in that case.
+	 */
+	if (list_empty(&rq_list)) {
+		if (e && e->type->ops.mq.dispatch_requests)
+			e->type->ops.mq.dispatch_requests(hctx, &rq_list);
+		else
+			blk_mq_flush_busy_ctxs(hctx, &rq_list);
+	} else if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
+		set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
+
+	blk_mq_dispatch_rq_list(hctx, &rq_list);
+}
+
+bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio)
+{
+	struct request *rq;
+	int ret;
+
+	ret = elv_merge(q, &rq, bio);
+	if (ret == ELEVATOR_BACK_MERGE) {
+		if (bio_attempt_back_merge(q, rq, bio)) {
+			if (!attempt_back_merge(q, rq))
+				elv_merged_request(q, rq, ret);
+			return true;
+		}
+	} else if (ret == ELEVATOR_FRONT_MERGE) {
+		if (bio_attempt_front_merge(q, rq, bio)) {
+			if (!attempt_front_merge(q, rq))
+				elv_merged_request(q, rq, ret);
+			return true;
+		}
+	}
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);
+
+bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
+{
+	struct elevator_queue *e = q->elevator;
+
+	if (e->type->ops.mq.bio_merge) {
+		struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
+		struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
+
+		blk_mq_put_ctx(ctx);
+		return e->type->ops.mq.bio_merge(hctx, bio);
+	}
+
+	return false;
+}
+
+bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq)
+{
+	return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq);
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);
+
+void blk_mq_sched_request_inserted(struct request *rq)
+{
+	trace_block_rq_insert(rq->q, rq);
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted);
+
+int blk_mq_sched_setup(struct request_queue *q)
+{
+	struct blk_mq_tag_set *set = q->tag_set;
+	struct blk_mq_hw_ctx *hctx;
+	int ret, i;
+
+	/*
+	 * Default to 256, since we don't split into sync/async like the
+	 * old code did. Additionally, this is a per-hw queue depth.
+	 */
+	q->nr_requests = 2 * BLKDEV_MAX_RQ;
+
+	/*
+	 * We're switching to using an IO scheduler, so setup the hctx
+	 * scheduler tags and switch the request map from the regular
+	 * tags to scheduler tags. First allocate what we need, so we
+	 * can safely fail and fallback, if needed.
+	 */
+	ret = 0;
+	queue_for_each_hw_ctx(q, hctx, i) {
+		hctx->sched_tags = blk_mq_alloc_rq_map(set, i, q->nr_requests, 0);
+		if (!hctx->sched_tags) {
+			ret = -ENOMEM;
+			break;
+		}
+		ret = blk_mq_alloc_rqs(set, hctx->sched_tags, i, q->nr_requests);
+		if (ret)
+			break;
+	}
+
+	/*
+	 * If we failed, free what we did allocate
+	 */
+	if (ret) {
+		queue_for_each_hw_ctx(q, hctx, i) {
+			if (!hctx->sched_tags)
+				continue;
+			blk_mq_free_rqs(set, hctx->sched_tags, i);
+			blk_mq_free_rq_map(hctx->sched_tags);
+			hctx->sched_tags = NULL;
+		}
+
+		return ret;
+	}
+
+	queue_for_each_hw_ctx(q, hctx, i)
+		blk_mq_free_rqs(set, hctx->tags, i);
+
+	return 0;
+}
+
+int blk_mq_sched_teardown(struct request_queue *q)
+{
+	struct blk_mq_tag_set *set = q->tag_set;
+	struct blk_mq_hw_ctx *hctx;
+	int i, ret;
+
+	ret = 0;
+	queue_for_each_hw_ctx(q, hctx, i) {
+		ret = blk_mq_alloc_rqs(set, hctx->tags, i, set->queue_depth);
+		if (ret)
+			break;
+	}
+
+	if (ret) {
+		queue_for_each_hw_ctx(q, hctx, i)
+			blk_mq_free_rqs(set, hctx->tags, i);
+		return ret;
+	}
+
+	queue_for_each_hw_ctx(q, hctx, i) {
+		blk_mq_free_rqs(set, hctx->sched_tags, i);
+		blk_mq_free_rq_map(hctx->sched_tags);
+		hctx->sched_tags = NULL;
+	}
+
+	return 0;
+}
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
new file mode 100644
index 000000000000..68d6a202b827
--- /dev/null
+++ b/block/blk-mq-sched.h
@@ -0,0 +1,155 @@
+#ifndef BLK_MQ_SCHED_H
+#define BLK_MQ_SCHED_H
+
+#include "blk-mq.h"
+#include "blk-mq-tag.h"
+
+int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size,
+				int (*init)(struct blk_mq_hw_ctx *),
+				void (*exit)(struct blk_mq_hw_ctx *));
+
+void blk_mq_sched_free_hctx_data(struct request_queue *q,
+				 void (*exit)(struct blk_mq_hw_ctx *));
+
+struct request *blk_mq_sched_get_request(struct request_queue *q, struct bio *bio, unsigned int op, struct blk_mq_alloc_data *data);
+void blk_mq_sched_put_request(struct request *rq);
+
+void __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
+void blk_mq_sched_request_inserted(struct request *rq);
+bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio);
+bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio);
+bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq);
+
+void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
+
+int blk_mq_sched_setup(struct request_queue *q);
+int blk_mq_sched_teardown(struct request_queue *q);
+
+static inline bool
+blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
+{
+	struct elevator_queue *e = q->elevator;
+
+	if (!e || blk_queue_nomerges(q) || !bio_mergeable(bio))
+		return false;
+
+	return __blk_mq_sched_bio_merge(q, bio);
+}
+
+static inline int blk_mq_sched_get_rq_priv(struct request_queue *q,
+					   struct request *rq)
+{
+	struct elevator_queue *e = q->elevator;
+
+	if (e && e->type->ops.mq.get_rq_priv)
+		return e->type->ops.mq.get_rq_priv(q, rq);
+
+	return 0;
+}
+
+static inline void blk_mq_sched_put_rq_priv(struct request_queue *q,
+					    struct request *rq)
+{
+	struct elevator_queue *e = q->elevator;
+
+	if (e && e->type->ops.mq.put_rq_priv)
+		e->type->ops.mq.put_rq_priv(q, rq);
+}
+
+static inline void
+blk_mq_sched_insert_request(struct request *rq, bool at_head, bool run_queue,
+			    bool async)
+{
+	struct request_queue *q = rq->q;
+	struct elevator_queue *e = q->elevator;
+	struct blk_mq_ctx *ctx = rq->mq_ctx;
+	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
+
+	if (e && e->type->ops.mq.insert_requests) {
+		LIST_HEAD(list);
+
+		list_add(&rq->queuelist, &list);
+		e->type->ops.mq.insert_requests(hctx, &list, at_head);
+	} else {
+		spin_lock(&ctx->lock);
+		__blk_mq_insert_request(hctx, rq, at_head);
+		spin_unlock(&ctx->lock);
+	}
+
+	if (run_queue)
+		blk_mq_run_hw_queue(hctx, async);
+}
+
+static inline void
+blk_mq_sched_insert_requests(struct request_queue *q, struct blk_mq_ctx *ctx,
+			     struct list_head *list, bool run_queue_async)
+{
+	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
+	struct elevator_queue *e = hctx->queue->elevator;
+
+	if (e && e->type->ops.mq.insert_requests)
+		e->type->ops.mq.insert_requests(hctx, list, false);
+	else
+		blk_mq_insert_requests(hctx, ctx, list);
+
+	blk_mq_run_hw_queue(hctx, run_queue_async);
+}
+
+static inline bool
+blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
+			 struct bio *bio)
+{
+	struct elevator_queue *e = q->elevator;
+
+	if (e && e->type->ops.mq.allow_merge)
+		return e->type->ops.mq.allow_merge(q, rq, bio);
+
+	return true;
+}
+
+static inline void
+blk_mq_sched_completed_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
+{
+	struct elevator_queue *e = hctx->queue->elevator;
+
+	if (e && e->type->ops.mq.completed_request)
+		e->type->ops.mq.completed_request(hctx, rq);
+
+	BUG_ON(rq->internal_tag == -1);
+
+	blk_mq_put_tag(hctx, hctx->sched_tags, rq->mq_ctx, rq->internal_tag);
+
+	if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) {
+		clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
+		blk_mq_run_hw_queue(hctx, true);
+	}
+}
+
+static inline void blk_mq_sched_started_request(struct request *rq)
+{
+	struct request_queue *q = rq->q;
+	struct elevator_queue *e = q->elevator;
+
+	if (e && e->type->ops.mq.started_request)
+		e->type->ops.mq.started_request(rq);
+}
+
+static inline void blk_mq_sched_requeue_request(struct request *rq)
+{
+	struct request_queue *q = rq->q;
+	struct elevator_queue *e = q->elevator;
+
+	if (e && e->type->ops.mq.requeue_request)
+		e->type->ops.mq.requeue_request(rq);
+}
+
+static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx)
+{
+	struct elevator_queue *e = hctx->queue->elevator;
+
+	if (e && e->type->ops.mq.has_work)
+		return e->type->ops.mq.has_work(hctx);
+
+	return false;
+}
+#endif
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index eacd3af72099..2caecaa98e40 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -231,6 +231,14 @@ static ssize_t blk_mq_hw_sysfs_rq_list_show(struct blk_mq_hw_ctx *hctx,
 	return ret;
 }
 
+static ssize_t blk_mq_hw_sysfs_sched_tags_show(struct blk_mq_hw_ctx *hctx, char *page)
+{
+	if (hctx->sched_tags)
+		return blk_mq_tag_sysfs_show(hctx->sched_tags, page);
+
+	return 0;
+}
+
 static ssize_t blk_mq_hw_sysfs_tags_show(struct blk_mq_hw_ctx *hctx, char *page)
 {
 	return blk_mq_tag_sysfs_show(hctx->tags, page);
@@ -345,6 +353,10 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_pending = {
 	.attr = {.name = "pending", .mode = S_IRUGO },
 	.show = blk_mq_hw_sysfs_rq_list_show,
 };
+static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_sched_tags = {
+	.attr = {.name = "sched_tags", .mode = S_IRUGO },
+	.show = blk_mq_hw_sysfs_sched_tags_show,
+};
 static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_tags = {
 	.attr = {.name = "tags", .mode = S_IRUGO },
 	.show = blk_mq_hw_sysfs_tags_show,
@@ -370,6 +382,7 @@ static struct attribute *default_hw_ctx_attrs[] = {
 	&blk_mq_hw_sysfs_dispatched.attr,
 	&blk_mq_hw_sysfs_pending.attr,
 	&blk_mq_hw_sysfs_tags.attr,
+	&blk_mq_hw_sysfs_sched_tags.attr,
 	&blk_mq_hw_sysfs_cpus.attr,
 	&blk_mq_hw_sysfs_active.attr,
 	&blk_mq_hw_sysfs_poll.attr,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3b7a087e8bf0..3180b5fac88c 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -32,6 +32,7 @@
 #include "blk-mq-tag.h"
 #include "blk-stat.h"
 #include "blk-wbt.h"
+#include "blk-mq-sched.h"
 
 static DEFINE_MUTEX(all_q_mutex);
 static LIST_HEAD(all_q_list);
@@ -41,7 +42,8 @@ static LIST_HEAD(all_q_list);
  */
 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
 {
-	return sbitmap_any_bit_set(&hctx->ctx_map);
+	return sbitmap_any_bit_set(&hctx->ctx_map) ||
+		blk_mq_sched_has_work(hctx);
 }
 
 /*
@@ -223,14 +225,17 @@ struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data,
 
 	tag = blk_mq_get_tag(data);
 	if (tag != BLK_MQ_TAG_FAIL) {
-		rq = data->hctx->tags->rqs[tag];
+		struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
+
+		rq = tags->rqs[tag];
 
 		if (blk_mq_tag_busy(data->hctx)) {
 			rq->rq_flags = RQF_MQ_INFLIGHT;
 			atomic_inc(&data->hctx->nr_active);
 		}
 
-		rq->tag = tag;
+		rq->tag = -1;
+		rq->internal_tag = tag;
 		blk_mq_rq_ctx_init(data->q, data->ctx, rq, op);
 		return rq;
 	}
@@ -242,26 +247,21 @@ EXPORT_SYMBOL_GPL(__blk_mq_alloc_request);
 struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
 		unsigned int flags)
 {
-	struct blk_mq_ctx *ctx;
-	struct blk_mq_hw_ctx *hctx;
-	struct request *rq;
 	struct blk_mq_alloc_data alloc_data;
+	struct request *rq;
 	int ret;
 
 	ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT);
 	if (ret)
 		return ERR_PTR(ret);
 
-	ctx = blk_mq_get_ctx(q);
-	hctx = blk_mq_map_queue(q, ctx->cpu);
-	blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
-	rq = __blk_mq_alloc_request(&alloc_data, rw);
-	blk_mq_put_ctx(ctx);
+	rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data);
 
-	if (!rq) {
-		blk_queue_exit(q);
+	blk_mq_put_ctx(alloc_data.ctx);
+	blk_queue_exit(q);
+
+	if (!rq)
 		return ERR_PTR(-EWOULDBLOCK);
-	}
 
 	rq->__data_len = 0;
 	rq->__sector = (sector_t) -1;
@@ -313,6 +313,9 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw,
 		goto out_queue_exit;
 	}
 
+	rq->tag = rq->internal_tag;
+	rq->internal_tag = -1;
+
 	return rq;
 
 out_queue_exit:
@@ -321,10 +324,10 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw,
 }
 EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
 
-void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
-			   struct request *rq)
+void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
+			     struct request *rq)
 {
-	const int tag = rq->tag;
+	const int sched_tag = rq->internal_tag;
 	struct request_queue *q = rq->q;
 
 	if (rq->rq_flags & RQF_MQ_INFLIGHT)
@@ -335,22 +338,32 @@ void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
 
 	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
 	clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
-	blk_mq_put_tag(hctx, hctx->tags, ctx, tag);
+	if (rq->tag != -1) {
+		WARN_ON_ONCE(hctx->tags->rqs[rq->tag] != rq);
+		blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
+	}
+	if (sched_tag != -1)
+		blk_mq_sched_completed_request(hctx, rq);
 	blk_queue_exit(q);
 }
 
-static void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx,
+static void blk_mq_finish_hctx_request(struct blk_mq_hw_ctx *hctx,
 				     struct request *rq)
 {
 	struct blk_mq_ctx *ctx = rq->mq_ctx;
 
 	ctx->rq_completed[rq_is_sync(rq)]++;
-	__blk_mq_free_request(hctx, ctx, rq);
+	__blk_mq_finish_request(hctx, ctx, rq);
+}
+
+void blk_mq_finish_request(struct request *rq)
+{
+	blk_mq_finish_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq);
 }
 
 void blk_mq_free_request(struct request *rq)
 {
-	blk_mq_free_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq);
+	blk_mq_sched_put_request(rq);
 }
 EXPORT_SYMBOL_GPL(blk_mq_free_request);
 
@@ -468,6 +481,8 @@ void blk_mq_start_request(struct request *rq)
 {
 	struct request_queue *q = rq->q;
 
+	blk_mq_sched_started_request(rq);
+
 	trace_block_rq_issue(q, rq);
 
 	rq->resid_len = blk_rq_bytes(rq);
@@ -516,6 +531,7 @@ static void __blk_mq_requeue_request(struct request *rq)
 
 	trace_block_rq_requeue(q, rq);
 	wbt_requeue(q->rq_wb, &rq->issue_stat);
+	blk_mq_sched_requeue_request(rq);
 
 	if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
 		if (q->dma_drain_size && blk_rq_bytes(rq))
@@ -550,13 +566,13 @@ static void blk_mq_requeue_work(struct work_struct *work)
 
 		rq->rq_flags &= ~RQF_SOFTBARRIER;
 		list_del_init(&rq->queuelist);
-		blk_mq_insert_request(rq, true, false, false);
+		blk_mq_sched_insert_request(rq, true, false, false);
 	}
 
 	while (!list_empty(&rq_list)) {
 		rq = list_entry(rq_list.next, struct request, queuelist);
 		list_del_init(&rq->queuelist);
-		blk_mq_insert_request(rq, false, false, false);
+		blk_mq_sched_insert_request(rq, false, false, false);
 	}
 
 	blk_mq_run_hw_queues(q, false);
@@ -762,8 +778,16 @@ static bool blk_mq_attempt_merge(struct request_queue *q,
 
 		if (!blk_rq_merge_ok(rq, bio))
 			continue;
+		if (!blk_mq_sched_allow_merge(q, rq, bio))
+			break;
 
 		el_ret = blk_try_merge(rq, bio);
+		if (el_ret == ELEVATOR_NO_MERGE)
+			continue;
+
+		if (!blk_mq_sched_allow_merge(q, rq, bio))
+			break;
+
 		if (el_ret == ELEVATOR_BACK_MERGE) {
 			if (bio_attempt_back_merge(q, rq, bio)) {
 				ctx->rq_merged++;
@@ -823,6 +847,35 @@ static inline unsigned int queued_to_index(unsigned int queued)
 	return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
 }
 
+static bool blk_mq_get_driver_tag(struct request *rq,
+				  struct blk_mq_hw_ctx **hctx, bool wait)
+{
+	struct blk_mq_alloc_data data = {
+		.q = rq->q,
+		.ctx = rq->mq_ctx,
+		.hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu),
+		.flags = wait ? 0 : BLK_MQ_REQ_NOWAIT,
+	};
+
+	if (blk_mq_hctx_stopped(data.hctx))
+		return false;
+
+	if (rq->tag != -1) {
+done:
+		if (hctx)
+			*hctx = data.hctx;
+		return true;
+	}
+
+	rq->tag = blk_mq_get_tag(&data);
+	if (rq->tag >= 0) {
+		data.hctx->tags->rqs[rq->tag] = rq;
+		goto done;
+	}
+
+	return false;
+}
+
 bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 {
 	struct request_queue *q = hctx->queue;
@@ -845,6 +898,8 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 		struct blk_mq_queue_data bd;
 
 		rq = list_first_entry(list, struct request, queuelist);
+		if (!blk_mq_get_driver_tag(rq, &hctx, false))
+			break;
 		list_del_init(&rq->queuelist);
 
 		bd.rq = rq;
@@ -905,41 +960,6 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 	return ret != BLK_MQ_RQ_QUEUE_BUSY;
 }
 
-/*
- * Run this hardware queue, pulling any software queues mapped to it in.
- * Note that this function currently has various problems around ordering
- * of IO. In particular, we'd like FIFO behaviour on handling existing
- * items on the hctx->dispatch list. Ignore that for now.
- */
-static void blk_mq_process_rq_list(struct blk_mq_hw_ctx *hctx)
-{
-	LIST_HEAD(rq_list);
-	LIST_HEAD(driver_list);
-
-	if (unlikely(blk_mq_hctx_stopped(hctx)))
-		return;
-
-	hctx->run++;
-
-	/*
-	 * Touch any software queue that has pending entries.
-	 */
-	blk_mq_flush_busy_ctxs(hctx, &rq_list);
-
-	/*
-	 * If we have previous entries on our dispatch list, grab them
-	 * and stuff them at the front for more fair dispatch.
-	 */
-	if (!list_empty_careful(&hctx->dispatch)) {
-		spin_lock(&hctx->lock);
-		if (!list_empty(&hctx->dispatch))
-			list_splice_init(&hctx->dispatch, &rq_list);
-		spin_unlock(&hctx->lock);
-	}
-
-	blk_mq_dispatch_rq_list(hctx, &rq_list);
-}
-
 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
 	int srcu_idx;
@@ -949,11 +969,11 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 
 	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
 		rcu_read_lock();
-		blk_mq_process_rq_list(hctx);
+		blk_mq_sched_dispatch_requests(hctx);
 		rcu_read_unlock();
 	} else {
 		srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
-		blk_mq_process_rq_list(hctx);
+		blk_mq_sched_dispatch_requests(hctx);
 		srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
 	}
 }
@@ -1147,32 +1167,10 @@ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 	blk_mq_hctx_mark_pending(hctx, ctx);
 }
 
-void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
-			   bool async)
-{
-	struct blk_mq_ctx *ctx = rq->mq_ctx;
-	struct request_queue *q = rq->q;
-	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
-
-	spin_lock(&ctx->lock);
-	__blk_mq_insert_request(hctx, rq, at_head);
-	spin_unlock(&ctx->lock);
-
-	if (run_queue)
-		blk_mq_run_hw_queue(hctx, async);
-}
-
-static void blk_mq_insert_requests(struct request_queue *q,
-				     struct blk_mq_ctx *ctx,
-				     struct list_head *list,
-				     int depth,
-				     bool from_schedule)
+void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
+			    struct list_head *list)
 
 {
-	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
-
-	trace_block_unplug(q, depth, !from_schedule);
-
 	/*
 	 * preemption doesn't flush plug list, so it's possible ctx->cpu is
 	 * offline now
@@ -1188,8 +1186,6 @@ static void blk_mq_insert_requests(struct request_queue *q,
 	}
 	blk_mq_hctx_mark_pending(hctx, ctx);
 	spin_unlock(&ctx->lock);
-
-	blk_mq_run_hw_queue(hctx, from_schedule);
 }
 
 static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
@@ -1225,9 +1221,10 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 		BUG_ON(!rq->q);
 		if (rq->mq_ctx != this_ctx) {
 			if (this_ctx) {
-				blk_mq_insert_requests(this_q, this_ctx,
-							&ctx_list, depth,
-							from_schedule);
+				trace_block_unplug(this_q, depth, from_schedule);
+				blk_mq_sched_insert_requests(this_q, this_ctx,
+								&ctx_list,
+								from_schedule);
 			}
 
 			this_ctx = rq->mq_ctx;
@@ -1244,8 +1241,9 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
 	 * on 'ctx_list'. Do those.
 	 */
 	if (this_ctx) {
-		blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth,
-				       from_schedule);
+		trace_block_unplug(this_q, depth, from_schedule);
+		blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list,
+						from_schedule);
 	}
 }
 
@@ -1283,46 +1281,31 @@ static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx,
 		}
 
 		spin_unlock(&ctx->lock);
-		__blk_mq_free_request(hctx, ctx, rq);
+		__blk_mq_finish_request(hctx, ctx, rq);
 		return true;
 	}
 }
 
-static struct request *blk_mq_map_request(struct request_queue *q,
-					  struct bio *bio,
-					  struct blk_mq_alloc_data *data)
-{
-	struct blk_mq_hw_ctx *hctx;
-	struct blk_mq_ctx *ctx;
-	struct request *rq;
-
-	blk_queue_enter_live(q);
-	ctx = blk_mq_get_ctx(q);
-	hctx = blk_mq_map_queue(q, ctx->cpu);
-
-	trace_block_getrq(q, bio, bio->bi_opf);
-	blk_mq_set_alloc_data(data, q, 0, ctx, hctx);
-	rq = __blk_mq_alloc_request(data, bio->bi_opf);
-
-	data->hctx->queued++;
-	return rq;
-}
-
 static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie)
 {
-	int ret;
 	struct request_queue *q = rq->q;
-	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
 	struct blk_mq_queue_data bd = {
 		.rq = rq,
 		.list = NULL,
 		.last = 1
 	};
-	blk_qc_t new_cookie = blk_tag_to_qc_t(rq->tag, hctx->queue_num);
+	struct blk_mq_hw_ctx *hctx;
+	blk_qc_t new_cookie;
+	int ret;
 
-	if (blk_mq_hctx_stopped(hctx))
+	if (q->elevator)
 		goto insert;
 
+	if (!blk_mq_get_driver_tag(rq, &hctx, false))
+		goto insert;
+
+	new_cookie = blk_tag_to_qc_t(rq->tag, hctx->queue_num);
+
 	/*
 	 * For OK queue, we are done. For error, kill it. Any other
 	 * error (busy), just add it to our list as we previously
@@ -1344,7 +1327,7 @@ static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie)
 	}
 
 insert:
-	blk_mq_insert_request(rq, false, true, true);
+	blk_mq_sched_insert_request(rq, false, true, true);
 }
 
 /*
@@ -1377,9 +1360,14 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	    blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
 		return BLK_QC_T_NONE;
 
+	if (blk_mq_sched_bio_merge(q, bio))
+		return BLK_QC_T_NONE;
+
 	wb_acct = wbt_wait(q->rq_wb, bio, NULL);
 
-	rq = blk_mq_map_request(q, bio, &data);
+	trace_block_getrq(q, bio, bio->bi_opf);
+
+	rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data);
 	if (unlikely(!rq)) {
 		__wbt_done(q->rq_wb, wb_acct);
 		return BLK_QC_T_NONE;
@@ -1391,6 +1379,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 
 	if (unlikely(is_flush_fua)) {
 		blk_mq_bio_to_request(rq, bio);
+		blk_mq_get_driver_tag(rq, NULL, true);
 		blk_insert_flush(rq);
 		goto run_queue;
 	}
@@ -1441,6 +1430,12 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 		goto done;
 	}
 
+	if (q->elevator) {
+		blk_mq_put_ctx(data.ctx);
+		blk_mq_bio_to_request(rq, bio);
+		blk_mq_sched_insert_request(rq, false, true, true);
+		goto done;
+	}
 	if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
 		/*
 		 * For a SYNC request, send it to the hardware immediately. For
@@ -1486,9 +1481,14 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
 	} else
 		request_count = blk_plug_queued_count(q);
 
+	if (blk_mq_sched_bio_merge(q, bio))
+		return BLK_QC_T_NONE;
+
 	wb_acct = wbt_wait(q->rq_wb, bio, NULL);
 
-	rq = blk_mq_map_request(q, bio, &data);
+	trace_block_getrq(q, bio, bio->bi_opf);
+
+	rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data);
 	if (unlikely(!rq)) {
 		__wbt_done(q->rq_wb, wb_acct);
 		return BLK_QC_T_NONE;
@@ -1500,6 +1500,7 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
 
 	if (unlikely(is_flush_fua)) {
 		blk_mq_bio_to_request(rq, bio);
+		blk_mq_get_driver_tag(rq, NULL, true);
 		blk_insert_flush(rq);
 		goto run_queue;
 	}
@@ -1538,6 +1539,12 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
 		return cookie;
 	}
 
+	if (q->elevator) {
+		blk_mq_put_ctx(data.ctx);
+		blk_mq_bio_to_request(rq, bio);
+		blk_mq_sched_insert_request(rq, false, true, true);
+		goto done;
+	}
 	if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
 		/*
 		 * For a SYNC request, send it to the hardware immediately. For
@@ -1550,6 +1557,7 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
 	}
 
 	blk_mq_put_ctx(data.ctx);
+done:
 	return cookie;
 }
 
@@ -2483,14 +2491,22 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
 	struct blk_mq_hw_ctx *hctx;
 	int i, ret;
 
-	if (!set || nr > set->queue_depth)
+	if (!set)
 		return -EINVAL;
 
 	ret = 0;
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (!hctx->tags)
 			continue;
-		ret = blk_mq_tag_update_depth(hctx->tags, nr);
+		/*
+		 * If we're using an MQ scheduler, just update the scheduler
+		 * queue depth. This is similar to what the old code would do.
+		 */
+		if (!hctx->sched_tags)
+			ret = blk_mq_tag_update_depth(hctx->tags,
+							min(nr, set->queue_depth));
+		else
+			ret = blk_mq_tag_update_depth(hctx->sched_tags, nr);
 		if (ret)
 			break;
 	}
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 1b279b02d0f6..0c7c034d9ddd 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -52,6 +52,8 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
  */
 void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
 				bool at_head);
+void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
+				struct list_head *list);
 /*
  * CPU hotplug helpers
  */
@@ -124,6 +126,9 @@ static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data,
 
 static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data)
 {
+	if (data->flags & BLK_MQ_REQ_INTERNAL)
+		return data->hctx->sched_tags;
+
 	return data->hctx->tags;
 }
 
@@ -132,8 +137,9 @@ static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data
  */
 void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
 			struct request *rq, unsigned int op);
-void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
+void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
 				struct request *rq);
+void blk_mq_finish_request(struct request *rq);
 struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data,
 					unsigned int op);
 
diff --git a/block/blk-tag.c b/block/blk-tag.c
index bae1decb6ec3..07cc329fa4b0 100644
--- a/block/blk-tag.c
+++ b/block/blk-tag.c
@@ -272,6 +272,7 @@ void blk_queue_end_tag(struct request_queue *q, struct request *rq)
 	list_del_init(&rq->queuelist);
 	rq->rq_flags &= ~RQF_QUEUED;
 	rq->tag = -1;
+	rq->internal_tag = -1;
 
 	if (unlikely(bqt->tag_index[tag] == NULL))
 		printk(KERN_ERR "%s: tag %d is missing\n",
diff --git a/block/elevator.c b/block/elevator.c
index 022a26830297..79e74da26343 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -40,6 +40,7 @@
 #include <trace/events/block.h>
 
 #include "blk.h"
+#include "blk-mq-sched.h"
 
 static DEFINE_SPINLOCK(elv_list_lock);
 static LIST_HEAD(elv_list);
@@ -58,7 +59,9 @@ static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
 	struct request_queue *q = rq->q;
 	struct elevator_queue *e = q->elevator;
 
-	if (e->type->ops.sq.elevator_allow_bio_merge_fn)
+	if (e->uses_mq && e->type->ops.mq.allow_merge)
+		return e->type->ops.mq.allow_merge(q, rq, bio);
+	else if (!e->uses_mq && e->type->ops.sq.elevator_allow_bio_merge_fn)
 		return e->type->ops.sq.elevator_allow_bio_merge_fn(q, rq, bio);
 
 	return 1;
@@ -163,6 +166,7 @@ struct elevator_queue *elevator_alloc(struct request_queue *q,
 	kobject_init(&eq->kobj, &elv_ktype);
 	mutex_init(&eq->sysfs_lock);
 	hash_init(eq->hash);
+	eq->uses_mq = e->uses_mq;
 
 	return eq;
 }
@@ -219,14 +223,26 @@ int elevator_init(struct request_queue *q, char *name)
 		if (!e) {
 			printk(KERN_ERR
 				"Default I/O scheduler not found. " \
-				"Using noop.\n");
+				"Using noop/none.\n");
+			if (q->mq_ops) {
+				elevator_put(e);
+				return 0;
+			}
 			e = elevator_get("noop", false);
 		}
 	}
 
-	err = e->ops.sq.elevator_init_fn(q, e);
-	if (err)
+	if (e->uses_mq) {
+		err = blk_mq_sched_setup(q);
+		if (!err)
+			err = e->ops.mq.init_sched(q, e);
+	} else
+		err = e->ops.sq.elevator_init_fn(q, e);
+	if (err) {
+		if (e->uses_mq)
+			blk_mq_sched_teardown(q);
 		elevator_put(e);
+	}
 	return err;
 }
 EXPORT_SYMBOL(elevator_init);
@@ -234,7 +250,9 @@ EXPORT_SYMBOL(elevator_init);
 void elevator_exit(struct elevator_queue *e)
 {
 	mutex_lock(&e->sysfs_lock);
-	if (e->type->ops.sq.elevator_exit_fn)
+	if (e->uses_mq && e->type->ops.mq.exit_sched)
+		e->type->ops.mq.exit_sched(e);
+	else if (!e->uses_mq && e->type->ops.sq.elevator_exit_fn)
 		e->type->ops.sq.elevator_exit_fn(e);
 	mutex_unlock(&e->sysfs_lock);
 
@@ -253,6 +271,7 @@ void elv_rqhash_del(struct request_queue *q, struct request *rq)
 	if (ELV_ON_HASH(rq))
 		__elv_rqhash_del(rq);
 }
+EXPORT_SYMBOL_GPL(elv_rqhash_del);
 
 void elv_rqhash_add(struct request_queue *q, struct request *rq)
 {
@@ -262,6 +281,7 @@ void elv_rqhash_add(struct request_queue *q, struct request *rq)
 	hash_add(e->hash, &rq->hash, rq_hash_key(rq));
 	rq->rq_flags |= RQF_HASHED;
 }
+EXPORT_SYMBOL_GPL(elv_rqhash_add);
 
 void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
 {
@@ -443,7 +463,9 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
 		return ELEVATOR_BACK_MERGE;
 	}
 
-	if (e->type->ops.sq.elevator_merge_fn)
+	if (e->uses_mq && e->type->ops.mq.request_merge)
+		return e->type->ops.mq.request_merge(q, req, bio);
+	else if (!e->uses_mq && e->type->ops.sq.elevator_merge_fn)
 		return e->type->ops.sq.elevator_merge_fn(q, req, bio);
 
 	return ELEVATOR_NO_MERGE;
@@ -456,8 +478,7 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
  *
  * Returns true if we merged, false otherwise
  */
-static bool elv_attempt_insert_merge(struct request_queue *q,
-				     struct request *rq)
+bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq)
 {
 	struct request *__rq;
 	bool ret;
@@ -495,7 +516,9 @@ void elv_merged_request(struct request_queue *q, struct request *rq, int type)
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->type->ops.sq.elevator_merged_fn)
+	if (e->uses_mq && e->type->ops.mq.request_merged)
+		e->type->ops.mq.request_merged(q, rq, type);
+	else if (!e->uses_mq && e->type->ops.sq.elevator_merged_fn)
 		e->type->ops.sq.elevator_merged_fn(q, rq, type);
 
 	if (type == ELEVATOR_BACK_MERGE)
@@ -508,10 +531,15 @@ void elv_merge_requests(struct request_queue *q, struct request *rq,
 			     struct request *next)
 {
 	struct elevator_queue *e = q->elevator;
-	const int next_sorted = next->rq_flags & RQF_SORTED;
-
-	if (next_sorted && e->type->ops.sq.elevator_merge_req_fn)
-		e->type->ops.sq.elevator_merge_req_fn(q, rq, next);
+	bool next_sorted = false;
+
+	if (e->uses_mq && e->type->ops.mq.requests_merged)
+		e->type->ops.mq.requests_merged(q, rq, next);
+	else if (e->type->ops.sq.elevator_merge_req_fn) {
+		next_sorted = next->rq_flags & RQF_SORTED;
+		if (next_sorted)
+			e->type->ops.sq.elevator_merge_req_fn(q, rq, next);
+	}
 
 	elv_rqhash_reposition(q, rq);
 
@@ -528,6 +556,9 @@ void elv_bio_merged(struct request_queue *q, struct request *rq,
 {
 	struct elevator_queue *e = q->elevator;
 
+	if (WARN_ON_ONCE(e->uses_mq))
+		return;
+
 	if (e->type->ops.sq.elevator_bio_merged_fn)
 		e->type->ops.sq.elevator_bio_merged_fn(q, rq, bio);
 }
@@ -682,8 +713,11 @@ struct request *elv_latter_request(struct request_queue *q, struct request *rq)
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->type->ops.sq.elevator_latter_req_fn)
+	if (e->uses_mq && e->type->ops.mq.next_request)
+		return e->type->ops.mq.next_request(q, rq);
+	else if (!e->uses_mq && e->type->ops.sq.elevator_latter_req_fn)
 		return e->type->ops.sq.elevator_latter_req_fn(q, rq);
+
 	return NULL;
 }
 
@@ -691,7 +725,9 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq)
 {
 	struct elevator_queue *e = q->elevator;
 
-	if (e->type->ops.sq.elevator_former_req_fn)
+	if (e->uses_mq && e->type->ops.mq.former_request)
+		return e->type->ops.mq.former_request(q, rq);
+	if (!e->uses_mq && e->type->ops.sq.elevator_former_req_fn)
 		return e->type->ops.sq.elevator_former_req_fn(q, rq);
 	return NULL;
 }
@@ -701,6 +737,9 @@ int elv_set_request(struct request_queue *q, struct request *rq,
 {
 	struct elevator_queue *e = q->elevator;
 
+	if (WARN_ON_ONCE(e->uses_mq))
+		return 0;
+
 	if (e->type->ops.sq.elevator_set_req_fn)
 		return e->type->ops.sq.elevator_set_req_fn(q, rq, bio, gfp_mask);
 	return 0;
@@ -710,6 +749,9 @@ void elv_put_request(struct request_queue *q, struct request *rq)
 {
 	struct elevator_queue *e = q->elevator;
 
+	if (WARN_ON_ONCE(e->uses_mq))
+		return;
+
 	if (e->type->ops.sq.elevator_put_req_fn)
 		e->type->ops.sq.elevator_put_req_fn(rq);
 }
@@ -718,6 +760,9 @@ int elv_may_queue(struct request_queue *q, unsigned int op)
 {
 	struct elevator_queue *e = q->elevator;
 
+	if (WARN_ON_ONCE(e->uses_mq))
+		return 0;
+
 	if (e->type->ops.sq.elevator_may_queue_fn)
 		return e->type->ops.sq.elevator_may_queue_fn(q, op);
 
@@ -728,6 +773,9 @@ void elv_completed_request(struct request_queue *q, struct request *rq)
 {
 	struct elevator_queue *e = q->elevator;
 
+	if (WARN_ON_ONCE(e->uses_mq))
+		return;
+
 	/*
 	 * request is released from the driver, io must be done
 	 */
@@ -803,7 +851,7 @@ int elv_register_queue(struct request_queue *q)
 		}
 		kobject_uevent(&e->kobj, KOBJ_ADD);
 		e->registered = 1;
-		if (e->type->ops.sq.elevator_registered_fn)
+		if (!e->uses_mq && e->type->ops.sq.elevator_registered_fn)
 			e->type->ops.sq.elevator_registered_fn(q);
 	}
 	return error;
@@ -891,9 +939,14 @@ EXPORT_SYMBOL_GPL(elv_unregister);
 static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 {
 	struct elevator_queue *old = q->elevator;
-	bool registered = old->registered;
+	bool old_registered = false;
 	int err;
 
+	if (q->mq_ops) {
+		blk_mq_freeze_queue(q);
+		blk_mq_quiesce_queue(q);
+	}
+
 	/*
 	 * Turn on BYPASS and drain all requests w/ elevator private data.
 	 * Block layer doesn't call into a quiesced elevator - all requests
@@ -901,42 +954,79 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 	 * using INSERT_BACK.  All requests have SOFTBARRIER set and no
 	 * merge happens either.
 	 */
-	blk_queue_bypass_start(q);
+	if (old) {
+		old_registered = old->registered;
 
-	/* unregister and clear all auxiliary data of the old elevator */
-	if (registered)
-		elv_unregister_queue(q);
+		if (old->uses_mq) {
+			err = blk_mq_sched_teardown(q);
+			if (err)
+				goto fail_init;
+		}
 
-	spin_lock_irq(q->queue_lock);
-	ioc_clear_queue(q);
-	spin_unlock_irq(q->queue_lock);
+		if (!q->mq_ops)
+			blk_queue_bypass_start(q);
+
+		/* unregister and clear all auxiliary data of the old elevator */
+		if (old_registered)
+			elv_unregister_queue(q);
+
+		spin_lock_irq(q->queue_lock);
+		ioc_clear_queue(q);
+		spin_unlock_irq(q->queue_lock);
+	}
 
 	/* allocate, init and register new elevator */
-	err = new_e->ops.sq.elevator_init_fn(q, new_e);
-	if (err)
-		goto fail_init;
+	if (new_e) {
+		if (new_e->uses_mq) {
+			err = blk_mq_sched_setup(q);
+			if (!err)
+				err = new_e->ops.mq.init_sched(q, new_e);
+		} else
+			err = new_e->ops.sq.elevator_init_fn(q, new_e);
+		if (err)
+			goto fail_init;
 
-	if (registered) {
 		err = elv_register_queue(q);
 		if (err)
 			goto fail_register;
-	}
+	} else
+		q->elevator = NULL;
 
 	/* done, kill the old one and finish */
-	elevator_exit(old);
-	blk_queue_bypass_end(q);
+	if (old) {
+		elevator_exit(old);
+		if (!q->mq_ops)
+			blk_queue_bypass_end(q);
+	}
+
+	if (q->mq_ops) {
+		blk_mq_unfreeze_queue(q);
+		blk_mq_start_stopped_hw_queues(q, true);
+	}
 
-	blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
+	if (new_e)
+		blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
+	else
+		blk_add_trace_msg(q, "elv switch: none");
 
 	return 0;
 
 fail_register:
+	if (q->mq_ops)
+		blk_mq_sched_teardown(q);
 	elevator_exit(q->elevator);
 fail_init:
 	/* switch failed, restore and re-register old elevator */
-	q->elevator = old;
-	elv_register_queue(q);
-	blk_queue_bypass_end(q);
+	if (old) {
+		q->elevator = old;
+		elv_register_queue(q);
+		if (!q->mq_ops)
+			blk_queue_bypass_end(q);
+	}
+	if (q->mq_ops) {
+		blk_mq_unfreeze_queue(q);
+		blk_mq_start_stopped_hw_queues(q, true);
+	}
 
 	return err;
 }
@@ -949,8 +1039,11 @@ static int __elevator_change(struct request_queue *q, const char *name)
 	char elevator_name[ELV_NAME_MAX];
 	struct elevator_type *e;
 
-	if (!q->elevator)
-		return -ENXIO;
+	/*
+	 * Special case for mq, turn off scheduling
+	 */
+	if (q->mq_ops && !strncmp(name, "none", 4))
+		return elevator_switch(q, NULL);
 
 	strlcpy(elevator_name, name, sizeof(elevator_name));
 	e = elevator_get(strstrip(elevator_name), true);
@@ -959,11 +1052,21 @@ static int __elevator_change(struct request_queue *q, const char *name)
 		return -EINVAL;
 	}
 
-	if (!strcmp(elevator_name, q->elevator->type->elevator_name)) {
+	if (q->elevator &&
+	    !strcmp(elevator_name, q->elevator->type->elevator_name)) {
 		elevator_put(e);
 		return 0;
 	}
 
+	if (!e->uses_mq && q->mq_ops) {
+		elevator_put(e);
+		return -EINVAL;
+	}
+	if (e->uses_mq && !q->mq_ops) {
+		elevator_put(e);
+		return -EINVAL;
+	}
+
 	return elevator_switch(q, e);
 }
 
@@ -985,7 +1088,7 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name,
 {
 	int ret;
 
-	if (!q->elevator)
+	if (!(q->mq_ops || q->request_fn))
 		return count;
 
 	ret = __elevator_change(q, name);
@@ -999,24 +1102,34 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name,
 ssize_t elv_iosched_show(struct request_queue *q, char *name)
 {
 	struct elevator_queue *e = q->elevator;
-	struct elevator_type *elv;
+	struct elevator_type *elv = NULL;
 	struct elevator_type *__e;
 	int len = 0;
 
-	if (!q->elevator || !blk_queue_stackable(q))
+	if (!blk_queue_stackable(q))
 		return sprintf(name, "none\n");
 
-	elv = e->type;
+	if (!q->elevator)
+		len += sprintf(name+len, "[none] ");
+	else
+		elv = e->type;
 
 	spin_lock(&elv_list_lock);
 	list_for_each_entry(__e, &elv_list, list) {
-		if (!strcmp(elv->elevator_name, __e->elevator_name))
+		if (elv && !strcmp(elv->elevator_name, __e->elevator_name)) {
 			len += sprintf(name+len, "[%s] ", elv->elevator_name);
-		else
+			continue;
+		}
+		if (__e->uses_mq && q->mq_ops)
+			len += sprintf(name+len, "%s ", __e->elevator_name);
+		else if (!__e->uses_mq && !q->mq_ops)
 			len += sprintf(name+len, "%s ", __e->elevator_name);
 	}
 	spin_unlock(&elv_list_lock);
 
+	if (q->mq_ops && q->elevator)
+		len += sprintf(name+len, "none");
+
 	len += sprintf(len+name, "\n");
 	return len;
 }
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 2686f9e7302a..63569eb46d15 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -22,6 +22,7 @@ struct blk_mq_hw_ctx {
 
 	unsigned long		flags;		/* BLK_MQ_F_* flags */
 
+	void			*sched_data;
 	struct request_queue	*queue;
 	struct blk_flush_queue	*fq;
 
@@ -35,6 +36,7 @@ struct blk_mq_hw_ctx {
 	atomic_t		wait_index;
 
 	struct blk_mq_tags	*tags;
+	struct blk_mq_tags	*sched_tags;
 
 	struct srcu_struct	queue_rq_srcu;
 
@@ -156,6 +158,7 @@ enum {
 
 	BLK_MQ_S_STOPPED	= 0,
 	BLK_MQ_S_TAG_ACTIVE	= 1,
+	BLK_MQ_S_SCHED_RESTART	= 2,
 
 	BLK_MQ_MAX_DEPTH	= 10240,
 
@@ -179,13 +182,13 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set);
 
 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
 
-void blk_mq_insert_request(struct request *, bool, bool, bool);
 void blk_mq_free_request(struct request *rq);
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
 
 enum {
 	BLK_MQ_REQ_NOWAIT	= (1 << 0), /* return when out of requests */
 	BLK_MQ_REQ_RESERVED	= (1 << 1), /* allocate from reserved pool */
+	BLK_MQ_REQ_INTERNAL	= (1 << 2), /* allocate internal/sched tag */
 };
 
 struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e19fc8e01fb2..1d79837de994 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -154,6 +154,7 @@ struct request {
 
 	/* the following two fields are internal, NEVER access directly */
 	unsigned int __data_len;	/* total data len */
+	int tag;
 	sector_t __sector;		/* sector cursor */
 
 	struct bio *bio;
@@ -220,9 +221,10 @@ struct request {
 
 	unsigned short ioprio;
 
+	int internal_tag;
+
 	void *special;		/* opaque pointer available for LLD use */
 
-	int tag;
 	int errors;
 
 	/*
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 2a9e966eed03..ecb96fd67c6d 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -77,6 +77,34 @@ struct elevator_ops
 	elevator_registered_fn *elevator_registered_fn;
 };
 
+struct blk_mq_alloc_data;
+struct blk_mq_hw_ctx;
+
+struct elevator_mq_ops {
+	int (*init_sched)(struct request_queue *, struct elevator_type *);
+	void (*exit_sched)(struct elevator_queue *);
+
+	bool (*allow_merge)(struct request_queue *, struct request *, struct bio *);
+	bool (*bio_merge)(struct blk_mq_hw_ctx *, struct bio *);
+	int (*request_merge)(struct request_queue *q, struct request **, struct bio *);
+	void (*request_merged)(struct request_queue *, struct request *, int);
+	void (*requests_merged)(struct request_queue *, struct request *, struct request *);
+	struct request *(*get_request)(struct request_queue *, unsigned int, struct blk_mq_alloc_data *);
+	void (*put_request)(struct request *);
+	void (*insert_requests)(struct blk_mq_hw_ctx *, struct list_head *, bool);
+	void (*dispatch_requests)(struct blk_mq_hw_ctx *, struct list_head *);
+	bool (*has_work)(struct blk_mq_hw_ctx *);
+	void (*completed_request)(struct blk_mq_hw_ctx *, struct request *);
+	void (*started_request)(struct request *);
+	void (*requeue_request)(struct request *);
+	struct request *(*former_request)(struct request_queue *, struct request *);
+	struct request *(*next_request)(struct request_queue *, struct request *);
+	int (*get_rq_priv)(struct request_queue *, struct request *);
+	void (*put_rq_priv)(struct request_queue *, struct request *);
+	void (*init_icq)(struct io_cq *);
+	void (*exit_icq)(struct io_cq *);
+};
+
 #define ELV_NAME_MAX	(16)
 
 struct elv_fs_entry {
@@ -96,12 +124,14 @@ struct elevator_type
 	/* fields provided by elevator implementation */
 	union {
 		struct elevator_ops sq;
+		struct elevator_mq_ops mq;
 	} ops;
 	size_t icq_size;	/* see iocontext.h */
 	size_t icq_align;	/* ditto */
 	struct elv_fs_entry *elevator_attrs;
 	char elevator_name[ELV_NAME_MAX];
 	struct module *elevator_owner;
+	bool uses_mq;
 
 	/* managed by elevator core */
 	char icq_cache_name[ELV_NAME_MAX + 5];	/* elvname + "_io_cq" */
@@ -125,6 +155,7 @@ struct elevator_queue
 	struct kobject kobj;
 	struct mutex sysfs_lock;
 	unsigned int registered:1;
+	unsigned int uses_mq:1;
 	DECLARE_HASHTABLE(hash, ELV_HASH_BITS);
 };
 
@@ -141,6 +172,7 @@ extern void elv_merge_requests(struct request_queue *, struct request *,
 extern void elv_merged_request(struct request_queue *, struct request *, int);
 extern void elv_bio_merged(struct request_queue *q, struct request *,
 				struct bio *);
+extern bool elv_attempt_insert_merge(struct request_queue *, struct request *);
 extern void elv_requeue_request(struct request_queue *, struct request *);
 extern struct request *elv_former_request(struct request_queue *, struct request *);
 extern struct request *elv_latter_request(struct request_queue *, struct request *);
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH 09/10] mq-deadline: add blk-mq adaptation of the deadline IO scheduler
  2017-01-11 21:39 [PATCHSET v6] blk-mq scheduling framework Jens Axboe
                   ` (7 preceding siblings ...)
  2017-01-11 21:40 ` [PATCH 08/10] blk-mq-sched: add framework for MQ capable IO schedulers Jens Axboe
@ 2017-01-11 21:40 ` Jens Axboe
  2017-01-12 21:53   ` Bart Van Assche
  2017-01-11 21:40 ` [PATCH 10/10] blk-mq-sched: allow setting of default " Jens Axboe
                   ` (4 subsequent siblings)
  13 siblings, 1 reply; 62+ messages in thread
From: Jens Axboe @ 2017-01-11 21:40 UTC (permalink / raw)
  To: linux-kernel, linux-block; +Cc: osandov, bart.vanassche, Jens Axboe

This is basically identical to deadline-iosched, except it registers
as an MQ-capable scheduler. This is still a single-queue design.
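
A rough usage sketch (nvme0n1 is purely an illustrative device name,
and the listing depends on what is built in): the scheduler is picked
per-queue through the usual elevator sysfs interface, and the tunables
keep the legacy units, e.g. read_expire in milliseconds:

  # cat /sys/block/nvme0n1/queue/scheduler
  [none] mq-deadline
  # echo mq-deadline > /sys/block/nvme0n1/queue/scheduler
  # echo 100 > /sys/block/nvme0n1/queue/iosched/read_expire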

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/Kconfig.iosched |   6 +
 block/Makefile        |   1 +
 block/mq-deadline.c   | 569 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 576 insertions(+)
 create mode 100644 block/mq-deadline.c

diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 421bef9c4c48..490ef2850fae 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -32,6 +32,12 @@ config IOSCHED_CFQ
 
 	  This is the default I/O scheduler.
 
+config MQ_IOSCHED_DEADLINE
+	tristate "MQ deadline I/O scheduler"
+	default y
+	---help---
+	  MQ version of the deadline IO scheduler.
+
 config CFQ_GROUP_IOSCHED
 	bool "CFQ Group Scheduling support"
 	depends on IOSCHED_CFQ && BLK_CGROUP
diff --git a/block/Makefile b/block/Makefile
index 2eee9e1bb6db..3ee0abd7205a 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -18,6 +18,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING)	+= blk-throttle.o
 obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
 obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o
 obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
+obj-$(CONFIG_MQ_IOSCHED_DEADLINE)	+= mq-deadline.o
 
 obj-$(CONFIG_BLOCK_COMPAT)	+= compat_ioctl.o
 obj-$(CONFIG_BLK_CMDLINE_PARSER)	+= cmdline-parser.o
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
new file mode 100644
index 000000000000..693f281607df
--- /dev/null
+++ b/block/mq-deadline.c
@@ -0,0 +1,569 @@
+/*
+ *  MQ Deadline i/o scheduler - adaptation of the legacy deadline scheduler,
+ *  for the blk-mq scheduling framework
+ *
+ *  Copyright (C) 2016 Jens Axboe <axboe@kernel.dk>
+ */
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
+#include <linux/elevator.h>
+#include <linux/bio.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/compiler.h>
+#include <linux/rbtree.h>
+#include <linux/sbitmap.h>
+
+#include "blk.h"
+#include "blk-mq.h"
+#include "blk-mq-tag.h"
+#include "blk-mq-sched.h"
+
+/*
+ * See Documentation/block/deadline-iosched.txt
+ */
+static const int read_expire = HZ / 2;  /* max time before a read is submitted. */
+static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
+static const int writes_starved = 2;    /* max times reads can starve a write */
+static const int fifo_batch = 16;       /* # of sequential requests treated as one
+				     by the above parameters. For throughput. */
+
+struct deadline_data {
+	/*
+	 * run time data
+	 */
+
+	/*
+	 * requests (deadline_rq s) are present on both sort_list and fifo_list
+	 */
+	struct rb_root sort_list[2];
+	struct list_head fifo_list[2];
+
+	/*
+	 * next in sort order. read, write or both are NULL
+	 */
+	struct request *next_rq[2];
+	unsigned int batching;		/* number of sequential requests made */
+	unsigned int starved;		/* times reads have starved writes */
+
+	/*
+	 * settings that change how the i/o scheduler behaves
+	 */
+	int fifo_expire[2];
+	int fifo_batch;
+	int writes_starved;
+	int front_merges;
+
+	spinlock_t lock;
+	struct list_head dispatch;
+};
+
+static inline struct rb_root *
+deadline_rb_root(struct deadline_data *dd, struct request *rq)
+{
+	return &dd->sort_list[rq_data_dir(rq)];
+}
+
+/*
+ * get the request after `rq' in sector-sorted order
+ */
+static inline struct request *
+deadline_latter_request(struct request *rq)
+{
+	struct rb_node *node = rb_next(&rq->rb_node);
+
+	if (node)
+		return rb_entry_rq(node);
+
+	return NULL;
+}
+
+static void
+deadline_add_rq_rb(struct deadline_data *dd, struct request *rq)
+{
+	struct rb_root *root = deadline_rb_root(dd, rq);
+
+	elv_rb_add(root, rq);
+}
+
+static inline void
+deadline_del_rq_rb(struct deadline_data *dd, struct request *rq)
+{
+	const int data_dir = rq_data_dir(rq);
+
+	if (dd->next_rq[data_dir] == rq)
+		dd->next_rq[data_dir] = deadline_latter_request(rq);
+
+	elv_rb_del(deadline_rb_root(dd, rq), rq);
+}
+
+/*
+ * remove rq from rbtree and fifo.
+ */
+static void deadline_remove_request(struct request_queue *q, struct request *rq)
+{
+	struct deadline_data *dd = q->elevator->elevator_data;
+
+	list_del_init(&rq->queuelist);
+
+	/*
+	 * We might not be on the rbtree, if we are doing an insert merge
+	 */
+	if (!RB_EMPTY_NODE(&rq->rb_node))
+		deadline_del_rq_rb(dd, rq);
+
+	elv_rqhash_del(q, rq);
+	if (q->last_merge == rq)
+		q->last_merge = NULL;
+}
+
+static void dd_request_merged(struct request_queue *q, struct request *req,
+			      int type)
+{
+	struct deadline_data *dd = q->elevator->elevator_data;
+
+	/*
+	 * if the merge was a front merge, we need to reposition request
+	 */
+	if (type == ELEVATOR_FRONT_MERGE) {
+		elv_rb_del(deadline_rb_root(dd, req), req);
+		deadline_add_rq_rb(dd, req);
+	}
+}
+
+static void dd_merged_requests(struct request_queue *q, struct request *req,
+			       struct request *next)
+{
+	/*
+	 * if next expires before rq, assign its expire time to rq
+	 * and move into next position (next will be deleted) in fifo
+	 */
+	if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
+		if (time_before((unsigned long)next->fifo_time,
+				(unsigned long)req->fifo_time)) {
+			list_move(&req->queuelist, &next->queuelist);
+			req->fifo_time = next->fifo_time;
+		}
+	}
+
+	/*
+	 * kill knowledge of next, this one is a goner
+	 */
+	deadline_remove_request(q, next);
+}
+
+/*
+ * move an entry to dispatch queue
+ */
+static void
+deadline_move_request(struct deadline_data *dd, struct request *rq)
+{
+	const int data_dir = rq_data_dir(rq);
+
+	dd->next_rq[READ] = NULL;
+	dd->next_rq[WRITE] = NULL;
+	dd->next_rq[data_dir] = deadline_latter_request(rq);
+
+	/*
+	 * take it off the sort and fifo list
+	 */
+	deadline_remove_request(rq->q, rq);
+}
+
+/*
+ * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
+ * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
+ */
+static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
+{
+	struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next);
+
+	/*
+	 * rq is expired!
+	 */
+	if (time_after_eq(jiffies, (unsigned long)rq->fifo_time))
+		return 1;
+
+	return 0;
+}
+
+/*
+ * deadline_dispatch_requests selects the best request according to
+ * read/write expire, fifo_batch, etc
+ */
+static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
+{
+	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
+	struct request *rq;
+	bool reads, writes;
+	int data_dir;
+
+	spin_lock(&dd->lock);
+
+	if (!list_empty(&dd->dispatch)) {
+		rq = list_first_entry(&dd->dispatch, struct request, queuelist);
+		list_del_init(&rq->queuelist);
+		goto done;
+	}
+
+	reads = !list_empty(&dd->fifo_list[READ]);
+	writes = !list_empty(&dd->fifo_list[WRITE]);
+
+	/*
+	 * batches are currently reads XOR writes
+	 */
+	if (dd->next_rq[WRITE])
+		rq = dd->next_rq[WRITE];
+	else
+		rq = dd->next_rq[READ];
+
+	if (rq && dd->batching < dd->fifo_batch)
+		/* we have a next request and are still entitled to batch */
+		goto dispatch_request;
+
+	/*
+	 * at this point we are not running a batch. select the appropriate
+	 * data direction (read / write)
+	 */
+
+	if (reads) {
+		BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
+
+		if (writes && (dd->starved++ >= dd->writes_starved))
+			goto dispatch_writes;
+
+		data_dir = READ;
+
+		goto dispatch_find_request;
+	}
+
+	/*
+	 * there are either no reads or writes have been starved
+	 */
+
+	if (writes) {
+dispatch_writes:
+		BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE]));
+
+		dd->starved = 0;
+
+		data_dir = WRITE;
+
+		goto dispatch_find_request;
+	}
+
+	spin_unlock(&dd->lock);
+	return NULL;
+
+dispatch_find_request:
+	/*
+	 * we are not running a batch, find best request for selected data_dir
+	 */
+	if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) {
+		/*
+		 * A deadline has expired, the last request was in the other
+		 * direction, or we have run out of higher-sectored requests.
+		 * Start again from the request with the earliest expiry time.
+		 */
+		rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
+	} else {
+		/*
+		 * The last req was the same dir and we have a next request in
+		 * sort order. No expired requests so continue on from here.
+		 */
+		rq = dd->next_rq[data_dir];
+	}
+
+	dd->batching = 0;
+
+dispatch_request:
+	/*
+	 * rq is the selected appropriate request.
+	 */
+	dd->batching++;
+	deadline_move_request(dd, rq);
+done:
+	rq->rq_flags |= RQF_STARTED;
+	spin_unlock(&dd->lock);
+	return rq;
+}
+
+static void dd_dispatch_requests(struct blk_mq_hw_ctx *hctx,
+				 struct list_head *rq_list)
+{
+	do {
+		struct request *rq;
+
+		rq = __dd_dispatch_request(hctx);
+		if (!rq)
+			break;
+
+		list_add_tail(&rq->queuelist, rq_list);
+	} while (1);
+}
+
+static void dd_exit_queue(struct elevator_queue *e)
+{
+	struct deadline_data *dd = e->elevator_data;
+
+	BUG_ON(!list_empty(&dd->fifo_list[READ]));
+	BUG_ON(!list_empty(&dd->fifo_list[WRITE]));
+
+	kfree(dd);
+}
+
+/*
+ * initialize elevator private data (deadline_data).
+ */
+static int dd_init_queue(struct request_queue *q, struct elevator_type *e)
+{
+	struct deadline_data *dd;
+	struct elevator_queue *eq;
+
+	eq = elevator_alloc(q, e);
+	if (!eq)
+		return -ENOMEM;
+
+	dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
+	if (!dd) {
+		kobject_put(&eq->kobj);
+		return -ENOMEM;
+	}
+	eq->elevator_data = dd;
+
+	INIT_LIST_HEAD(&dd->fifo_list[READ]);
+	INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
+	dd->sort_list[READ] = RB_ROOT;
+	dd->sort_list[WRITE] = RB_ROOT;
+	dd->fifo_expire[READ] = read_expire;
+	dd->fifo_expire[WRITE] = write_expire;
+	dd->writes_starved = writes_starved;
+	dd->front_merges = 1;
+	dd->fifo_batch = fifo_batch;
+	spin_lock_init(&dd->lock);
+	INIT_LIST_HEAD(&dd->dispatch);
+
+	q->elevator = eq;
+	return 0;
+}
+
+static int dd_request_merge(struct request_queue *q, struct request **rq,
+			    struct bio *bio)
+{
+	struct deadline_data *dd = q->elevator->elevator_data;
+	sector_t sector = bio_end_sector(bio);
+	struct request *__rq;
+
+	if (!dd->front_merges)
+		return ELEVATOR_NO_MERGE;
+
+	__rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector);
+	if (__rq) {
+		BUG_ON(sector != blk_rq_pos(__rq));
+
+		if (elv_bio_merge_ok(__rq, bio)) {
+			*rq = __rq;
+			return ELEVATOR_FRONT_MERGE;
+		}
+	}
+
+	return ELEVATOR_NO_MERGE;
+}
+
+static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
+{
+	struct request_queue *q = hctx->queue;
+	struct deadline_data *dd = q->elevator->elevator_data;
+	int ret;
+
+	spin_lock(&dd->lock);
+	ret = blk_mq_sched_try_merge(q, bio);
+	spin_unlock(&dd->lock);
+
+	return ret;
+}
+
+/*
+ * add rq to rbtree and fifo
+ */
+static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
+			      bool at_head)
+{
+	struct request_queue *q = hctx->queue;
+	struct deadline_data *dd = q->elevator->elevator_data;
+	const int data_dir = rq_data_dir(rq);
+
+	if (blk_mq_sched_try_insert_merge(q, rq))
+		return;
+
+	blk_mq_sched_request_inserted(rq);
+
+	/*
+	 * If we already have a real request tag, send directly to
+	 * the dispatch list.
+	 */
+	if (rq->tag != -1) {
+		spin_lock(&hctx->lock);
+		list_add_tail(&rq->queuelist, &hctx->dispatch);
+		spin_unlock(&hctx->lock);
+		return;
+	}
+
+	if (at_head || rq->cmd_type != REQ_TYPE_FS) {
+		if (at_head)
+			list_add(&rq->queuelist, &dd->dispatch);
+		else
+			list_add_tail(&rq->queuelist, &dd->dispatch);
+	} else {
+		deadline_add_rq_rb(dd, rq);
+
+		if (rq_mergeable(rq)) {
+			elv_rqhash_add(q, rq);
+			if (!q->last_merge)
+				q->last_merge = rq;
+		}
+
+		/*
+		 * set expire time and add to fifo list
+		 */
+		rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
+		list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
+	}
+}
+
+static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
+			       struct list_head *list, bool at_head)
+{
+	struct request_queue *q = hctx->queue;
+	struct deadline_data *dd = q->elevator->elevator_data;
+
+	spin_lock(&dd->lock);
+	while (!list_empty(list)) {
+		struct request *rq;
+
+		rq = list_first_entry(list, struct request, queuelist);
+		list_del_init(&rq->queuelist);
+		dd_insert_request(hctx, rq, at_head);
+	}
+	spin_unlock(&dd->lock);
+}
+
+static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
+{
+	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
+
+	return !list_empty_careful(&dd->dispatch) ||
+		!list_empty_careful(&dd->fifo_list[0]) ||
+		!list_empty_careful(&dd->fifo_list[1]);
+}
+
+/*
+ * sysfs parts below
+ */
+static ssize_t
+deadline_var_show(int var, char *page)
+{
+	return sprintf(page, "%d\n", var);
+}
+
+static ssize_t
+deadline_var_store(int *var, const char *page, size_t count)
+{
+	char *p = (char *) page;
+
+	*var = simple_strtol(p, &p, 10);
+	return count;
+}
+
+#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
+static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
+{									\
+	struct deadline_data *dd = e->elevator_data;			\
+	int __data = __VAR;						\
+	if (__CONV)							\
+		__data = jiffies_to_msecs(__data);			\
+	return deadline_var_show(__data, (page));			\
+}
+SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1);
+SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1);
+SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0);
+SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0);
+SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0);
+#undef SHOW_FUNCTION
+
+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
+static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)	\
+{									\
+	struct deadline_data *dd = e->elevator_data;			\
+	int __data;							\
+	int ret = deadline_var_store(&__data, (page), count);		\
+	if (__data < (MIN))						\
+		__data = (MIN);						\
+	else if (__data > (MAX))					\
+		__data = (MAX);						\
+	if (__CONV)							\
+		*(__PTR) = msecs_to_jiffies(__data);			\
+	else								\
+		*(__PTR) = __data;					\
+	return ret;							\
+}
+STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1);
+STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1);
+STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0);
+STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0);
+STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0);
+#undef STORE_FUNCTION
+
+#define DD_ATTR(name) \
+	__ATTR(name, S_IRUGO|S_IWUSR, deadline_##name##_show, \
+				      deadline_##name##_store)
+
+static struct elv_fs_entry deadline_attrs[] = {
+	DD_ATTR(read_expire),
+	DD_ATTR(write_expire),
+	DD_ATTR(writes_starved),
+	DD_ATTR(front_merges),
+	DD_ATTR(fifo_batch),
+	__ATTR_NULL
+};
+
+static struct elevator_type mq_deadline = {
+	.ops.mq = {
+		.insert_requests	= dd_insert_requests,
+		.dispatch_requests	= dd_dispatch_requests,
+		.next_request		= elv_rb_latter_request,
+		.former_request		= elv_rb_former_request,
+		.bio_merge		= dd_bio_merge,
+		.request_merge		= dd_request_merge,
+		.requests_merged	= dd_merged_requests,
+		.request_merged		= dd_request_merged,
+		.has_work		= dd_has_work,
+		.init_sched		= dd_init_queue,
+		.exit_sched		= dd_exit_queue,
+	},
+
+	.uses_mq	= true,
+	.elevator_attrs = deadline_attrs,
+	.elevator_name = "mq-deadline",
+	.elevator_owner = THIS_MODULE,
+};
+
+static int __init deadline_init(void)
+{
+	return elv_register(&mq_deadline);
+}
+
+static void __exit deadline_exit(void)
+{
+	elv_unregister(&mq_deadline);
+}
+
+module_init(deadline_init);
+module_exit(deadline_exit);
+
+MODULE_AUTHOR("Jens Axboe");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("MQ deadline IO scheduler");
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 62+ messages in thread

* [PATCH 10/10] blk-mq-sched: allow setting of default IO scheduler
  2017-01-11 21:39 [PATCHSET v6] blk-mq scheduling framework Jens Axboe
                   ` (8 preceding siblings ...)
  2017-01-11 21:40 ` [PATCH 09/10] mq-deadline: add blk-mq adaptation of the deadline IO scheduler Jens Axboe
@ 2017-01-11 21:40 ` Jens Axboe
  2017-01-12 21:54   ` Bart Van Assche
  2017-01-12 21:16 ` [PATCHSET v6] blk-mq scheduling framework Bart Van Assche
                   ` (3 subsequent siblings)
  13 siblings, 1 reply; 62+ messages in thread
From: Jens Axboe @ 2017-01-11 21:40 UTC (permalink / raw)
  To: linux-kernel, linux-block; +Cc: osandov, bart.vanassche, Jens Axboe

Add Kconfig entries to manage what devices get assigned an MQ
scheduler, and add a blk-mq flag for drivers to opt out of scheduling.
The latter is useful for admin type queues that still allocate a blk-mq
queue and tag set, but aren't used for normal IO.
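
For illustration, a driver opts out by setting the flag in its tag set
before calling blk_mq_alloc_tag_set(). The sketch below uses a made-up
"foo" driver (foo_dev, foo_admin_mq_ops and the queue depth are
illustrative only); the real interface is BLK_MQ_F_NO_SCHED in ->flags,
which keeps blk_mq_sched_init() from attaching a default scheduler to
that queue. The nvme admin queue hunk further down does the same thing
in-tree:

	#include <linux/blk-mq.h>

	/* Sketch only: an admin-style queue opting out of I/O scheduling. */
	static int foo_alloc_admin_tags(struct foo_dev *dev)
	{
		struct blk_mq_tag_set *set = &dev->admin_tagset;

		set->ops = &foo_admin_mq_ops;	/* assumed driver ops */
		set->nr_hw_queues = 1;
		set->queue_depth = 30;
		set->numa_node = NUMA_NO_NODE;
		set->flags = BLK_MQ_F_NO_SCHED;	/* no elevator for this queue */
		set->driver_data = dev;

		return blk_mq_alloc_tag_set(set);
	}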

Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/Kconfig.iosched   | 56 +++++++++++++++++++++++++++++++++++++++++++------
 block/blk-mq-sched.c    | 20 ++++++++++++++++++
 block/blk-mq-sched.h    |  2 ++
 block/blk-mq.c          |  8 +++++++
 block/elevator.c        |  8 ++++++-
 drivers/nvme/host/pci.c |  1 +
 include/linux/blk-mq.h  |  1 +
 7 files changed, 89 insertions(+), 7 deletions(-)

diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 490ef2850fae..0715ce93daef 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -32,12 +32,6 @@ config IOSCHED_CFQ
 
 	  This is the default I/O scheduler.
 
-config MQ_IOSCHED_DEADLINE
-	tristate "MQ deadline I/O scheduler"
-	default y
-	---help---
-	  MQ version of the deadline IO scheduler.
-
 config CFQ_GROUP_IOSCHED
 	bool "CFQ Group Scheduling support"
 	depends on IOSCHED_CFQ && BLK_CGROUP
@@ -69,6 +63,56 @@ config DEFAULT_IOSCHED
 	default "cfq" if DEFAULT_CFQ
 	default "noop" if DEFAULT_NOOP
 
+config MQ_IOSCHED_DEADLINE
+	tristate "MQ deadline I/O scheduler"
+	default y
+	---help---
+	  MQ version of the deadline IO scheduler.
+
+config MQ_IOSCHED_NONE
+	bool
+	default y
+
+choice
+	prompt "Default single-queue blk-mq I/O scheduler"
+	default DEFAULT_SQ_NONE
+	help
+	  Select the I/O scheduler which will be used by default for blk-mq
+	  managed block devices with a single queue.
+
+	config DEFAULT_SQ_DEADLINE
+		bool "MQ Deadline" if MQ_IOSCHED_DEADLINE=y
+
+	config DEFAULT_SQ_NONE
+		bool "None"
+
+endchoice
+
+config DEFAULT_SQ_IOSCHED
+	string
+	default "mq-deadline" if DEFAULT_SQ_DEADLINE
+	default "none" if DEFAULT_SQ_NONE
+
+choice
+	prompt "Default multi-queue blk-mq I/O scheduler"
+	default DEFAULT_MQ_NONE
+	help
+	  Select the I/O scheduler which will be used by default for blk-mq
+	  managed block devices with multiple queues.
+
+	config DEFAULT_MQ_DEADLINE
+		bool "MQ Deadline" if MQ_IOSCHED_DEADLINE=y
+
+	config DEFAULT_MQ_NONE
+		bool "None"
+
+endchoice
+
+config DEFAULT_MQ_IOSCHED
+	string
+	default "mq-deadline" if DEFAULT_MQ_DEADLINE
+	default "none" if DEFAULT_MQ_NONE
+
 endmenu
 
 endif
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 44cf30eb1589..26e9e20f67ce 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -332,3 +332,23 @@ int blk_mq_sched_teardown(struct request_queue *q)
 
 	return 0;
 }
+
+int blk_mq_sched_init(struct request_queue *q)
+{
+	int ret;
+
+#if defined(CONFIG_DEFAULT_SQ_NONE)
+	if (q->nr_hw_queues == 1)
+		return 0;
+#endif
+#if defined(CONFIG_DEFAULT_MQ_NONE)
+	if (q->nr_hw_queues > 1)
+		return 0;
+#endif
+
+	mutex_lock(&q->sysfs_lock);
+	ret = elevator_init(q, NULL);
+	mutex_unlock(&q->sysfs_lock);
+
+	return ret;
+}
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 68d6a202b827..77859eae19c9 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -25,6 +25,8 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
 int blk_mq_sched_setup(struct request_queue *q);
 int blk_mq_sched_teardown(struct request_queue *q);
 
+int blk_mq_sched_init(struct request_queue *q);
+
 static inline bool
 blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
 {
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3180b5fac88c..0dcd593e4ddd 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2227,6 +2227,14 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 	mutex_unlock(&all_q_mutex);
 	put_online_cpus();
 
+	if (!(set->flags & BLK_MQ_F_NO_SCHED)) {
+		int ret;
+
+		ret = blk_mq_sched_init(q);
+		if (ret)
+			return ERR_PTR(ret);
+	}
+
 	return q;
 
 err_hctxs:
diff --git a/block/elevator.c b/block/elevator.c
index 79e74da26343..b3ea721e51b4 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -219,7 +219,13 @@ int elevator_init(struct request_queue *q, char *name)
 	}
 
 	if (!e) {
-		e = elevator_get(CONFIG_DEFAULT_IOSCHED, false);
+		if (q->mq_ops && q->nr_hw_queues == 1)
+			e = elevator_get(CONFIG_DEFAULT_SQ_IOSCHED, false);
+		else if (q->mq_ops)
+			e = elevator_get(CONFIG_DEFAULT_MQ_IOSCHED, false);
+		else
+			e = elevator_get(CONFIG_DEFAULT_IOSCHED, false);
+
 		if (!e) {
 			printk(KERN_ERR
 				"Default I/O scheduler not found. " \
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 19beeb7b2ac2..e1b4e603b1cf 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1181,6 +1181,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev)
 		dev->admin_tagset.timeout = ADMIN_TIMEOUT;
 		dev->admin_tagset.numa_node = dev_to_node(dev->dev);
 		dev->admin_tagset.cmd_size = nvme_cmd_size(dev);
+		dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED;
 		dev->admin_tagset.driver_data = dev;
 
 		if (blk_mq_alloc_tag_set(&dev->admin_tagset))
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 63569eb46d15..8e4df3d6c8cd 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -153,6 +153,7 @@ enum {
 	BLK_MQ_F_SG_MERGE	= 1 << 2,
 	BLK_MQ_F_DEFER_ISSUE	= 1 << 4,
 	BLK_MQ_F_BLOCKING	= 1 << 5,
+	BLK_MQ_F_NO_SCHED	= 1 << 6,
 	BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
 	BLK_MQ_F_ALLOC_POLICY_BITS = 1,
 
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 62+ messages in thread

* Re: [PATCH 04/10] blk-mq: un-export blk_mq_free_hctx_request()
  2017-01-11 21:39 ` [PATCH 04/10] blk-mq: un-export blk_mq_free_hctx_request() Jens Axboe
@ 2017-01-12 10:13   ` Johannes Thumshirn
  2017-01-12 21:18   ` Bart Van Assche
  2017-01-13  8:16   ` Christoph Hellwig
  2 siblings, 0 replies; 62+ messages in thread
From: Johannes Thumshirn @ 2017-01-12 10:13 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-kernel, linux-block, osandov, bart.vanassche

On Wed, Jan 11, 2017 at 02:39:57PM -0700, Jens Axboe wrote:
> It's only used in blk-mq, kill it from the main exported header
> and kill the symbol export as well.
> 
> Signed-off-by: Jens Axboe <axboe@fb.com>
> ---

Looks good,
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>

-- 
Johannes Thumshirn                                          Storage
jthumshirn@suse.de                                +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 03/10] block: move rq_ioc() to blk.h
  2017-01-11 21:39 ` [PATCH 03/10] block: move rq_ioc() to blk.h Jens Axboe
@ 2017-01-12 10:14   ` Johannes Thumshirn
  2017-01-12 21:18   ` Bart Van Assche
  2017-01-13  8:33   ` Christoph Hellwig
  2 siblings, 0 replies; 62+ messages in thread
From: Johannes Thumshirn @ 2017-01-12 10:14 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-kernel, linux-block, osandov, bart.vanassche

On Wed, Jan 11, 2017 at 02:39:56PM -0700, Jens Axboe wrote:
> We want to use it outside of blk-core.c.
> 
> Signed-off-by: Jens Axboe <axboe@fb.com>
> ---
Looks good,
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>

-- 
Johannes Thumshirn                                          Storage
jthumshirn@suse.de                                +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 02/10] blk-mq: make mq_ops a const pointer
  2017-01-11 21:39 ` [PATCH 02/10] blk-mq: make mq_ops a const pointer Jens Axboe
@ 2017-01-12 10:14   ` Johannes Thumshirn
  2017-01-13  8:16   ` Christoph Hellwig
  1 sibling, 0 replies; 62+ messages in thread
From: Johannes Thumshirn @ 2017-01-12 10:14 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-kernel, linux-block, osandov, bart.vanassche

On Wed, Jan 11, 2017 at 02:39:55PM -0700, Jens Axboe wrote:
> We never change it, make that clear.
> 
> Signed-off-by: Jens Axboe <axboe@fb.com>
> Reviewed-by: Bart Van Assche <bart.vanassche@sandisk.com>
> ---

Looks good,
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>

-- 
Johannes Thumshirn                                          Storage
jthumshirn@suse.de                                +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 01/10] block: move existing elevator ops to union
  2017-01-11 21:39 ` [PATCH 01/10] block: move existing elevator ops to union Jens Axboe
@ 2017-01-12 10:15   ` Johannes Thumshirn
  2017-01-12 21:17   ` Bart Van Assche
  2017-01-13  8:34   ` Christoph Hellwig
  2 siblings, 0 replies; 62+ messages in thread
From: Johannes Thumshirn @ 2017-01-12 10:15 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-kernel, linux-block, osandov, bart.vanassche

On Wed, Jan 11, 2017 at 02:39:54PM -0700, Jens Axboe wrote:
> Prep patch for adding MQ ops as well, since doing anon unions with
> named initializers doesn't work on older compilers.
> 
> Signed-off-by: Jens Axboe <axboe@fb.com>
> ---

Looks good,
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>

-- 
Johannes Thumshirn                                          Storage
jthumshirn@suse.de                                +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 05/10] blk-mq: export some helpers we need to the scheduling framework
  2017-01-11 21:39 ` [PATCH 05/10] blk-mq: export some helpers we need to the scheduling framework Jens Axboe
@ 2017-01-12 10:17   ` Johannes Thumshirn
  2017-01-12 21:20   ` Bart Van Assche
  2017-01-13  8:17   ` Christoph Hellwig
  2 siblings, 0 replies; 62+ messages in thread
From: Johannes Thumshirn @ 2017-01-12 10:17 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-kernel, linux-block, osandov, bart.vanassche

On Wed, Jan 11, 2017 at 02:39:58PM -0700, Jens Axboe wrote:
> Signed-off-by: Jens Axboe <axboe@fb.com>
> ---

Looks good,
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>

-- 
Johannes Thumshirn                                          Storage
jthumshirn@suse.de                                +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCHSET v6] blk-mq scheduling framework
  2017-01-11 21:39 [PATCHSET v6] blk-mq scheduling framework Jens Axboe
                   ` (9 preceding siblings ...)
  2017-01-11 21:40 ` [PATCH 10/10] blk-mq-sched: allow setting of default " Jens Axboe
@ 2017-01-12 21:16 ` Bart Van Assche
  2017-01-13  8:15 ` Hannes Reinecke
                   ` (2 subsequent siblings)
  13 siblings, 0 replies; 62+ messages in thread
From: Bart Van Assche @ 2017-01-12 21:16 UTC (permalink / raw)
  To: linux-kernel, linux-block, axboe; +Cc: osandov

On Wed, 2017-01-11 at 14:39 -0700, Jens Axboe wrote:
> I've reworked bits of this to get rid of the shadow requests, thanks
> to Bart for the inspiration. The missing piece, for me, was the fact
> that we have the tags->rqs[] indirection array already. I've done this
> somewhat differently, though, by having the internal scheduler tag
> map be allocated/torn down when an IO scheduler is attached or
> detached. This also means that when we run without a scheduler, we
> don't have to do double tag allocations, it'll work like before.

Hello Jens,

Thanks for having done the rework! This series looks great to me. I have a
few small comments though. I will post these as replies to the individual
patches.

Bart.

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 01/10] block: move existing elevator ops to union
  2017-01-11 21:39 ` [PATCH 01/10] block: move existing elevator ops to union Jens Axboe
  2017-01-12 10:15   ` Johannes Thumshirn
@ 2017-01-12 21:17   ` Bart Van Assche
  2017-01-13  8:34   ` Christoph Hellwig
  2 siblings, 0 replies; 62+ messages in thread
From: Bart Van Assche @ 2017-01-12 21:17 UTC (permalink / raw)
  To: linux-kernel, linux-block, axboe; +Cc: osandov

On Wed, 2017-01-11 at 14:39 -0700, Jens Axboe wrote:
> Prep patch for adding MQ ops as well, since doing anon unions with
> named initializers doesn't work on older compilers.

Reviewed-by: Bart Van Assche <bart.vanassche@sandisk.com>

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 03/10] block: move rq_ioc() to blk.h
  2017-01-11 21:39 ` [PATCH 03/10] block: move rq_ioc() to blk.h Jens Axboe
  2017-01-12 10:14   ` Johannes Thumshirn
@ 2017-01-12 21:18   ` Bart Van Assche
  2017-01-13  8:33   ` Christoph Hellwig
  2 siblings, 0 replies; 62+ messages in thread
From: Bart Van Assche @ 2017-01-12 21:18 UTC (permalink / raw)
  To: linux-kernel, linux-block, axboe; +Cc: osandov

On Wed, 2017-01-11 at 14:39 -0700, Jens Axboe wrote:
> We want to use it outside of blk-core.c.

Reviewed-by: Bart Van Assche <bart.vanassche@sandisk.com>

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 04/10] blk-mq: un-export blk_mq_free_hctx_request()
  2017-01-11 21:39 ` [PATCH 04/10] blk-mq: un-export blk_mq_free_hctx_request() Jens Axboe
  2017-01-12 10:13   ` Johannes Thumshirn
@ 2017-01-12 21:18   ` Bart Van Assche
  2017-01-13  8:16   ` Christoph Hellwig
  2 siblings, 0 replies; 62+ messages in thread
From: Bart Van Assche @ 2017-01-12 21:18 UTC (permalink / raw)
  To: linux-kernel, linux-block, axboe; +Cc: osandov

On Wed, 2017-01-11 at 14:39 -0700, Jens Axboe wrote:
> It's only used in blk-mq, kill it from the main exported header
> and kill the symbol export as well.

Reviewed-by: Bart Van Assche <bart.vanassche@sandisk.com>

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 05/10] blk-mq: export some helpers we need to the scheduling framework
  2017-01-11 21:39 ` [PATCH 05/10] blk-mq: export some helpers we need to the scheduling framework Jens Axboe
  2017-01-12 10:17   ` Johannes Thumshirn
@ 2017-01-12 21:20   ` Bart Van Assche
  2017-01-13  8:17   ` Christoph Hellwig
  2 siblings, 0 replies; 62+ messages in thread
From: Bart Van Assche @ 2017-01-12 21:20 UTC (permalink / raw)
  To: linux-kernel, linux-block, axboe; +Cc: osandov

On Wed, 2017-01-11 at 14:39 -0700, Jens Axboe wrote:
> [ ... ]

Reviewed-by: Bart Van Assche <bart.vanassche@sandisk.com>

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 06/10] blk-mq-tag: cleanup the normal/reserved tag allocation
  2017-01-11 21:39 ` [PATCH 06/10] blk-mq-tag: cleanup the normal/reserved tag allocation Jens Axboe
@ 2017-01-12 21:22   ` Bart Van Assche
  2017-01-12 22:07     ` Jens Axboe
  2017-01-13  8:30   ` Christoph Hellwig
  1 sibling, 1 reply; 62+ messages in thread
From: Bart Van Assche @ 2017-01-12 21:22 UTC (permalink / raw)
  To: linux-kernel, linux-block, axboe; +Cc: osandov

On Wed, 2017-01-11 at 14:39 -0700, Jens Axboe wrote:
> This is in preparation for having another tag set available. Cleanup
> the parameters, and allow passing in of tags fo blk_mq_put_tag().

It seems like an 'r' is missing from the description ("tags fo")?

Anyway:

Reviewed-by: Bart Van Assche <bart.vanassche@sandisk.com>

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 07/10] blk-mq: abstract out helpers for allocating/freeing tag maps
  2017-01-11 21:40 ` [PATCH 07/10] blk-mq: abstract out helpers for allocating/freeing tag maps Jens Axboe
@ 2017-01-12 21:29   ` Bart Van Assche
  2017-01-12 21:54     ` Jens Axboe
  0 siblings, 1 reply; 62+ messages in thread
From: Bart Van Assche @ 2017-01-12 21:29 UTC (permalink / raw)
  To: linux-kernel, linux-block, axboe; +Cc: osandov

On Wed, 2017-01-11 at 14:40 -0700, Jens Axboe wrote:
> @@ -2392,12 +2425,12 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
>  	if (set->nr_hw_queues > nr_cpu_ids)
>  		set->nr_hw_queues = nr_cpu_ids;
>  
> +	ret = -ENOMEM;
>  	set->tags = kzalloc_node(nr_cpu_ids * sizeof(struct blk_mq_tags *),
>  				 GFP_KERNEL, set->numa_node);
>  	if (!set->tags)
>  		return -ENOMEM;
>  
> -	ret = -ENOMEM;
>  	set->mq_map = kzalloc_node(sizeof(*set->mq_map) * nr_cpu_ids,
>  			GFP_KERNEL, set->numa_node);
>  	if (!set->mq_map)

Not that it matters to me, but this change probably isn't needed?

Anyway:

Reviewed-by: Bart Van Assche <bart.vanassche@sandisk.com>

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 08/10] blk-mq-sched: add framework for MQ capable IO schedulers
  2017-01-11 21:40 ` [PATCH 08/10] blk-mq-sched: add framework for MQ capable IO schedulers Jens Axboe
@ 2017-01-12 21:45   ` Bart Van Assche
  2017-01-12 21:59     ` Jens Axboe
  2017-01-13 11:15   ` Hannes Reinecke
  1 sibling, 1 reply; 62+ messages in thread
From: Bart Van Assche @ 2017-01-12 21:45 UTC (permalink / raw)
  To: linux-kernel, linux-block, axboe; +Cc: osandov

On Wed, 2017-01-11 at 14:40 -0700, Jens Axboe wrote:
> @@ -451,11 +456,11 @@ void blk_insert_flush(struct request *rq)
>  	 * processed directly without going through flush machinery.  Queue
>  	 * for normal execution.
>  	 */
> -	if ((policy & REQ_FSEQ_DATA) &&
> -	    !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
> -		if (q->mq_ops) {
> -			blk_mq_insert_request(rq, false, true, false);
> -		} else
> +	if (((policy & REQ_FSEQ_DATA) &&
> +	     !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH)))) {
> +		if (q->mq_ops)
> +			blk_mq_sched_insert_request(rq, false, true, false);
> +		else
>  			list_add_tail(&rq->queuelist, &q->queue_head);
>  		return;
>  	}

Not that it really matters, but this change adds a pair of parentheses --
"if (e)" is changed into "if ((e))". Is this necessary?

> +void blk_mq_sched_free_hctx_data(struct request_queue *q,
> +				 void (*exit)(struct blk_mq_hw_ctx *))
> +{
> +	struct blk_mq_hw_ctx *hctx;
> +	int i;
> +
> +	queue_for_each_hw_ctx(q, hctx, i) {
> +		if (exit)
> +			exit(hctx);
> +		kfree(hctx->sched_data);
> +		hctx->sched_data = NULL;
> +	}
> +}
> +EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);
> +
> +int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size,
> +				int (*init)(struct blk_mq_hw_ctx *),
> +				void (*exit)(struct blk_mq_hw_ctx *))
> +{
> +	struct blk_mq_hw_ctx *hctx;
> +	int ret;
> +	int i;
> +
> +	queue_for_each_hw_ctx(q, hctx, i) {
> +		hctx->sched_data = kmalloc_node(size, GFP_KERNEL, hctx->numa_node);
> +		if (!hctx->sched_data) {
> +			ret = -ENOMEM;
> +			goto error;
> +		}
> +
> +		if (init) {
> +			ret = init(hctx);
> +			if (ret) {
> +				/*
> +				 * We don't want to give exit() a partially
> +				 * initialized sched_data. init() must clean up
> +				 * if it fails.
> +				 */
> +				kfree(hctx->sched_data);
> +				hctx->sched_data = NULL;
> +				goto error;
> +			}
> +		}
> +	}
> +
> +	return 0;
> +error:
> +	blk_mq_sched_free_hctx_data(q, exit);
> +	return ret;
> +}

If one of the init() calls by blk_mq_sched_init_hctx_data() fails then
blk_mq_sched_free_hctx_data() will call exit() even for hctx's for which
init() has not been called. How about changing "if (exit)" into "if (exit &&
hctx->sched_data)" such that exit() is only called for hctx's for which
init() has been called?
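
I.e. something like this (a sketch of the suggested guard only, same
loop as quoted above):

	queue_for_each_hw_ctx(q, hctx, i) {
		if (exit && hctx->sched_data)
			exit(hctx);
		kfree(hctx->sched_data);
		hctx->sched_data = NULL;
	}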

> +struct request *blk_mq_sched_get_request(struct request_queue *q,
> +					 struct bio *bio,
> +					 unsigned int op,
> +					 struct blk_mq_alloc_data *data)
> +{
> +	struct elevator_queue *e = q->elevator;
> +	struct blk_mq_hw_ctx *hctx;
> +	struct blk_mq_ctx *ctx;
> +	struct request *rq;
> +
> +	blk_queue_enter_live(q);
> +	ctx = blk_mq_get_ctx(q);
> +	hctx = blk_mq_map_queue(q, ctx->cpu);
> +
> +	blk_mq_set_alloc_data(data, q, 0, ctx, hctx);
> +
> +	if (e) {
> +		data->flags |= BLK_MQ_REQ_INTERNAL;
> +		if (e->type->ops.mq.get_request)
> +			rq = e->type->ops.mq.get_request(q, op, data);
> +		else
> +			rq = __blk_mq_alloc_request(data, op);
> +	} else {
> +		rq = __blk_mq_alloc_request(data, op);
> +		if (rq) {
> +			rq->tag = rq->internal_tag;
> +			rq->internal_tag = -1;
> +		}
> +	}
> +
> +	if (rq) {
> +		rq->elv.icq = NULL;
> +		if (e && e->type->icq_cache)
> +			blk_mq_sched_assign_ioc(q, rq, bio);
> +		data->hctx->queued++;
> +		return rq;
> +	}
> +
> +	blk_queue_exit(q);
> +	return NULL;
> +}

The "rq->tag = rq->internal_tag; rq->internal_tag = -1;" occurs not only
here but also in blk_mq_alloc_request_hctx(). Has it been considered to move
that code into __blk_mq_alloc_request()?

@@ -223,14 +225,17 @@ struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data,
>  
>  	tag = blk_mq_get_tag(data);
>  	if (tag != BLK_MQ_TAG_FAIL) {
> -		rq = data->hctx->tags->rqs[tag];
> +		struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
> +
> +		rq = tags->rqs[tag];
>  
>  		if (blk_mq_tag_busy(data->hctx)) {
>  			rq->rq_flags = RQF_MQ_INFLIGHT;
>  			atomic_inc(&data->hctx->nr_active);
>  		}
>  
> -		rq->tag = tag;
> +		rq->tag = -1;
> +		rq->internal_tag = tag;
>  		blk_mq_rq_ctx_init(data->q, data->ctx, rq, op);
>  		return rq;
>  	}

How about using the following code for tag assignment instead of "rq->tag =
-1; rq->internal_tag = tag"?

		if (data->flags & BLK_MQ_REQ_INTERNAL) {
			rq->tag = -1;
			rq->internal_tag = tag;
		} else {
			rq->tag = tag;
			rq->internal_tag = -1;
		}

> @@ -313,6 +313,9 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw,
>  		goto out_queue_exit;
>  	}
>  
> +	rq->tag = rq->internal_tag;
> +	rq->internal_tag = -1;
> +
>  	return rq;
>  
>  out_queue_exit:
> @@ -321,10 +324,10 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw,
>  }
>  EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);

Should something like "WARN_ON_ONCE(flags & BLK_MQ_REQ_INTERNAL)" be added
at the start of this function to avoid that BLK_MQ_REQ_INTERNAL is passed in
from outside the block layer?
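
Something like the below at the top of that function should be enough
(sketch only, error value assumed):

	if (WARN_ON_ONCE(flags & BLK_MQ_REQ_INTERNAL))
		return ERR_PTR(-EINVAL);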

Bart.

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 09/10] mq-deadline: add blk-mq adaptation of the deadline IO scheduler
  2017-01-11 21:40 ` [PATCH 09/10] mq-deadline: add blk-mq adaptation of the deadline IO scheduler Jens Axboe
@ 2017-01-12 21:53   ` Bart Van Assche
  0 siblings, 0 replies; 62+ messages in thread
From: Bart Van Assche @ 2017-01-12 21:53 UTC (permalink / raw)
  To: linux-kernel, linux-block, axboe; +Cc: osandov

On Wed, 2017-01-11 at 14:40 -0700, Jens Axboe wrote:
> This is basically identical to deadline-iosched, except it registers
> as a MQ capable scheduler. This is still a single queue design.

Reviewed-by: Bart Van Assche <bart.vanassche@sandisk.com>

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 10/10] blk-mq-sched: allow setting of default IO scheduler
  2017-01-11 21:40 ` [PATCH 10/10] blk-mq-sched: allow setting of default " Jens Axboe
@ 2017-01-12 21:54   ` Bart Van Assche
  0 siblings, 0 replies; 62+ messages in thread
From: Bart Van Assche @ 2017-01-12 21:54 UTC (permalink / raw)
  To: linux-kernel, linux-block, axboe; +Cc: osandov

On Wed, 2017-01-11 at 14:40 -0700, Jens Axboe wrote:
> Add Kconfig entries to manage what devices get assigned an MQ
> scheduler, and add a blk-mq flag for drivers to opt out of scheduling.
> The latter is useful for admin type queues that still allocate a blk-mq
> queue and tag set, but aren't used for normal IO.

Reviewed-by: Bart Van Assche <bart.vanassche@sandisk.com>

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 07/10] blk-mq: abstract out helpers for allocating/freeing tag maps
  2017-01-12 21:29   ` Bart Van Assche
@ 2017-01-12 21:54     ` Jens Axboe
  2017-01-13  8:25       ` Johannes Thumshirn
  0 siblings, 1 reply; 62+ messages in thread
From: Jens Axboe @ 2017-01-12 21:54 UTC (permalink / raw)
  To: Bart Van Assche; +Cc: linux-kernel, linux-block, osandov

On Thu, Jan 12 2017, Bart Van Assche wrote:
> On Wed, 2017-01-11 at 14:40 -0700, Jens Axboe wrote:
> > @@ -2392,12 +2425,12 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
> >  	if (set->nr_hw_queues > nr_cpu_ids)
> >  		set->nr_hw_queues = nr_cpu_ids;
> >  
> > +	ret = -ENOMEM;
> >  	set->tags = kzalloc_node(nr_cpu_ids * sizeof(struct blk_mq_tags *),
> >  				 GFP_KERNEL, set->numa_node);
> >  	if (!set->tags)
> >  		return -ENOMEM;
> >  
> > -	ret = -ENOMEM;
> >  	set->mq_map = kzalloc_node(sizeof(*set->mq_map) * nr_cpu_ids,
> >  			GFP_KERNEL, set->numa_node);
> >  	if (!set->mq_map)
> 
> Not that it matters to me, but this change probably isn't needed?

Huh oops no, I'll move that back where it belongs.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 08/10] blk-mq-sched: add framework for MQ capable IO schedulers
  2017-01-12 21:45   ` Bart Van Assche
@ 2017-01-12 21:59     ` Jens Axboe
  0 siblings, 0 replies; 62+ messages in thread
From: Jens Axboe @ 2017-01-12 21:59 UTC (permalink / raw)
  To: Bart Van Assche; +Cc: linux-kernel, linux-block, osandov

On Thu, Jan 12 2017, Bart Van Assche wrote:
> On Wed, 2017-01-11 at 14:40 -0700, Jens Axboe wrote:
> > @@ -451,11 +456,11 @@ void blk_insert_flush(struct request *rq)
> >  	 * processed directly without going through flush machinery.  Queue
> >  	 * for normal execution.
> >  	 */
> > -	if ((policy & REQ_FSEQ_DATA) &&
> > -	    !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
> > -		if (q->mq_ops) {
> > -			blk_mq_insert_request(rq, false, true, false);
> > -		} else
> > +	if (((policy & REQ_FSEQ_DATA) &&
> > +	     !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH)))) {
> > +		if (q->mq_ops)
> > +			blk_mq_sched_insert_request(rq, false, true, false);
> > +		else
> >  			list_add_tail(&rq->queuelist, &q->queue_head);
> >  		return;
> >  	}
> 
> Not that it really matters, but this change adds a pair of parentheses --
> "if (e)" is changed into "if ((e))". Is this necessary?

I fixed that up earlier today, as I noticed the same. So that's gone in
the current -git tree.

> > +void blk_mq_sched_free_hctx_data(struct request_queue *q,
> > +				 void (*exit)(struct blk_mq_hw_ctx *))
> > +{
> > +	struct blk_mq_hw_ctx *hctx;
> > +	int i;
> > +
> > +	queue_for_each_hw_ctx(q, hctx, i) {
> > +		if (exit)
> > +			exit(hctx);
> > +		kfree(hctx->sched_data);
> > +		hctx->sched_data = NULL;
> > +	}
> > +}
> > +EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);
> > +
> > +int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size,
> > +				int (*init)(struct blk_mq_hw_ctx *),
> > +				void (*exit)(struct blk_mq_hw_ctx *))
> > +{
> > +	struct blk_mq_hw_ctx *hctx;
> > +	int ret;
> > +	int i;
> > +
> > +	queue_for_each_hw_ctx(q, hctx, i) {
> > +		hctx->sched_data = kmalloc_node(size, GFP_KERNEL, hctx->numa_node);
> > +		if (!hctx->sched_data) {
> > +			ret = -ENOMEM;
> > +			goto error;
> > +		}
> > +
> > +		if (init) {
> > +			ret = init(hctx);
> > +			if (ret) {
> > +				/*
> > +				 * We don't want to give exit() a partially
> > +				 * initialized sched_data. init() must clean up
> > +				 * if it fails.
> > +				 */
> > +				kfree(hctx->sched_data);
> > +				hctx->sched_data = NULL;
> > +				goto error;
> > +			}
> > +		}
> > +	}
> > +
> > +	return 0;
> > +error:
> > +	blk_mq_sched_free_hctx_data(q, exit);
> > +	return ret;
> > +}
> 
> If one of the init() calls by blk_mq_sched_init_hctx_data() fails then
> blk_mq_sched_free_hctx_data() will call exit() even for hctx's for which
> init() has not been called. How about changing "if (exit)" into "if (exit &&
> hctx->sched_data)" such that exit() is only called for hctx's for which
> init() has been called?

Good point, I'll make that change to the exit function.

> > +struct request *blk_mq_sched_get_request(struct request_queue *q,
> > +					 struct bio *bio,
> > +					 unsigned int op,
> > +					 struct blk_mq_alloc_data *data)
> > +{
> > +	struct elevator_queue *e = q->elevator;
> > +	struct blk_mq_hw_ctx *hctx;
> > +	struct blk_mq_ctx *ctx;
> > +	struct request *rq;
> > +
> > +	blk_queue_enter_live(q);
> > +	ctx = blk_mq_get_ctx(q);
> > +	hctx = blk_mq_map_queue(q, ctx->cpu);
> > +
> > +	blk_mq_set_alloc_data(data, q, 0, ctx, hctx);
> > +
> > +	if (e) {
> > +		data->flags |= BLK_MQ_REQ_INTERNAL;
> > +		if (e->type->ops.mq.get_request)
> > +			rq = e->type->ops.mq.get_request(q, op, data);
> > +		else
> > +			rq = __blk_mq_alloc_request(data, op);
> > +	} else {
> > +		rq = __blk_mq_alloc_request(data, op);
> > +		if (rq) {
> > +			rq->tag = rq->internal_tag;
> > +			rq->internal_tag = -1;
> > +		}
> > +	}
> > +
> > +	if (rq) {
> > +		rq->elv.icq = NULL;
> > +		if (e && e->type->icq_cache)
> > +			blk_mq_sched_assign_ioc(q, rq, bio);
> > +		data->hctx->queued++;
> > +		return rq;
> > +	}
> > +
> > +	blk_queue_exit(q);
> > +	return NULL;
> > +}
> 
> The "rq->tag = rq->internal_tag; rq->internal_tag = -1;" occurs not only
> here but also in blk_mq_alloc_request_hctx(). Has it been considered to move
> that code into __blk_mq_alloc_request()?

Yes, it's in two locations. I wanted to keep it out of
__blk_mq_alloc_request(), so we can still use that for normal tag
allocations. But maybe it's better for __blk_mq_alloc_request() to just
do:

        if (flags & BLK_MQ_REQ_INTERNAL) {
                rq->tag = -1;
                rq->internal_tag = tag;
        } else {
                rq->tag = tag;
                rq->internal_tag = -1;
        }

and handle it directly in there. What do you think?

> @@ -223,14 +225,17 @@ struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data,
> >  
> >  	tag = blk_mq_get_tag(data);
> >  	if (tag != BLK_MQ_TAG_FAIL) {
> > -		rq = data->hctx->tags->rqs[tag];
> > +		struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
> > +
> > +		rq = tags->rqs[tag];
> >  
> >  		if (blk_mq_tag_busy(data->hctx)) {
> >  			rq->rq_flags = RQF_MQ_INFLIGHT;
> >  			atomic_inc(&data->hctx->nr_active);
> >  		}
> >  
> > -		rq->tag = tag;
> > +		rq->tag = -1;
> > +		rq->internal_tag = tag;
> >  		blk_mq_rq_ctx_init(data->q, data->ctx, rq, op);
> >  		return rq;
> >  	}
> 
> How about using the following code for tag assignment instead of "rq->tag =
> -1; rq->internal_tag = tag"?
> 
> 		if (data->flags & BLK_MQ_REQ_INTERNAL) {
> 			rq->tag = -1;
> 			rq->internal_tag = tag;
> 		} else {
> 			rq->tag = tag;
> 			rq->internal_tag = -1;
> 		}

Hah, nevermind, I should have read further. I guess we agree, I'll make
that change.

> > @@ -313,6 +313,9 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw,
> >  		goto out_queue_exit;
> >  	}
> >  
> > +	rq->tag = rq->internal_tag;
> > +	rq->internal_tag = -1;
> > +
> >  	return rq;
> >  
> >  out_queue_exit:
> > @@ -321,10 +324,10 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw,
> >  }
> >  EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
> 
> Should something like "WARN_ON_ONCE(flags & BLK_MQ_REQ_INTERNAL)" be added
> at the start of this function to avoid that BLK_MQ_REQ_INTERNAL is passed in
> from outside the block layer?

Yes, seems like a prudent safety check. I'll add it, thanks.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 06/10] blk-mq-tag: cleanup the normal/reserved tag allocation
  2017-01-12 21:22   ` Bart Van Assche
@ 2017-01-12 22:07     ` Jens Axboe
  0 siblings, 0 replies; 62+ messages in thread
From: Jens Axboe @ 2017-01-12 22:07 UTC (permalink / raw)
  To: Bart Van Assche; +Cc: linux-kernel, linux-block, osandov

On Thu, Jan 12 2017, Bart Van Assche wrote:
> On Wed, 2017-01-11 at 14:39 -0700, Jens Axboe wrote:
> > This is in preparation for having another tag set available. Cleanup
> > the parameters, and allow passing in of tags fo blk_mq_put_tag().
> 
> It seems like an 'r' is missing from the description ("tags fo")?

Indeed, good eye. Added.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCHSET v6] blk-mq scheduling framework
  2017-01-11 21:39 [PATCHSET v6] blk-mq scheduling framework Jens Axboe
                   ` (10 preceding siblings ...)
  2017-01-12 21:16 ` [PATCHSET v6] blk-mq scheduling framework Bart Van Assche
@ 2017-01-13  8:15 ` Hannes Reinecke
  2017-01-13 11:04   ` Hannes Reinecke
  2017-01-13 10:09 ` Hannes Reinecke
  2017-01-15 10:12 ` Paolo Valente
  13 siblings, 1 reply; 62+ messages in thread
From: Hannes Reinecke @ 2017-01-13  8:15 UTC (permalink / raw)
  To: Jens Axboe, linux-kernel, linux-block; +Cc: osandov, bart.vanassche

On 01/11/2017 10:39 PM, Jens Axboe wrote:
> Another year, another posting of this patchset. The previous posting
> was here:
> 
> https://www.spinics.net/lists/kernel/msg2406106.html
> 
> (yes, I've skipped v5, it was fixes on top of v4, not the rework).
> 
> I've reworked bits of this to get rid of the shadow requests, thanks
> to Bart for the inspiration. The missing piece, for me, was the fact
> that we have the tags->rqs[] indirection array already. I've done this
> somewhat differently, though, by having the internal scheduler tag
> map be allocated/torn down when an IO scheduler is attached or
> detached. This also means that when we run without a scheduler, we
> don't have to do double tag allocations, it'll work like before.
> 
> The patchset applies on top of 4.10-rc3, or can be pulled here:
> 
> git://git.kernel.dk/linux-block blk-mq-sched.6
> 
Well ... something's wrong here on my machine:

[   39.886886] ------------[ cut here ]------------
[   39.886895] WARNING: CPU: 9 PID: 62 at block/blk-mq.c:342
__blk_mq_finish_request+0x124/0x140
[   39.886895] Modules linked in: sd_mod ahci uhci_hcd ehci_pci
mpt3sas(+) libahci ehci_hcd serio_raw crc32c_intel raid_class drm libata
usbcore hpsa usb_common scsi_transport_sas sg dm_multipath dm_mod
scsi_dh_rdac scsi_dh_emc scsi_dh_alua autofs4
[   39.886910] CPU: 9 PID: 62 Comm: kworker/u130:0 Not tainted
4.10.0-rc3+ #528
[   39.886911] Hardware name: HP ProLiant ML350p Gen8, BIOS P72 09/08/2013
[   39.886917] Workqueue: events_unbound async_run_entry_fn
[   39.886918] Call Trace:
[   39.886923]  dump_stack+0x85/0xc9
[   39.886927]  __warn+0xd1/0xf0
[   39.886928]  warn_slowpath_null+0x1d/0x20
[   39.886930]  __blk_mq_finish_request+0x124/0x140
[   39.886932]  blk_mq_finish_request+0x55/0x60
[   39.886934]  blk_mq_sched_put_request+0x78/0x80
[   39.886936]  blk_mq_free_request+0xe/0x10
[   39.886938]  blk_put_request+0x25/0x60
[   39.886944]  __scsi_execute.isra.24+0x104/0x160
[   39.886946]  scsi_execute_req_flags+0x94/0x100
[   39.886948]  scsi_report_opcode+0xab/0x100

checking ...

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		   Teamlead Storage & Networking
hare@suse.de			               +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 02/10] blk-mq: make mq_ops a const pointer
  2017-01-11 21:39 ` [PATCH 02/10] blk-mq: make mq_ops a const pointer Jens Axboe
  2017-01-12 10:14   ` Johannes Thumshirn
@ 2017-01-13  8:16   ` Christoph Hellwig
  1 sibling, 0 replies; 62+ messages in thread
From: Christoph Hellwig @ 2017-01-13  8:16 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-kernel, linux-block, osandov, bart.vanassche

Looks fine, even totally independent of any sched work, like most
of the cleanups following this one.

Reviewed-by: Christoph Hellwig <hch@lst.de>

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 04/10] blk-mq: un-export blk_mq_free_hctx_request()
  2017-01-11 21:39 ` [PATCH 04/10] blk-mq: un-export blk_mq_free_hctx_request() Jens Axboe
  2017-01-12 10:13   ` Johannes Thumshirn
  2017-01-12 21:18   ` Bart Van Assche
@ 2017-01-13  8:16   ` Christoph Hellwig
  2 siblings, 0 replies; 62+ messages in thread
From: Christoph Hellwig @ 2017-01-13  8:16 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-kernel, linux-block, osandov, bart.vanassche

On Wed, Jan 11, 2017 at 02:39:57PM -0700, Jens Axboe wrote:
> It's only used in blk-mq, kill it from the main exported header
> and kill the symbol export as well.

Looks fine,

Reviewed-by: Christoph Hellwig <hch@lst.de>

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 05/10] blk-mq: export some helpers we need to the scheduling framework
  2017-01-11 21:39 ` [PATCH 05/10] blk-mq: export some helpers we need to the scheduling framework Jens Axboe
  2017-01-12 10:17   ` Johannes Thumshirn
  2017-01-12 21:20   ` Bart Van Assche
@ 2017-01-13  8:17   ` Christoph Hellwig
  2017-01-13 15:01     ` Jens Axboe
  2 siblings, 1 reply; 62+ messages in thread
From: Christoph Hellwig @ 2017-01-13  8:17 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-kernel, linux-block, osandov, bart.vanassche

Do we really need to make any of the blk-mq scheduling modular?

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 07/10] blk-mq: abstract out helpers for allocating/freeing tag maps
  2017-01-12 21:54     ` Jens Axboe
@ 2017-01-13  8:25       ` Johannes Thumshirn
  0 siblings, 0 replies; 62+ messages in thread
From: Johannes Thumshirn @ 2017-01-13  8:25 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Bart Van Assche, linux-kernel, linux-block, osandov

On Thu, Jan 12, 2017 at 02:54:38PM -0700, Jens Axboe wrote:
> On Thu, Jan 12 2017, Bart Van Assche wrote:
> > On Wed, 2017-01-11 at 14:40 -0700, Jens Axboe wrote:
> > > @@ -2392,12 +2425,12 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
> > >  	if (set->nr_hw_queues > nr_cpu_ids)
> > >  		set->nr_hw_queues = nr_cpu_ids;
> > >  
> > > +	ret = -ENOMEM;
> > >  	set->tags = kzalloc_node(nr_cpu_ids * sizeof(struct blk_mq_tags *),
> > >  				 GFP_KERNEL, set->numa_node);
> > >  	if (!set->tags)
> > >  		return -ENOMEM;
> > >  
> > > -	ret = -ENOMEM;
> > >  	set->mq_map = kzalloc_node(sizeof(*set->mq_map) * nr_cpu_ids,
> > >  			GFP_KERNEL, set->numa_node);
> > >  	if (!set->mq_map)
> > 
> > Not that it matters to me, but this change probably isn't needed?
> 
> Huh oops no, I'll move that back where it belongs.

Maybe even:

+ ret = -ENOMEM;
  set->tags = kzalloc_node(nr_cpu_ids * sizeof(struct blk_mq_tags *),
  			 GFP_KERNEL, set->numa_node);
  if (!set->tags)
-  	return -ENOMEM;
+	return ret;

- ret = -ENOMEM;

-- 
Johannes Thumshirn                                          Storage
jthumshirn@suse.de                                +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 06/10] blk-mq-tag: cleanup the normal/reserved tag allocation
  2017-01-11 21:39 ` [PATCH 06/10] blk-mq-tag: cleanup the normal/reserved tag allocation Jens Axboe
  2017-01-12 21:22   ` Bart Van Assche
@ 2017-01-13  8:30   ` Christoph Hellwig
  2017-01-13 15:06     ` Jens Axboe
  1 sibling, 1 reply; 62+ messages in thread
From: Christoph Hellwig @ 2017-01-13  8:30 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-kernel, linux-block, osandov, bart.vanassche

On Wed, Jan 11, 2017 at 02:39:59PM -0700, Jens Axboe wrote:
> This is in preparation for having another tag set available. Cleanup
> the parameters, and allow passing in of tags fo blk_mq_put_tag().

I've been playing around with this area a bit but never submitted
anything.  Below is an untested merge of my previous bits with your
patch, let me know what you think about it:

---
>From c9ea92bfb468e9116149db95d246e48ace2b87f1 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Wed, 11 Jan 2017 11:04:53 -0700
Subject: blk-mq-tag: cleanup the normal/reserved tag allocation

This is in preparation for having another tag set available. Cleanup
the parameters, and allow passing in of tags for blk_mq_put_tag().

Signed-off-by: Jens Axboe <axboe@fb.com>
[hch: even more cleanups]
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-mq-tag.c | 94 +++++++++++++++++++++---------------------------------
 block/blk-mq-tag.h |  4 +--
 block/blk-mq.c     |  2 +-
 block/blk-mq.h     |  5 +++
 4 files changed, 44 insertions(+), 61 deletions(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index dcf5ce3..ced7527 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -90,32 +90,46 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
 	return atomic_read(&hctx->nr_active) < depth;
 }
 
-static int __bt_get(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt)
+static int __blk_mq_get_tag(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt)
 {
 	if (!hctx_may_queue(hctx, bt))
 		return -1;
 	return __sbitmap_queue_get(bt);
 }
 
-static int bt_get(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt,
-		  struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags)
+unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 {
+	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
+	struct sbitmap_queue *bt;
 	struct sbq_wait_state *ws;
 	DEFINE_WAIT(wait);
+	unsigned int tag_offset;
 	int tag;
 
-	tag = __bt_get(hctx, bt);
+	if (data->flags & BLK_MQ_REQ_RESERVED) {
+		if (unlikely(!tags->nr_reserved_tags)) {
+			WARN_ON_ONCE(1);
+			return BLK_MQ_TAG_FAIL;
+		}
+		bt = &tags->breserved_tags;
+		tag_offset = 0;
+	} else {
+		bt = &tags->bitmap_tags;
+		tag_offset = tags->nr_reserved_tags;
+	}
+
+	tag = __blk_mq_get_tag(data->hctx, bt);
 	if (tag != -1)
-		return tag;
+		goto found_tag;
 
 	if (data->flags & BLK_MQ_REQ_NOWAIT)
-		return -1;
+		return BLK_MQ_TAG_FAIL;
 
-	ws = bt_wait_ptr(bt, hctx);
+	ws = bt_wait_ptr(bt, data->hctx);
 	do {
 		prepare_to_wait(&ws->wait, &wait, TASK_UNINTERRUPTIBLE);
 
-		tag = __bt_get(hctx, bt);
+		tag = __blk_mq_get_tag(data->hctx, bt);
 		if (tag != -1)
 			break;
 
@@ -125,14 +139,14 @@ static int bt_get(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt,
 		 * some to complete. Note that hctx can be NULL here for
 		 * reserved tag allocation.
 		 */
-		if (hctx)
-			blk_mq_run_hw_queue(hctx, false);
+		if (data->hctx)
+			blk_mq_run_hw_queue(data->hctx, false);
 
 		/*
 		 * Retry tag allocation after running the hardware queue,
 		 * as running the queue may also have found completions.
 		 */
-		tag = __bt_get(hctx, bt);
+		tag = __blk_mq_get_tag(data->hctx, bt);
 		if (tag != -1)
 			break;
 
@@ -142,61 +156,25 @@ static int bt_get(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt,
 
 		data->ctx = blk_mq_get_ctx(data->q);
 		data->hctx = blk_mq_map_queue(data->q, data->ctx->cpu);
-		if (data->flags & BLK_MQ_REQ_RESERVED) {
-			bt = &data->hctx->tags->breserved_tags;
-		} else {
-			hctx = data->hctx;
-			bt = &hctx->tags->bitmap_tags;
-		}
+		tags = blk_mq_tags_from_data(data);
+		if (data->flags & BLK_MQ_REQ_RESERVED)
+			bt = &tags->breserved_tags;
+		else
+			bt = &tags->bitmap_tags;
+
 		finish_wait(&ws->wait, &wait);
-		ws = bt_wait_ptr(bt, hctx);
+		ws = bt_wait_ptr(bt, data->hctx);
 	} while (1);
 
 	finish_wait(&ws->wait, &wait);
-	return tag;
-}
-
-static unsigned int __blk_mq_get_tag(struct blk_mq_alloc_data *data)
-{
-	int tag;
-
-	tag = bt_get(data, &data->hctx->tags->bitmap_tags, data->hctx,
-		     data->hctx->tags);
-	if (tag >= 0)
-		return tag + data->hctx->tags->nr_reserved_tags;
-
-	return BLK_MQ_TAG_FAIL;
-}
-
-static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_alloc_data *data)
-{
-	int tag;
 
-	if (unlikely(!data->hctx->tags->nr_reserved_tags)) {
-		WARN_ON_ONCE(1);
-		return BLK_MQ_TAG_FAIL;
-	}
-
-	tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL,
-		     data->hctx->tags);
-	if (tag < 0)
-		return BLK_MQ_TAG_FAIL;
-
-	return tag;
+found_tag:
+	return tag + tag_offset;
 }
 
-unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
+void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags,
+		    struct blk_mq_ctx *ctx, unsigned int tag)
 {
-	if (data->flags & BLK_MQ_REQ_RESERVED)
-		return __blk_mq_get_reserved_tag(data);
-	return __blk_mq_get_tag(data);
-}
-
-void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
-		    unsigned int tag)
-{
-	struct blk_mq_tags *tags = hctx->tags;
-
 	if (tag >= tags->nr_reserved_tags) {
 		const int real_tag = tag - tags->nr_reserved_tags;
 
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index d166273..923602d 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -24,8 +24,8 @@ extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int r
 extern void blk_mq_free_tags(struct blk_mq_tags *tags);
 
 extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);
-extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
-			   unsigned int tag);
+extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags,
+			   struct blk_mq_ctx *ctx, unsigned int tag);
 extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags);
 extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page);
 extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9fc5217..6fab8e9 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -335,7 +335,7 @@ void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
 
 	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
 	clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
-	blk_mq_put_tag(hctx, ctx, tag);
+	blk_mq_put_tag(hctx, hctx->tags, ctx, tag);
 	blk_queue_exit(q);
 }
 
diff --git a/block/blk-mq.h b/block/blk-mq.h
index e59f5ca..48b7771 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -118,6 +118,11 @@ static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data,
 	data->hctx = hctx;
 }
 
+static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data)
+{
+	return data->hctx->tags;
+}
+
 /*
  * Internal helpers for request allocation/init/free
  */
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 62+ messages in thread

* Re: [PATCH 03/10] block: move rq_ioc() to blk.h
  2017-01-11 21:39 ` [PATCH 03/10] block: move rq_ioc() to blk.h Jens Axboe
  2017-01-12 10:14   ` Johannes Thumshirn
  2017-01-12 21:18   ` Bart Van Assche
@ 2017-01-13  8:33   ` Christoph Hellwig
  2 siblings, 0 replies; 62+ messages in thread
From: Christoph Hellwig @ 2017-01-13  8:33 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-kernel, linux-block, osandov, bart.vanassche

Looks fine,

Reviewed-by: Christoph Hellwig <hch@lst.de>

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 01/10] block: move existing elevator ops to union
  2017-01-11 21:39 ` [PATCH 01/10] block: move existing elevator ops to union Jens Axboe
  2017-01-12 10:15   ` Johannes Thumshirn
  2017-01-12 21:17   ` Bart Van Assche
@ 2017-01-13  8:34   ` Christoph Hellwig
  2017-01-13 15:00     ` Jens Axboe
  2 siblings, 1 reply; 62+ messages in thread
From: Christoph Hellwig @ 2017-01-13  8:34 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-kernel, linux-block, osandov, bart.vanassche

On Wed, Jan 11, 2017 at 02:39:54PM -0700, Jens Axboe wrote:
> Prep patch for adding MQ ops as well, since doing anon unions with
> named initializers doesn't work on older compilers.

My gut feeling tells me MQ schedulers shouldn't share elevator_type.
I'll play around a bit with the code later to see if my gut was right.

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCHSET v6] blk-mq scheduling framework
  2017-01-11 21:39 [PATCHSET v6] blk-mq scheduling framework Jens Axboe
                   ` (11 preceding siblings ...)
  2017-01-13  8:15 ` Hannes Reinecke
@ 2017-01-13 10:09 ` Hannes Reinecke
  2017-01-15 10:12 ` Paolo Valente
  13 siblings, 0 replies; 62+ messages in thread
From: Hannes Reinecke @ 2017-01-13 10:09 UTC (permalink / raw)
  To: Jens Axboe, linux-kernel, linux-block; +Cc: osandov, bart.vanassche

On 01/11/2017 10:39 PM, Jens Axboe wrote:
> Another year, another posting of this patchset. The previous posting
> was here:
> 
> https://www.spinics.net/lists/kernel/msg2406106.html
> 
> (yes, I've skipped v5, it was fixes on top of v4, not the rework).
> 
> I've reworked bits of this to get rid of the shadow requests, thanks
> to Bart for the inspiration. The missing piece, for me, was the fact
> that we have the tags->rqs[] indirection array already. I've done this
> somewhat differently, though, by having the internal scheduler tag
> map be allocated/torn down when an IO scheduler is attached or
> detached. This also means that when we run without a scheduler, we
> don't have to do double tag allocations, it'll work like before.
> 
> The patchset applies on top of 4.10-rc3, or can be pulled here:
> 
> git://git.kernel.dk/linux-block blk-mq-sched.6
> 
Fun continues:

[   28.976708] ata3.00: configured for UDMA/100
[   28.987625] BUG: unable to handle kernel NULL pointer dereference at
0000000000000048
[   28.987632] IP: deadline_add_request+0x15/0x70
[   28.987633] PGD 0
[   28.987634]
[   28.987636] Oops: 0000 [#1] SMP
[   28.987638] Modules linked in: ahci libahci libata uhci_hcd(+)
mgag200(+) i2c_algo_bit drm_kms_helper syscopyarea sysfillrect sysimgblt
fb_sys_fops ttm drm tg3 libphy ehci_pci ehci_hcd usbcore usb_common
ixgbe mdio hpsa(+) dca ptp pps_core scsi_transport_sas fjes(+) sg
dm_multipath dm_mod scsi_dh_rdac scsi_dh_emc scsi_dh_alua autofs4
[   28.987654] CPU: 0 PID: 268 Comm: kworker/u2:2 Not tainted
4.10.0-rc3+ #535
[   28.987655] Hardware name: HP ProLiant ML350p Gen8, BIOS P72 09/08/2013
[   28.987660] Workqueue: events_unbound async_run_entry_fn
[   28.987661] task: ffff880029391600 task.stack: ffffc9000038c000
[   28.987663] RIP: 0010:deadline_add_request+0x15/0x70
[   28.987664] RSP: 0018:ffffc9000038fb00 EFLAGS: 00010286
[   28.987665] RAX: ffff88003260c400 RBX: 0000000000000000 RCX:
0000000000000000
[   28.987666] RDX: ffffc9000038fb68 RSI: 0000000000000000 RDI:
ffff8800293b9040
[   28.987666] RBP: ffffc9000038fb18 R08: 000000000087668] R13:
ffff88003260c400 R14: 0000000000000000 R15: 0000000000000000
[   28.987670] FS:  0000000000000000(0000) GS:ffff880035c00000(0000)
knlGS:0000000000000000
[   28.987670] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   28.987671] CR2: 0000000000000048 CR3: 0000000032b64000 CR4:
00000000000406f0
[   28.987672] Call Trace:
[   28.987677]  blk_mq_sched_get_request+0x12e/0x310
[   28.987678]  ? blk_mq_sched_get_request+0x5/0x310
[   28.987681]  blk_mq_alloc_request+0x40/0x90
[   28.987684]  blk_get_request+0x35/0x110
[   28.987689]  __scsi_execute.isra.24+0x3c/0x160
[   28.987691]  scsi_execute_req_flags+0x94/0x100
[   28.987694]  scsi_probe_and_add_lun+0x207/0xd60
[   28.987699]  ? __pm_rme_resume+0x5c/0x80
[   28.987701]  __scsi_add_device+0x103/0x120
[   28.987709]  ata_scsi_scan_host+0xa3/0x1d0 [libata]
[   28.987716]  async_port_probe+0x43/0x60 [libata]
[   28.987718]  async_run_entry_fn+0x37/0x150
[   28.987722]  process_one_work+0x1d0/0x660

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		   Teamlead Storage & Networking
hare@suse.de			               +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCHSET v6] blk-mq scheduling framework
  2017-01-13  8:15 ` Hannes Reinecke
@ 2017-01-13 11:04   ` Hannes Reinecke
  2017-01-13 12:10     ` Hannes Reinecke
                       ` (2 more replies)
  0 siblings, 3 replies; 62+ messages in thread
From: Hannes Reinecke @ 2017-01-13 11:04 UTC (permalink / raw)
  To: Jens Axboe, linux-kernel, linux-block; +Cc: osandov, bart.vanassche

On 01/13/2017 09:15 AM, Hannes Reinecke wrote:
> On 01/11/2017 10:39 PM, Jens Axboe wrote:
>> Another year, another posting of this patchset. The previous posting
>> was here:
>>
>> https://www.spinics.net/lists/kernel/msg2406106.html
>>
>> (yes, I've skipped v5, it was fixes on top of v4, not the rework).
>>
>> I've reworked bits of this to get rid of the shadow requests, thanks
>> to Bart for the inspiration. The missing piece, for me, was the fact
>> that we have the tags->rqs[] indirection array already. I've done this
>> somewhat differently, though, by having the internal scheduler tag
>> map be allocated/torn down when an IO scheduler is attached or
>> detached. This also means that when we run without a scheduler, we
>> don't have to do double tag allocations, it'll work like before.
>>
>> The patchset applies on top of 4.10-rc3, or can be pulled here:
>>
>> git://git.kernel.dk/linux-block blk-mq-sched.6
>>
> Well ... something's wrong here on my machine:
> 
> [   39.886886] ------------[ cut here ]------------
> [   39.886895] WARNING: CPU: 9 PID: 62 at block/blk-mq.c:342
> __blk_mq_finish_request+0x124/0x140
> [   39.886895] Modules linked in: sd_mod ahci uhci_hcd ehci_pci
> mpt3sas(+) libahci ehci_hcd serio_raw crc32c_intel raid_class drm libata
> usbcore hpsa usb_common scsi_transport_sas sg dm_multipath dm_mod
> scsi_dh_rdac scsi_dh_emc scsi_dh_alua autofs4
> [   39.886910] CPU: 9 PID: 62 Comm: kworker/u130:0 Not tainted
> 4.10.0-rc3+ #528
> [   39.886911] Hardware name: HP ProLiant ML350p Gen8, BIOS P72 09/08/2013
> [   39.886917] Workqueue: events_unbound async_run_entry_fn
> [   39.886918] Call Trace:
> [   39.886923]  dump_stack+0x85/0xc9
> [   39.886927]  __warn+0xd1/0xf0
> [   39.886928]  warn_slowpath_null+0x1d/0x20
> [   39.886930]  __blk_mq_finish_request+0x124/0x140
> [   39.886932]  blk_mq_finish_request+0x55/0x60
> [   39.886934]  blk_mq_sched_put_request+0x78/0x80
> [   39.886936]  blk_mq_free_request+0xe/0x10
> [   39.886938]  blk_put_request+0x25/0x60
> [   39.886944]  __scsi_execute.isra.24+0x104/0x160
> [   39.886946]  scsi_execute_req_flags+0x94/0x100
> [   39.886948]  scsi_report_opcode+0xab/0x100
> 
> checking ...
> 
Ah.
Seems like the elevator switch races with device setup:

[ 1188.490326] ------------[ cut here ]------------
[ 1188.490334] WARNING: CPU: 9 PID: 30155 at block/blk-mq.c:342
__blk_mq_finish_request+0x172/0x180
[ 1188.490335] Modules linked in: mpt3sas(+) raid_class rpcsec_gss_krb5
auth_rpcgss nfsv4 nfs lockd grace fscache ebtable_filter ebtables
ip6table_filter ip6_tables iptable_filter ip_tables x_tables af_packet
br_netfilter bridge stp llc iscsi_ibft iscsi_boot_sysfs sb_edac edac_core
x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm irqbypass
crct10dif_pclmul crc32_pclmul tg3 ixgbe ghash_clmulni_intel pcbc ptp
aesni_intel pps_core aes_x86_64 ipmi_ssif hpilo hpwdt mdio libphy
pcc_cpufreq crypto_simd glue_helper iTCO_wdt iTCO_vendor_support
acpi_cpufreq tpm_tis ipmi_si ipmi_devintf cryptd lpc_ich pcspkr ioatdma
tpm_tis_core thermal wmi shpchp dca ipmi_msghandler tpm fjes button
sunrpc btrfs xor sr_mod raid6_pq cdrom ehci_pci mgag200 i2c_algo_bit
drm_kms_helper syscopyarea sysfillrect uhci_hcd
[ 1188.490399]  sysimgblt fb_sys_fops sd_mod ahci ehci_hcd ttm libahci
crc32c_intel serio_raw drm libata usbcore usb_common hpsa
scsi_transport_sas sg dm_multipath dm_mod scsi_dh_rdac scsi_dh_emc
scsi_dh_alua autofs4
[ 1188.490411] CPU: 9 PID: 30155 Comm: kworker/u130:6 Not tainted
4.10.0-rc3+ #535
[ 1188.490411] Hardware name: HP ProLiant ML350p Gen8, BIOS P72 09/08/2013
[ 1188.490425] Workqueue: events_unbound async_run_entry_fn
[ 1188.490427] Call Trace:
[ 1188.490433]  dump_stack+0x85/0xc9
[ 1188.490436]  __warn+0xd1/0xf0
[ 1188.490438]  warn_slowpath_null+0x1d/0x20
[ 1188.490440]  __blk_mq_finish_request+0x172/0x180
[ 1188.490442]  blk_mq_finish_request+0x55/0x60
[ 1188.490443]  blk_mq_sched_put_request+0x78/0x80
[ 1188.490445]  blk_mq_free_request+0xe/0x10
[ 1188.490448]  blk_put_request+0x25/0x60
[ 1188.490453]  __scsi_execute.isra.24+0x104/0x160
[ 1188.490455]  scsi_execute_req_flags+0x94/0x100
[ 1188.490457]  scsi_report_opcode+0xab/0x100
[ 1188.490461]  sd_revalidate_disk+0xaef/0x1450 [sd_mod]
[ 1188.490464]  sd_probe_async+0xd1/0x1d0 [sd_mod]
[ 1188.490466]  async_run_entry_fn+0x37/0x150
[ 1188.490470]  process_one_work+0x1d0/0x660
[ 1188.490472]  ? process_one_work+0x151/0x660
[ 1188.490474]  worker_thread+0x12b/0x4a0
[ 1188.490475]  kthread+0x10c/0x140
[ 1188.490477]  ? process_one_work+0x660/0x660
[ 1188.490478]  ? kthread_create_on_node+0x40/0x40
[ 1188.490483]  ret_from_fork+0x2a/0x40
[ 1188.490484] ---[ end trace d5e3a32ac269fc2a ]---
[ 1188.490485] rq (487/52) rqs (-1/-1)
[ 1188.523518] sd 7:0:0:0: [sdb] Attached SCSI disk
[ 1188.540954] elevator: switch to deadline failed

(The 'rqs' line is debug output from me:

			struct request *rqs_rq =
				hctx->tags->rqs[rq->tag];

			printk(KERN_WARNING "rq (%d/%d) rqs (%d/%d)\n",
			       rq->tag, sched_tag,
			       rqs_rq ? rqs_rq->tag : -1,
			       rqs_rq ? rqs_rq->internal_tag : -1);

telling us that the request with tag 487 is _not_ at position
487 in the rqs array; rather, that position is empty.)
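
Spelled out, the invariant that debug output is probing is roughly the
following (a sketch only, with a hypothetical helper name, not code from
the patchset):

	/*
	 * Illustration only: once a driver tag has been assigned,
	 * hctx->tags->rqs[rq->tag] is expected to point back at the
	 * request being finished.
	 */
	static bool example_tag_mapping_consistent(struct blk_mq_hw_ctx *hctx,
						   struct request *rq)
	{
		if (rq->tag < 0)
			return true;	/* no driver tag assigned yet */
		return hctx->tags->rqs[rq->tag] == rq;
	}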

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		   Teamlead Storage & Networking
hare@suse.de			               +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 08/10] blk-mq-sched: add framework for MQ capable IO schedulers
  2017-01-11 21:40 ` [PATCH 08/10] blk-mq-sched: add framework for MQ capable IO schedulers Jens Axboe
  2017-01-12 21:45   ` Bart Van Assche
@ 2017-01-13 11:15   ` Hannes Reinecke
  2017-01-13 16:39     ` Bart Van Assche
  2017-01-13 16:41     ` Omar Sandoval
  1 sibling, 2 replies; 62+ messages in thread
From: Hannes Reinecke @ 2017-01-13 11:15 UTC (permalink / raw)
  To: Jens Axboe, linux-kernel, linux-block; +Cc: osandov, bart.vanassche

On 01/11/2017 10:40 PM, Jens Axboe wrote:
> This adds a set of hooks that intercepts the blk-mq path of
> allocating/inserting/issuing/completing requests, allowing
> us to develop a scheduler within that framework.
> 
> We reuse the existing elevator scheduler API on the registration
> side, but augment that with the scheduler flagging support for
> the blk-mq interface, and with a separate set of ops hooks for MQ
> devices.
> 
> We split driver and scheduler tags, so we can run the scheduling
> independent of device queue depth.
> 
> Signed-off-by: Jens Axboe <axboe@fb.com>
[ .. ]
> @@ -823,6 +847,35 @@ static inline unsigned int queued_to_index(unsigned int queued)
>  	return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
>  }
>  
> +static bool blk_mq_get_driver_tag(struct request *rq,
> +				  struct blk_mq_hw_ctx **hctx, bool wait)
> +{
> +	struct blk_mq_alloc_data data = {
> +		.q = rq->q,
> +		.ctx = rq->mq_ctx,
> +		.hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu),
> +		.flags = wait ? 0 : BLK_MQ_REQ_NOWAIT,
> +	};
> +
> +	if (blk_mq_hctx_stopped(data.hctx))
> +		return false;
> +
> +	if (rq->tag != -1) {
> +done:
> +		if (hctx)
> +			*hctx = data.hctx;
> +		return true;
> +	}
> +
> +	rq->tag = blk_mq_get_tag(&data);
> +	if (rq->tag >= 0) {
> +		data.hctx->tags->rqs[rq->tag] = rq;
> +		goto done;
> +	}
> +
> +	return false;
> +}
> +
What happens with the existing request at 'rqs[rq->tag]' ?
Surely there is one already, right?
Things like '->init_request' assume a fully populated array, so moving
one entry to another location is ... interesting.

I would have thought we need to do a request cloning here,
otherwise this would introduce a memory leak, right?
(Not to mention a potential double completion, as the request is now at
two positions in the array)

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		   Teamlead Storage & Networking
hare@suse.de			               +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCHSET v6] blk-mq scheduling framework
  2017-01-13 11:04   ` Hannes Reinecke
@ 2017-01-13 12:10     ` Hannes Reinecke
  2017-01-13 15:05       ` Jens Axboe
  2017-01-13 15:03     ` Jens Axboe
  2017-01-13 15:23     ` Jens Axboe
  2 siblings, 1 reply; 62+ messages in thread
From: Hannes Reinecke @ 2017-01-13 12:10 UTC (permalink / raw)
  To: Jens Axboe, linux-kernel, linux-block; +Cc: osandov, bart.vanassche

On 01/13/2017 12:04 PM, Hannes Reinecke wrote:
> On 01/13/2017 09:15 AM, Hannes Reinecke wrote:
>> On 01/11/2017 10:39 PM, Jens Axboe wrote:
>>> Another year, another posting of this patchset. The previous posting
>>> was here:
>>>
>>> https://www.spinics.net/lists/kernel/msg2406106.html
>>>
>>> (yes, I've skipped v5, it was fixes on top of v4, not the rework).
>>>
>>> I've reworked bits of this to get rid of the shadow requests, thanks
>>> to Bart for the inspiration. The missing piece, for me, was the fact
>>> that we have the tags->rqs[] indirection array already. I've done this
>>> somewhat differently, though, by having the internal scheduler tag
>>> map be allocated/torn down when an IO scheduler is attached or
>>> detached. This also means that when we run without a scheduler, we
>>> don't have to do double tag allocations, it'll work like before.
>>>
>>> The patchset applies on top of 4.10-rc3, or can be pulled here:
>>>
>>> git://git.kernel.dk/linux-block blk-mq-sched.6
>>>
>> Well ... something's wrong here on my machine:
>>
[ .. ]

Turns out that selecting CONFIG_DEFAULT_MQ_DEADLINE is the culprit;
switching to CONFIG_DEFAULT_MQ_NONE and selecting mq-deadline after
booting manually makes the problem go away.

So there is a race condition during device init and switching the I/O
scheduler.

But the results from using mq-deadline are promising; the performance
drop I've seen on older hardware seems to be resolved:

mq iosched:
 seq read : io=13383MB, bw=228349KB/s, iops=57087
 rand read : io=12876MB, bw=219709KB/s, iops=54927
 seq write: io=14532MB, bw=247987KB/s, iops=61996
 rand write: io=13779MB, bw=235127KB/s, iops=58781
mq default:
 seq read : io=13056MB, bw=222588KB/s, iops=55647
 rand read : io=12908MB, bw=220069KB/s, iops=55017
 seq write: io=13986MB, bw=238444KB/s, iops=59611
 rand write: io=13733MB, bw=234128KB/s, iops=58532
sq default:
 seq read : io=10240MB, bw=194787KB/s, iops=48696
 rand read : io=10240MB, bw=191374KB/s, iops=47843
 seq write: io=10240MB, bw=245333KB/s, iops=61333
 rand write: io=10240MB, bw=228239KB/s, iops=57059

measured on mpt2sas with SSD devices.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		   Teamlead Storage & Networking
hare@suse.de			               +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 01/10] block: move existing elevator ops to union
  2017-01-13  8:34   ` Christoph Hellwig
@ 2017-01-13 15:00     ` Jens Axboe
  0 siblings, 0 replies; 62+ messages in thread
From: Jens Axboe @ 2017-01-13 15:00 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-kernel, linux-block, osandov, bart.vanassche

On Fri, Jan 13 2017, Christoph Hellwig wrote:
> On Wed, Jan 11, 2017 at 02:39:54PM -0700, Jens Axboe wrote:
> > Prep patch for adding MQ ops as well, since doing anon unions with
> > named initializers doesn't work on older compilers.
> 
> My gut feeling tells me MQ schedulers shouldn't share elevator_type.
> I'll play around a bit with the code later to see if my gut was right.

It's crazy to re-invent a new framework just for MQ schedulers, imho. So
you're going to have to come up with something better than a gut
feeling.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 05/10] blk-mq: export some helpers we need to the scheduling framework
  2017-01-13  8:17   ` Christoph Hellwig
@ 2017-01-13 15:01     ` Jens Axboe
  0 siblings, 0 replies; 62+ messages in thread
From: Jens Axboe @ 2017-01-13 15:01 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-kernel, linux-block, osandov, bart.vanassche

On Fri, Jan 13 2017, Christoph Hellwig wrote:
> Do we really need to make any of the blk-mq scheduling modular?

I think so - I want to be able to turn it on and off. And since we
already have a framework for this (see reply in other patch), I don't
see a reason NOT to do this.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCHSET v6] blk-mq scheduling framework
  2017-01-13 11:04   ` Hannes Reinecke
  2017-01-13 12:10     ` Hannes Reinecke
@ 2017-01-13 15:03     ` Jens Axboe
  2017-01-13 15:23     ` Jens Axboe
  2 siblings, 0 replies; 62+ messages in thread
From: Jens Axboe @ 2017-01-13 15:03 UTC (permalink / raw)
  To: Hannes Reinecke; +Cc: linux-kernel, linux-block, osandov, bart.vanassche

On Fri, Jan 13 2017, Hannes Reinecke wrote:
> On 01/13/2017 09:15 AM, Hannes Reinecke wrote:
> > On 01/11/2017 10:39 PM, Jens Axboe wrote:
> >> Another year, another posting of this patchset. The previous posting
> >> was here:
> >>
> >> https://www.spinics.net/lists/kernel/msg2406106.html
> >>
> >> (yes, I've skipped v5, it was fixes on top of v4, not the rework).
> >>
> >> I've reworked bits of this to get rid of the shadow requests, thanks
> >> to Bart for the inspiration. The missing piece, for me, was the fact
> >> that we have the tags->rqs[] indirection array already. I've done this
> >> somewhat differently, though, by having the internal scheduler tag
> >> map be allocated/torn down when an IO scheduler is attached or
> >> detached. This also means that when we run without a scheduler, we
> >> don't have to do double tag allocations, it'll work like before.
> >>
> >> The patchset applies on top of 4.10-rc3, or can be pulled here:
> >>
> >> git://git.kernel.dk/linux-block blk-mq-sched.6
> >>
> > Well ... something's wrong here on my machine:
> > 
> > [   39.886886] ------------[ cut here ]------------
> > [   39.886895] WARNING: CPU: 9 PID: 62 at block/blk-mq.c:342
> > __blk_mq_finish_request+0x124/0x140
> > [   39.886895] Modules linked in: sd_mod ahci uhci_hcd ehci_pci
> > mpt3sas(+) libahci ehci_hcd serio_raw crc32c_intel raid_class drm libata
> > usbcore hpsa usb_common scsi_transport_sas sg dm_multipath dm_mod
> > scsi_dh_rdac scsi_dh_emc scsi_dh_alua autofs4
> > [   39.886910] CPU: 9 PID: 62 Comm: kworker/u130:0 Not tainted
> > 4.10.0-rc3+ #528
> > [   39.886911] Hardware name: HP ProLiant ML350p Gen8, BIOS P72 09/08/2013
> > [   39.886917] Workqueue: events_unbound async_run_entry_fn
> > [   39.886918] Call Trace:
> > [   39.886923]  dump_stack+0x85/0xc9
> > [   39.886927]  __warn+0xd1/0xf0
> > [   39.886928]  warn_slowpath_null+0x1d/0x20
> > [   39.886930]  __blk_mq_finish_request+0x124/0x140
> > [   39.886932]  blk_mq_finish_request+0x55/0x60
> > [   39.886934]  blk_mq_sched_put_request+0x78/0x80
> > [   39.886936]  blk_mq_free_request+0xe/0x10
> > [   39.886938]  blk_put_request+0x25/0x60
> > [   39.886944]  __scsi_execute.isra.24+0x104/0x160
> > [   39.886946]  scsi_execute_req_flags+0x94/0x100
> > [   39.886948]  scsi_report_opcode+0xab/0x100
> > 
> > checking ...
> > 
> Ah.
> Seems like the elevator switch races with device setup:

Huh, funky, haven't seen that. I'll see if I can reproduce it here. I
don't have SCAN_ASYNC turned on, on my test box.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCHSET v6] blk-mq scheduling framework
  2017-01-13 12:10     ` Hannes Reinecke
@ 2017-01-13 15:05       ` Jens Axboe
  0 siblings, 0 replies; 62+ messages in thread
From: Jens Axboe @ 2017-01-13 15:05 UTC (permalink / raw)
  To: Hannes Reinecke; +Cc: linux-kernel, linux-block, osandov, bart.vanassche

On Fri, Jan 13 2017, Hannes Reinecke wrote:
> On 01/13/2017 12:04 PM, Hannes Reinecke wrote:
> > On 01/13/2017 09:15 AM, Hannes Reinecke wrote:
> >> On 01/11/2017 10:39 PM, Jens Axboe wrote:
> >>> Another year, another posting of this patchset. The previous posting
> >>> was here:
> >>>
> >>> https://www.spinics.net/lists/kernel/msg2406106.html
> >>>
> >>> (yes, I've skipped v5, it was fixes on top of v4, not the rework).
> >>>
> >>> I've reworked bits of this to get rid of the shadow requests, thanks
> >>> to Bart for the inspiration. The missing piece, for me, was the fact
> >>> that we have the tags->rqs[] indirection array already. I've done this
> >>> somewhat differently, though, by having the internal scheduler tag
> >>> map be allocated/torn down when an IO scheduler is attached or
> >>> detached. This also means that when we run without a scheduler, we
> >>> don't have to do double tag allocations, it'll work like before.
> >>>
> >>> The patchset applies on top of 4.10-rc3, or can be pulled here:
> >>>
> >>> git://git.kernel.dk/linux-block blk-mq-sched.6
> >>>
> >> Well ... something's wrong here on my machine:
> >>
> [ .. ]
> 
> Turns out that selecting CONFIG_DEFAULT_MQ_DEADLINE is the culprit;
> switching to CONFIG_DEFAULT_MQ_NONE and selecting mq-deadline after
> booting manually makes the problem go away.
> 
> So there is a race condition during device init and switching the I/O
> scheduler.
> 
> But the results from using mq-deadline are promising; the performance
> drop I've seen on older hardware seems to be resolved:
> 
> mq iosched:
>  seq read : io=13383MB, bw=228349KB/s, iops=57087
>  rand read : io=12876MB, bw=219709KB/s, iops=54927
>  seq write: io=14532MB, bw=247987KB/s, iops=61996
>  rand write: io=13779MB, bw=235127KB/s, iops=58781
> mq default:
>  seq read : io=13056MB, bw=222588KB/s, iops=55647
>  rand read : io=12908MB, bw=220069KB/s, iops=55017
>  seq write: io=13986MB, bw=238444KB/s, iops=59611
>  rand write: io=13733MB, bw=234128KB/s, iops=58532
> sq default:
>  seq read : io=10240MB, bw=194787KB/s, iops=48696
>  rand read : io=10240MB, bw=191374KB/s, iops=47843
>  seq write: io=10240MB, bw=245333KB/s, iops=61333
>  rand write: io=10240MB, bw=228239KB/s, iops=57059
> 
> measured on mpt2sas with SSD devices.

Perfect! Straight on the path to killing off non-scsi-mq, then.

I'll fix up the async scan issue. The new mq schedulers don't really
behave differently in this regard, so I'm a bit puzzled. Hopefully it
reproduces here.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 06/10] blk-mq-tag: cleanup the normal/reserved tag allocation
  2017-01-13  8:30   ` Christoph Hellwig
@ 2017-01-13 15:06     ` Jens Axboe
  0 siblings, 0 replies; 62+ messages in thread
From: Jens Axboe @ 2017-01-13 15:06 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-kernel, linux-block, osandov, bart.vanassche

On Fri, Jan 13 2017, Christoph Hellwig wrote:
> On Wed, Jan 11, 2017 at 02:39:59PM -0700, Jens Axboe wrote:
> > This is in preparation for having another tag set available. Cleanup
> the parameters, and allow passing in of tags for blk_mq_put_tag().
> 
> I've been playing around with this are a bit but never submitted
> anything.  Below is an untested merge of my previous bits with your
> patch, let me know what you think about it:

Looks good to me. I'll test it and merge it with the other patch, if
successful.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCHSET v6] blk-mq scheduling framework
  2017-01-13 11:04   ` Hannes Reinecke
  2017-01-13 12:10     ` Hannes Reinecke
  2017-01-13 15:03     ` Jens Axboe
@ 2017-01-13 15:23     ` Jens Axboe
  2017-01-13 15:33       ` Hannes Reinecke
  2 siblings, 1 reply; 62+ messages in thread
From: Jens Axboe @ 2017-01-13 15:23 UTC (permalink / raw)
  To: Hannes Reinecke, linux-kernel, linux-block; +Cc: osandov, bart.vanassche

On 01/13/2017 04:04 AM, Hannes Reinecke wrote:
> On 01/13/2017 09:15 AM, Hannes Reinecke wrote:
>> On 01/11/2017 10:39 PM, Jens Axboe wrote:
>>> Another year, another posting of this patchset. The previous posting
>>> was here:
>>>
>>> https://www.spinics.net/lists/kernel/msg2406106.html
>>>
>>> (yes, I've skipped v5, it was fixes on top of v4, not the rework).
>>>
>>> I've reworked bits of this to get rid of the shadow requests, thanks
>>> to Bart for the inspiration. The missing piece, for me, was the fact
>>> that we have the tags->rqs[] indirection array already. I've done this
>>> somewhat differently, though, by having the internal scheduler tag
>>> map be allocated/torn down when an IO scheduler is attached or
>>> detached. This also means that when we run without a scheduler, we
>>> don't have to do double tag allocations, it'll work like before.
>>>
>>> The patchset applies on top of 4.10-rc3, or can be pulled here:
>>>
>>> git://git.kernel.dk/linux-block blk-mq-sched.6
>>>
>> Well ... something's wrong here on my machine:
>>
>> [   39.886886] ------------[ cut here ]------------
>> [   39.886895] WARNING: CPU: 9 PID: 62 at block/blk-mq.c:342
>> __blk_mq_finish_request+0x124/0x140
>> [   39.886895] Modules linked in: sd_mod ahci uhci_hcd ehci_pci
>> mpt3sas(+) libahci ehci_hcd serio_raw crc32c_intel raid_class drm libata
>> usbcore hpsa usb_common scsi_transport_sas sg dm_multipath dm_mod
>> scsi_dh_rdac scsi_dh_emc scsi_dh_alua autofs4
>> [   39.886910] CPU: 9 PID: 62 Comm: kworker/u130:0 Not tainted
>> 4.10.0-rc3+ #528
>> [   39.886911] Hardware name: HP ProLiant ML350p Gen8, BIOS P72 09/08/2013
>> [   39.886917] Workqueue: events_unbound async_run_entry_fn
>> [   39.886918] Call Trace:
>> [   39.886923]  dump_stack+0x85/0xc9
>> [   39.886927]  __warn+0xd1/0xf0
>> [   39.886928]  warn_slowpath_null+0x1d/0x20
>> [   39.886930]  __blk_mq_finish_request+0x124/0x140
>> [   39.886932]  blk_mq_finish_request+0x55/0x60
>> [   39.886934]  blk_mq_sched_put_request+0x78/0x80
>> [   39.886936]  blk_mq_free_request+0xe/0x10
>> [   39.886938]  blk_put_request+0x25/0x60
>> [   39.886944]  __scsi_execute.isra.24+0x104/0x160
>> [   39.886946]  scsi_execute_req_flags+0x94/0x100
>> [   39.886948]  scsi_report_opcode+0xab/0x100
>>
>> checking ...
>>
> Ah.
> Seems like the elevator switch races with device setup:
> 
> [ 1188.490326] ------------[ cut here ]------------
> [ 1188.490334] WARNING: CPU: 9 PID: 30155 at block/blk-mq.c:342
> __blk_mq_finish_request+0x172/0x180
> [ 1188.490335] Modules linked in: mpt3sas(+) raid_class rpcsec_gss_krb5
> auth_rpcgss nfsv4 nfs lockd grace fscache ebtable_filt
> er ebtables ip6table_filter ip6_tables iptable_filter ip_tables x_tables
> af_packet br_netfilter bridge stp llc iscsi_ibft iscs
> i_boot_sysfs sb_edac edac_core x86_pkg_temp_thermal intel_powerclamp
> coretemp kvm_intel kvm irqbypass crct10dif_pclmul crc32_p
> clmul tg3 ixgbe ghash_clmulni_intel pcbc ptp aesni_intel pps_core
> aes_x86_64 ipmi_ssif hpilo hpwdt mdio libphy pcc_cpufreq cry
> pto_simd glue_helper iTCO_wdt iTCO_vendor_support acpi_cpufreq tpm_tis
> ipmi_si ipmi_devintf cryptd lpc_ich pcspkr ioatdma tpm_
> tis_core thermal wmi shpchp dca ipmi_msghandler tpm fjes button sunrpc
> btrfs xor sr_mod raid6_pq cdrom ehci_pci mgag200 i2c_al
> go_bit drm_kms_helper syscopyarea sysfillrect uhci_hcd
> [ 1188.490399]  sysimgblt fb_sys_fops sd_mod ahci ehci_hcd ttm libahci
> crc32c_intel serio_raw drm libata usbcore usb_common hp
> sa scsi_transport_sas sg dm_multipath dm_mod scsi_dh_rdac scsi_dh_emc
> scsi_dh_alua autofs4
> [ 1188.490411] CPU: 9 PID: 30155 Comm: kworker/u130:6 Not tainted
> 4.10.0-rc3+ #535
> [ 1188.490411] Hardware name: HP ProLiant ML350p Gen8, BIOS P72 09/08/2013
> [ 1188.490425] Workqueue: events_unbound async_run_entry_fn
> [ 1188.490427] Call Trace:
> [ 1188.490433]  dump_stack+0x85/0xc9
> [ 1188.490436]  __warn+0xd1/0xf0
> [ 1188.490438]  warn_slowpath_null+0x1d/0x20
> [ 1188.490440]  __blk_mq_finish_request+0x172/0x180
> [ 1188.490442]  blk_mq_finish_request+0x55/0x60
> [ 1188.490443]  blk_mq_sched_put_request+0x78/0x80
> [ 1188.490445]  blk_mq_free_request+0xe/0x10
> [ 1188.490448]  blk_put_request+0x25/0x60
> [ 1188.490453]  __scsi_execute.isra.24+0x104/0x160
> [ 1188.490455]  scsi_execute_req_flags+0x94/0x100
> [ 1188.490457]  scsi_report_opcode+0xab/0x100
> [ 1188.490461]  sd_revalidate_disk+0xaef/0x1450 [sd_mod]
> [ 1188.490464]  sd_probe_async+0xd1/0x1d0 [sd_mod]
> [ 1188.490466]  async_run_entry_fn+0x37/0x150
> [ 1188.490470]  process_one_work+0x1d0/0x660
> [ 1188.490472]  ? process_one_work+0x151/0x660
> [ 1188.490474]  worker_thread+0x12b/0x4a0
> [ 1188.490475]  kthread+0x10c/0x140
> [ 1188.490477]  ? process_one_work+0x660/0x660
> [ 1188.490478]  ? kthread_create_on_node+0x40/0x40
> [ 1188.490483]  ret_from_fork+0x2a/0x40
> [ 1188.490484] ---[ end trace d5e3a32ac269fc2a ]---
> [ 1188.490485] rq (487/52) rqs (-1/-1)
> [ 1188.523518] sd 7:0:0:0: [sdb] Attached SCSI disk
> [ 1188.540954] elevator: switch to deadline failed

Where did this last line come from? Do you have a udev rule or something
that attempts to switch to deadline, which will fail with scsi-mq?

Trying to get closer to reproducing this, no luck just setting the async
scan option, mine still boots fine.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCHSET v6] blk-mq scheduling framework
  2017-01-13 15:23     ` Jens Axboe
@ 2017-01-13 15:33       ` Hannes Reinecke
  2017-01-13 15:34         ` Jens Axboe
  0 siblings, 1 reply; 62+ messages in thread
From: Hannes Reinecke @ 2017-01-13 15:33 UTC (permalink / raw)
  To: Jens Axboe, linux-kernel, linux-block; +Cc: osandov, bart.vanassche

On 01/13/2017 04:23 PM, Jens Axboe wrote:
> On 01/13/2017 04:04 AM, Hannes Reinecke wrote:
>> On 01/13/2017 09:15 AM, Hannes Reinecke wrote:
>>> On 01/11/2017 10:39 PM, Jens Axboe wrote:
>>>> Another year, another posting of this patchset. The previous posting
>>>> was here:
>>>>
>>>> https://www.spinics.net/lists/kernel/msg2406106.html
>>>>
>>>> (yes, I've skipped v5, it was fixes on top of v4, not the rework).
>>>>
>>>> I've reworked bits of this to get rid of the shadow requests, thanks
>>>> to Bart for the inspiration. The missing piece, for me, was the fact
>>>> that we have the tags->rqs[] indirection array already. I've done this
>>>> somewhat differently, though, by having the internal scheduler tag
>>>> map be allocated/torn down when an IO scheduler is attached or
>>>> detached. This also means that when we run without a scheduler, we
>>>> don't have to do double tag allocations, it'll work like before.
>>>>
>>>> The patchset applies on top of 4.10-rc3, or can be pulled here:
>>>>
>>>> git://git.kernel.dk/linux-block blk-mq-sched.6
>>>>
>>> Well ... something's wrong here on my machine:
>>>
>>> [   39.886886] ------------[ cut here ]------------
>>> [   39.886895] WARNING: CPU: 9 PID: 62 at block/blk-mq.c:342
>>> __blk_mq_finish_request+0x124/0x140
>>> [   39.886895] Modules linked in: sd_mod ahci uhci_hcd ehci_pci
>>> mpt3sas(+) libahci ehci_hcd serio_raw crc32c_intel raid_class drm libata
>>> usbcore hpsa usb_common scsi_transport_sas sg dm_multipath dm_mod
>>> scsi_dh_rdac scsi_dh_emc scsi_dh_alua autofs4
>>> [   39.886910] CPU: 9 PID: 62 Comm: kworker/u130:0 Not tainted
>>> 4.10.0-rc3+ #528
>>> [   39.886911] Hardware name: HP ProLiant ML350p Gen8, BIOS P72 09/08/2013
>>> [   39.886917] Workqueue: events_unbound async_run_entry_fn
>>> [   39.886918] Call Trace:
>>> [   39.886923]  dump_stack+0x85/0xc9
>>> [   39.886927]  __warn+0xd1/0xf0
>>> [   39.886928]  warn_slowpath_null+0x1d/0x20
>>> [   39.886930]  __blk_mq_finish_request+0x124/0x140
>>> [   39.886932]  blk_mq_finish_request+0x55/0x60
>>> [   39.886934]  blk_mq_sched_put_request+0x78/0x80
>>> [   39.886936]  blk_mq_free_request+0xe/0x10
>>> [   39.886938]  blk_put_request+0x25/0x60
>>> [   39.886944]  __scsi_execute.isra.24+0x104/0x160
>>> [   39.886946]  scsi_execute_req_flags+0x94/0x100
>>> [   39.886948]  scsi_report_opcode+0xab/0x100
>>>
>>> checking ...
>>>
>> Ah.
>> Seems like the elevator switch races with device setup:
>>
>> [ 1188.490326] ------------[ cut here ]------------
>> [ 1188.490334] WARNING: CPU: 9 PID: 30155 at block/blk-mq.c:342
>> __blk_mq_finish_request+0x172/0x180
>> [ 1188.490335] Modules linked in: mpt3sas(+) raid_class rpcsec_gss_krb5
>> auth_rpcgss nfsv4 nfs lockd grace fscache ebtable_filt
>> er ebtables ip6table_filter ip6_tables iptable_filter ip_tables x_tables
>> af_packet br_netfilter bridge stp llc iscsi_ibft iscs
>> i_boot_sysfs sb_edac edac_core x86_pkg_temp_thermal intel_powerclamp
>> coretemp kvm_intel kvm irqbypass crct10dif_pclmul crc32_p
>> clmul tg3 ixgbe ghash_clmulni_intel pcbc ptp aesni_intel pps_core
>> aes_x86_64 ipmi_ssif hpilo hpwdt mdio libphy pcc_cpufreq cry
>> pto_simd glue_helper iTCO_wdt iTCO_vendor_support acpi_cpufreq tpm_tis
>> ipmi_si ipmi_devintf cryptd lpc_ich pcspkr ioatdma tpm_
>> tis_core thermal wmi shpchp dca ipmi_msghandler tpm fjes button sunrpc
>> btrfs xor sr_mod raid6_pq cdrom ehci_pci mgag200 i2c_al
>> go_bit drm_kms_helper syscopyarea sysfillrect uhci_hcd
>> [ 1188.490399]  sysimgblt fb_sys_fops sd_mod ahci ehci_hcd ttm libahci
>> crc32c_intel serio_raw drm libata usbcore usb_common hp
>> sa scsi_transport_sas sg dm_multipath dm_mod scsi_dh_rdac scsi_dh_emc
>> scsi_dh_alua autofs4
>> [ 1188.490411] CPU: 9 PID: 30155 Comm: kworker/u130:6 Not tainted
>> 4.10.0-rc3+ #535
>> [ 1188.490411] Hardware name: HP ProLiant ML350p Gen8, BIOS P72 09/08/2013
>> [ 1188.490425] Workqueue: events_unbound async_run_entry_fn
>> [ 1188.490427] Call Trace:
>> [ 1188.490433]  dump_stack+0x85/0xc9
>> [ 1188.490436]  __warn+0xd1/0xf0
>> [ 1188.490438]  warn_slowpath_null+0x1d/0x20
>> [ 1188.490440]  __blk_mq_finish_request+0x172/0x180
>> [ 1188.490442]  blk_mq_finish_request+0x55/0x60
>> [ 1188.490443]  blk_mq_sched_put_request+0x78/0x80
>> [ 1188.490445]  blk_mq_free_request+0xe/0x10
>> [ 1188.490448]  blk_put_request+0x25/0x60
>> [ 1188.490453]  __scsi_execute.isra.24+0x104/0x160
>> [ 1188.490455]  scsi_execute_req_flags+0x94/0x100
>> [ 1188.490457]  scsi_report_opcode+0xab/0x100
>> [ 1188.490461]  sd_revalidate_disk+0xaef/0x1450 [sd_mod]
>> [ 1188.490464]  sd_probe_async+0xd1/0x1d0 [sd_mod]
>> [ 1188.490466]  async_run_entry_fn+0x37/0x150
>> [ 1188.490470]  process_one_work+0x1d0/0x660
>> [ 1188.490472]  ? process_one_work+0x151/0x660
>> [ 1188.490474]  worker_thread+0x12b/0x4a0
>> [ 1188.490475]  kthread+0x10c/0x140
>> [ 1188.490477]  ? process_one_work+0x660/0x660
>> [ 1188.490478]  ? kthread_create_on_node+0x40/0x40
>> [ 1188.490483]  ret_from_fork+0x2a/0x40
>> [ 1188.490484] ---[ end trace d5e3a32ac269fc2a ]---
>> [ 1188.490485] rq (487/52) rqs (-1/-1)
>> [ 1188.523518] sd 7:0:0:0: [sdb] Attached SCSI disk
>> [ 1188.540954] elevator: switch to deadline failed
> 
> Where did this last line come from? Do you have a udev rule or something
> that attempts to switch to deadline, which will fail with scsi-mq?
> 
> Trying to get closer to reproducing this, no luck just setting the async
> scan option, mine still boots fine.
> 
Ah, indeed.
There is an ominous udev rule here, trying to switch to 'deadline'.

# cat 60-ssd-scheduler.rules
# do not edit this file, it will be overwritten on update

ACTION!="add", GOTO="ssd_scheduler_end"
SUBSYSTEM!="block", GOTO="ssd_scheduler_end"

IMPORT{cmdline}="elevator"
ENV{elevator}=="*?", GOTO="ssd_scheduler_end"

KERNEL=="sd*[!0-9]", ATTR{queue/rotational}=="0",
ATTR{queue/scheduler}="deadline"

LABEL="ssd_scheduler_end"

Still shouldn't crash the kernel, though ...

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		   Teamlead Storage & Networking
hare@suse.de			               +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCHSET v6] blk-mq scheduling framework
  2017-01-13 15:33       ` Hannes Reinecke
@ 2017-01-13 15:34         ` Jens Axboe
  2017-01-13 15:59           ` Hannes Reinecke
  0 siblings, 1 reply; 62+ messages in thread
From: Jens Axboe @ 2017-01-13 15:34 UTC (permalink / raw)
  To: Hannes Reinecke, linux-kernel, linux-block; +Cc: osandov, bart.vanassche

On 01/13/2017 08:33 AM, Hannes Reinecke wrote:
> On 01/13/2017 04:23 PM, Jens Axboe wrote:
>> On 01/13/2017 04:04 AM, Hannes Reinecke wrote:
>>> On 01/13/2017 09:15 AM, Hannes Reinecke wrote:
>>>> On 01/11/2017 10:39 PM, Jens Axboe wrote:
>>>>> Another year, another posting of this patchset. The previous posting
>>>>> was here:
>>>>>
>>>>> https://www.spinics.net/lists/kernel/msg2406106.html
>>>>>
>>>>> (yes, I've skipped v5, it was fixes on top of v4, not the rework).
>>>>>
>>>>> I've reworked bits of this to get rid of the shadow requests, thanks
>>>>> to Bart for the inspiration. The missing piece, for me, was the fact
>>>>> that we have the tags->rqs[] indirection array already. I've done this
>>>>> somewhat differently, though, by having the internal scheduler tag
>>>>> map be allocated/torn down when an IO scheduler is attached or
>>>>> detached. This also means that when we run without a scheduler, we
>>>>> don't have to do double tag allocations, it'll work like before.
>>>>>
>>>>> The patchset applies on top of 4.10-rc3, or can be pulled here:
>>>>>
>>>>> git://git.kernel.dk/linux-block blk-mq-sched.6
>>>>>
>>>> Well ... something's wrong here on my machine:
>>>>
>>>> [   39.886886] ------------[ cut here ]------------
>>>> [   39.886895] WARNING: CPU: 9 PID: 62 at block/blk-mq.c:342
>>>> __blk_mq_finish_request+0x124/0x140
>>>> [   39.886895] Modules linked in: sd_mod ahci uhci_hcd ehci_pci
>>>> mpt3sas(+) libahci ehci_hcd serio_raw crc32c_intel raid_class drm libata
>>>> usbcore hpsa usb_common scsi_transport_sas sg dm_multipath dm_mod
>>>> scsi_dh_rdac scsi_dh_emc scsi_dh_alua autofs4
>>>> [   39.886910] CPU: 9 PID: 62 Comm: kworker/u130:0 Not tainted
>>>> 4.10.0-rc3+ #528
>>>> [   39.886911] Hardware name: HP ProLiant ML350p Gen8, BIOS P72 09/08/2013
>>>> [   39.886917] Workqueue: events_unbound async_run_entry_fn
>>>> [   39.886918] Call Trace:
>>>> [   39.886923]  dump_stack+0x85/0xc9
>>>> [   39.886927]  __warn+0xd1/0xf0
>>>> [   39.886928]  warn_slowpath_null+0x1d/0x20
>>>> [   39.886930]  __blk_mq_finish_request+0x124/0x140
>>>> [   39.886932]  blk_mq_finish_request+0x55/0x60
>>>> [   39.886934]  blk_mq_sched_put_request+0x78/0x80
>>>> [   39.886936]  blk_mq_free_request+0xe/0x10
>>>> [   39.886938]  blk_put_request+0x25/0x60
>>>> [   39.886944]  __scsi_execute.isra.24+0x104/0x160
>>>> [   39.886946]  scsi_execute_req_flags+0x94/0x100
>>>> [   39.886948]  scsi_report_opcode+0xab/0x100
>>>>
>>>> checking ...
>>>>
>>> Ah.
>>> Seems like the elevator switch races with device setup:
>>>
>>> [ 1188.490326] ------------[ cut here ]------------
>>> [ 1188.490334] WARNING: CPU: 9 PID: 30155 at block/blk-mq.c:342
>>> __blk_mq_finish_request+0x172/0x180
>>> [ 1188.490335] Modules linked in: mpt3sas(+) raid_class rpcsec_gss_krb5
>>> auth_rpcgss nfsv4 nfs lockd grace fscache ebtable_filt
>>> er ebtables ip6table_filter ip6_tables iptable_filter ip_tables x_tables
>>> af_packet br_netfilter bridge stp llc iscsi_ibft iscs
>>> i_boot_sysfs sb_edac edac_core x86_pkg_temp_thermal intel_powerclamp
>>> coretemp kvm_intel kvm irqbypass crct10dif_pclmul crc32_p
>>> clmul tg3 ixgbe ghash_clmulni_intel pcbc ptp aesni_intel pps_core
>>> aes_x86_64 ipmi_ssif hpilo hpwdt mdio libphy pcc_cpufreq cry
>>> pto_simd glue_helper iTCO_wdt iTCO_vendor_support acpi_cpufreq tpm_tis
>>> ipmi_si ipmi_devintf cryptd lpc_ich pcspkr ioatdma tpm_
>>> tis_core thermal wmi shpchp dca ipmi_msghandler tpm fjes button sunrpc
>>> btrfs xor sr_mod raid6_pq cdrom ehci_pci mgag200 i2c_al
>>> go_bit drm_kms_helper syscopyarea sysfillrect uhci_hcd
>>> [ 1188.490399]  sysimgblt fb_sys_fops sd_mod ahci ehci_hcd ttm libahci
>>> crc32c_intel serio_raw drm libata usbcore usb_common hp
>>> sa scsi_transport_sas sg dm_multipath dm_mod scsi_dh_rdac scsi_dh_emc
>>> scsi_dh_alua autofs4
>>> [ 1188.490411] CPU: 9 PID: 30155 Comm: kworker/u130:6 Not tainted
>>> 4.10.0-rc3+ #535
>>> [ 1188.490411] Hardware name: HP ProLiant ML350p Gen8, BIOS P72 09/08/2013
>>> [ 1188.490425] Workqueue: events_unbound async_run_entry_fn
>>> [ 1188.490427] Call Trace:
>>> [ 1188.490433]  dump_stack+0x85/0xc9
>>> [ 1188.490436]  __warn+0xd1/0xf0
>>> [ 1188.490438]  warn_slowpath_null+0x1d/0x20
>>> [ 1188.490440]  __blk_mq_finish_request+0x172/0x180
>>> [ 1188.490442]  blk_mq_finish_request+0x55/0x60
>>> [ 1188.490443]  blk_mq_sched_put_request+0x78/0x80
>>> [ 1188.490445]  blk_mq_free_request+0xe/0x10
>>> [ 1188.490448]  blk_put_request+0x25/0x60
>>> [ 1188.490453]  __scsi_execute.isra.24+0x104/0x160
>>> [ 1188.490455]  scsi_execute_req_flags+0x94/0x100
>>> [ 1188.490457]  scsi_report_opcode+0xab/0x100
>>> [ 1188.490461]  sd_revalidate_disk+0xaef/0x1450 [sd_mod]
>>> [ 1188.490464]  sd_probe_async+0xd1/0x1d0 [sd_mod]
>>> [ 1188.490466]  async_run_entry_fn+0x37/0x150
>>> [ 1188.490470]  process_one_work+0x1d0/0x660
>>> [ 1188.490472]  ? process_one_work+0x151/0x660
>>> [ 1188.490474]  worker_thread+0x12b/0x4a0
>>> [ 1188.490475]  kthread+0x10c/0x140
>>> [ 1188.490477]  ? process_one_work+0x660/0x660
>>> [ 1188.490478]  ? kthread_create_on_node+0x40/0x40
>>> [ 1188.490483]  ret_from_fork+0x2a/0x40
>>> [ 1188.490484] ---[ end trace d5e3a32ac269fc2a ]---
>>> [ 1188.490485] rq (487/52) rqs (-1/-1)
>>> [ 1188.523518] sd 7:0:0:0: [sdb] Attached SCSI disk
>>> [ 1188.540954] elevator: switch to deadline failed
>>
>> Where did this last line come from? Do you have a udev rule or something
>> that attempts to switch to deadline, which will fail with scsi-mq?
>>
>> Trying to get closer to reproducing this, no luck just setting the async
>> scan option, mine still boots fine.
>>
> Ah, indeed.
> There is an ominous udev rule here, trying to switch to 'deadline'.
> 
> # cat 60-ssd-scheduler.rules
> # do not edit this file, it will be overwritten on update
> 
> ACTION!="add", GOTO="ssd_scheduler_end"
> SUBSYSTEM!="block", GOTO="ssd_scheduler_end"
> 
> IMPORT{cmdline}="elevator"
> ENV{elevator}=="*?", GOTO="ssd_scheduler_end"
> 
> KERNEL=="sd*[!0-9]", ATTR{queue/rotational}=="0",
> ATTR{queue/scheduler}="deadline"
> 
> LABEL="ssd_scheduler_end"
> 
> Still shouldn't crash the kernel, though ...

Of course not, and it's not a given that it does, it could just be
triggering after the device load and failing like expected. But just in
case, can you try and disable that rule and see if it still crashes with
MQ_DEADLINE set as the default?

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCHSET v6] blk-mq scheduling framework
  2017-01-13 15:34         ` Jens Axboe
@ 2017-01-13 15:59           ` Hannes Reinecke
  2017-01-13 16:00             ` Jens Axboe
  0 siblings, 1 reply; 62+ messages in thread
From: Hannes Reinecke @ 2017-01-13 15:59 UTC (permalink / raw)
  To: Jens Axboe, linux-kernel, linux-block; +Cc: osandov, bart.vanassche

On 01/13/2017 04:34 PM, Jens Axboe wrote:
> On 01/13/2017 08:33 AM, Hannes Reinecke wrote:
[ .. ]
>> Ah, indeed.
>> There is an ominous udev rule here, trying to switch to 'deadline'.
>>
>> # cat 60-ssd-scheduler.rules
>> # do not edit this file, it will be overwritten on update
>>
>> ACTION!="add", GOTO="ssd_scheduler_end"
>> SUBSYSTEM!="block", GOTO="ssd_scheduler_end"
>>
>> IMPORT{cmdline}="elevator"
>> ENV{elevator}=="*?", GOTO="ssd_scheduler_end"
>>
>> KERNEL=="sd*[!0-9]", ATTR{queue/rotational}=="0",
>> ATTR{queue/scheduler}="deadline"
>>
>> LABEL="ssd_scheduler_end"
>>
>> Still shouldn't crash the kernel, though ...
> 
> Of course not, and it's not a given that it does, it could just be
> triggering after the device load and failing like expected. But just in
> case, can you try and disable that rule and see if it still crashes with
> MQ_DEADLINE set as the default?
> 
Yes, it does.
Same stacktrace as before.

Cheers

Hannes
-- 
Dr. Hannes Reinecke		   Teamlead Storage & Networking
hare@suse.de			               +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCHSET v6] blk-mq scheduling framework
  2017-01-13 15:59           ` Hannes Reinecke
@ 2017-01-13 16:00             ` Jens Axboe
  2017-01-13 16:02               ` Jens Axboe
  0 siblings, 1 reply; 62+ messages in thread
From: Jens Axboe @ 2017-01-13 16:00 UTC (permalink / raw)
  To: Hannes Reinecke, linux-kernel, linux-block; +Cc: osandov, bart.vanassche

On 01/13/2017 08:59 AM, Hannes Reinecke wrote:
> On 01/13/2017 04:34 PM, Jens Axboe wrote:
>> On 01/13/2017 08:33 AM, Hannes Reinecke wrote:
> [ .. ]
>>> Ah, indeed.
>>> There is an ominous udev rule here, trying to switch to 'deadline'.
>>>
>>> # cat 60-ssd-scheduler.rules
>>> # do not edit this file, it will be overwritten on update
>>>
>>> ACTION!="add", GOTO="ssd_scheduler_end"
>>> SUBSYSTEM!="block", GOTO="ssd_scheduler_end"
>>>
>>> IMPORT{cmdline}="elevator"
>>> ENV{elevator}=="*?", GOTO="ssd_scheduler_end"
>>>
>>> KERNEL=="sd*[!0-9]", ATTR{queue/rotational}=="0",
>>> ATTR{queue/scheduler}="deadline"
>>>
>>> LABEL="ssd_scheduler_end"
>>>
>>> Still shouldn't crash the kernel, though ...
>>
>> Of course not, and it's not a given that it does, it could just be
>> triggering after the device load and failing like expected. But just in
>> case, can you try and disable that rule and see if it still crashes with
>> MQ_DEADLINE set as the default?
>>
> Yes, it does.
> Same stacktrace as before.

Alright, that's as expected. I've tried with your rule and making
everything modular, but it still boots fine for me. Very odd. Can you
send me your .config? And are all the SCSI disks hanging off ahci? Or
sdb specifically, is that ahci or something else?

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCHSET v6] blk-mq scheduling framework
  2017-01-13 16:00             ` Jens Axboe
@ 2017-01-13 16:02               ` Jens Axboe
  2017-01-13 21:45                 ` Jens Axboe
  2017-01-16  8:11                 ` Hannes Reinecke
  0 siblings, 2 replies; 62+ messages in thread
From: Jens Axboe @ 2017-01-13 16:02 UTC (permalink / raw)
  To: Hannes Reinecke, linux-kernel, linux-block; +Cc: osandov, bart.vanassche

On 01/13/2017 09:00 AM, Jens Axboe wrote:
> On 01/13/2017 08:59 AM, Hannes Reinecke wrote:
>> On 01/13/2017 04:34 PM, Jens Axboe wrote:
>>> On 01/13/2017 08:33 AM, Hannes Reinecke wrote:
>> [ .. ]
>>>> Ah, indeed.
>>>> There is an ominous udev rule here, trying to switch to 'deadline'.
>>>>
>>>> # cat 60-ssd-scheduler.rules
>>>> # do not edit this file, it will be overwritten on update
>>>>
>>>> ACTION!="add", GOTO="ssd_scheduler_end"
>>>> SUBSYSTEM!="block", GOTO="ssd_scheduler_end"
>>>>
>>>> IMPORT{cmdline}="elevator"
>>>> ENV{elevator}=="*?", GOTO="ssd_scheduler_end"
>>>>
>>>> KERNEL=="sd*[!0-9]", ATTR{queue/rotational}=="0",
>>>> ATTR{queue/scheduler}="deadline"
>>>>
>>>> LABEL="ssd_scheduler_end"
>>>>
>>>> Still shouldn't crash the kernel, though ...
>>>
>>> Of course not, and it's not a given that it does, it could just be
>>> triggering after the device load and failing like expected. But just in
>>> case, can you try and disable that rule and see if it still crashes with
>>> MQ_DEADLINE set as the default?
>>>
>> Yes, it does.
>> Same stacktrace as before.
> 
> Alright, that's as expected. I've tried with your rule and making
> everything modular, but it still boots fine for me. Very odd. Can you
> send me your .config? And are all the SCSI disks hanging off ahci? Or
> sdb specifically, is that ahci or something else?

Also, would be great if you could pull:

git://git.kernel.dk/linux-block blk-mq-sched

into current 'master' and see if it still reproduces. I expect that it
will, but just want to ensure that it's a problem in the current code
base as well.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 08/10] blk-mq-sched: add framework for MQ capable IO schedulers
  2017-01-13 11:15   ` Hannes Reinecke
@ 2017-01-13 16:39     ` Bart Van Assche
  2017-01-13 16:41     ` Omar Sandoval
  1 sibling, 0 replies; 62+ messages in thread
From: Bart Van Assche @ 2017-01-13 16:39 UTC (permalink / raw)
  To: linux-kernel, hare, linux-block, axboe; +Cc: osandov

On Fri, 2017-01-13 at 12:15 +0100, Hannes Reinecke wrote:
> On 01/11/2017 10:40 PM, Jens Axboe wrote:
> > This adds a set of hooks that intercepts the blk-mq path of
> > allocating/inserting/issuing/completing requests, allowing
> > us to develop a scheduler within that framework.
> > 
> > We reuse the existing elevator scheduler API on the registration
> > side, but augment that with the scheduler flagging support for
> > the blk-mq interface, and with a separate set of ops hooks for MQ
> > devices.
> > 
> > We split driver and scheduler tags, so we can run the scheduling
> > independent of device queue depth.
> > 
> > Signed-off-by: Jens Axboe <axboe@fb.com>
> 
> [ .. ]
> > @@ -823,6 +847,35 @@ static inline unsigned int queued_to_index(unsigned int queued)
> >  	return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
> >  }
> >  
> > +static bool blk_mq_get_driver_tag(struct request *rq,
> > +				  struct blk_mq_hw_ctx **hctx, bool wait)
> > +{
> > +	struct blk_mq_alloc_data data = {
> > +		.q = rq->q,
> > +		.ctx = rq->mq_ctx,
> > +		.hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu),
> > +		.flags = wait ? 0 : BLK_MQ_REQ_NOWAIT,
> > +	};
> > +
> > +	if (blk_mq_hctx_stopped(data.hctx))
> > +		return false;
> > +
> > +	if (rq->tag != -1) {
> > +done:
> > +		if (hctx)
> > +			*hctx = data.hctx;
> > +		return true;
> > +	}
> > +
> > +	rq->tag = blk_mq_get_tag(&data);
> > +	if (rq->tag >= 0) {
> > +		data.hctx->tags->rqs[rq->tag] = rq;
> > +		goto done;
> > +	}
> > +
> > +	return false;
> > +}
> > +
> 
> What happens with the existing request at 'rqs[rq->tag]' ?
> Surely there is one already, right?
> Things like '->init_request' assume a fully populated array, so moving
> one entry to another location is ... interesting.
> 
> I would have thought we need to do a request cloning here,
> otherwise this would introduce a memory leak, right?
> (Not to mention a potential double completion, as the request is now at
> two positions in the array)

Hello Hannes,

Have you noticed that there are two .rqs[] arrays - tags->rqs and
sched_tags->rqs[]? .init_request() loops over sched_tags->rqs[]. The
above assignment applies to tags->rqs[].

Bart.

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 08/10] blk-mq-sched: add framework for MQ capable IO schedulers
  2017-01-13 11:15   ` Hannes Reinecke
  2017-01-13 16:39     ` Bart Van Assche
@ 2017-01-13 16:41     ` Omar Sandoval
  2017-01-13 17:43       ` Hannes Reinecke
  1 sibling, 1 reply; 62+ messages in thread
From: Omar Sandoval @ 2017-01-13 16:41 UTC (permalink / raw)
  To: Hannes Reinecke; +Cc: Jens Axboe, linux-kernel, linux-block, bart.vanassche

On Fri, Jan 13, 2017 at 12:15:17PM +0100, Hannes Reinecke wrote:
> On 01/11/2017 10:40 PM, Jens Axboe wrote:
> > This adds a set of hooks that intercepts the blk-mq path of
> > allocating/inserting/issuing/completing requests, allowing
> > us to develop a scheduler within that framework.
> > 
> > We reuse the existing elevator scheduler API on the registration
> > side, but augment that with the scheduler flagging support for
> > the blk-mq interface, and with a separate set of ops hooks for MQ
> > devices.
> > 
> > We split driver and scheduler tags, so we can run the scheduling
> > independent of device queue depth.
> > 
> > Signed-off-by: Jens Axboe <axboe@fb.com>
> [ .. ]
> > @@ -823,6 +847,35 @@ static inline unsigned int queued_to_index(unsigned int queued)
> >  	return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
> >  }
> >  
> > +static bool blk_mq_get_driver_tag(struct request *rq,
> > +				  struct blk_mq_hw_ctx **hctx, bool wait)
> > +{
> > +	struct blk_mq_alloc_data data = {
> > +		.q = rq->q,
> > +		.ctx = rq->mq_ctx,
> > +		.hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu),
> > +		.flags = wait ? 0 : BLK_MQ_REQ_NOWAIT,
> > +	};
> > +
> > +	if (blk_mq_hctx_stopped(data.hctx))
> > +		return false;
> > +
> > +	if (rq->tag != -1) {
> > +done:
> > +		if (hctx)
> > +			*hctx = data.hctx;
> > +		return true;
> > +	}
> > +
> > +	rq->tag = blk_mq_get_tag(&data);
> > +	if (rq->tag >= 0) {
> > +		data.hctx->tags->rqs[rq->tag] = rq;
> > +		goto done;
> > +	}
> > +
> > +	return false;
> > +}
> > +
> What happens with the existing request at 'rqs[rq->tag]' ?
> Surely there is one already, right?
> Things like '->init_request' assume a fully populated array, so moving
> one entry to another location is ... interesting.
> 
> I would have thought we need to do a request cloning here,
> otherwise this would introduce a memory leak, right?
> (Not to mention a potential double completion, as the request is now at
> two positions in the array)
> 
> Cheers,
> 
> Hannes

The entries in tags->rqs aren't slab objects, they're pointers into
pages allocated separately and tracked on tags->page_list. See
blk_mq_alloc_rqs(). In blk_mq_free_rqs(), we free all of the pages on
tags->page_list, so there shouldn't be a memory leak.

As for hctx->tags->rqs, entries are only overwritten when a scheduler is
enabled. In that case, the rqs array is storing pointers to requests
actually from hctx->sched_tags, so overwriting/leaking isn't an issue.
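
To make that concrete, here is a rough sketch of the relationship between
the two tag sets (illustration only; the helper name is made up, the
fields are the ones discussed above):

	/*
	 * With a scheduler attached, requests are allocated from
	 * hctx->sched_tags and identified by rq->internal_tag. Once a
	 * driver tag is assigned, hctx->tags->rqs[rq->tag] merely maps
	 * that driver tag back to the scheduler-owned request; the
	 * request memory itself is freed via the sched_tags page_list,
	 * so overwriting a tags->rqs[] slot cannot leak anything.
	 */
	static struct request *example_rq_from_driver_tag(struct blk_mq_hw_ctx *hctx,
							  int driver_tag)
	{
		return hctx->tags->rqs[driver_tag];
	}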

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCH 08/10] blk-mq-sched: add framework for MQ capable IO schedulers
  2017-01-13 16:41     ` Omar Sandoval
@ 2017-01-13 17:43       ` Hannes Reinecke
  0 siblings, 0 replies; 62+ messages in thread
From: Hannes Reinecke @ 2017-01-13 17:43 UTC (permalink / raw)
  To: Omar Sandoval; +Cc: Jens Axboe, linux-kernel, linux-block, bart.vanassche

On 01/13/2017 05:41 PM, Omar Sandoval wrote:
> On Fri, Jan 13, 2017 at 12:15:17PM +0100, Hannes Reinecke wrote:
>> On 01/11/2017 10:40 PM, Jens Axboe wrote:
>>> This adds a set of hooks that intercepts the blk-mq path of
>>> allocating/inserting/issuing/completing requests, allowing
>>> us to develop a scheduler within that framework.
>>>
>>> We reuse the existing elevator scheduler API on the registration
>>> side, but augment that with the scheduler flagging support for
>>> the blk-mq interface, and with a separate set of ops hooks for MQ
>>> devices.
>>>
>>> We split driver and scheduler tags, so we can run the scheduling
>>> independent of device queue depth.
>>>
>>> Signed-off-by: Jens Axboe <axboe@fb.com>
>> [ .. ]
>>> @@ -823,6 +847,35 @@ static inline unsigned int queued_to_index(unsigned int queued)
>>>  	return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
>>>  }
>>>  
>>> +static bool blk_mq_get_driver_tag(struct request *rq,
>>> +				  struct blk_mq_hw_ctx **hctx, bool wait)
>>> +{
>>> +	struct blk_mq_alloc_data data = {
>>> +		.q = rq->q,
>>> +		.ctx = rq->mq_ctx,
>>> +		.hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu),
>>> +		.flags = wait ? 0 : BLK_MQ_REQ_NOWAIT,
>>> +	};
>>> +
>>> +	if (blk_mq_hctx_stopped(data.hctx))
>>> +		return false;
>>> +
>>> +	if (rq->tag != -1) {
>>> +done:
>>> +		if (hctx)
>>> +			*hctx = data.hctx;
>>> +		return true;
>>> +	}
>>> +
>>> +	rq->tag = blk_mq_get_tag(&data);
>>> +	if (rq->tag >= 0) {
>>> +		data.hctx->tags->rqs[rq->tag] = rq;
>>> +		goto done;
>>> +	}
>>> +
>>> +	return false;
>>> +}
>>> +
>> What happens with the existing request at 'rqs[rq->tag]' ?
>> Surely there is one already, right?
>> Things like '->init_request' assume a fully populated array, so moving
>> one entry to another location is ... interesting.
>>
>> I would have thought we need to do a request cloning here,
>> otherwise this would introduce a memory leak, right?
>> (Not to mention a potential double completion, as the request is now at
>> two positions in the array)
>>
>> Cheers,
>>
>> Hannes
> 
> The entries in tags->rqs aren't slab objects, they're pointers into
> pages allocated separately and tracked on tags->page_list. See
> blk_mq_alloc_rqs(). In blk_mq_free_rqs(), we free all of the pages on
> tags->page_list, so there shouldn't be a memory leak.
> 
> As for hctx->tags->rqs, entries are only overwritten when a scheduler is
> enabled. In that case, the rqs array is storing pointers to requests
> actually from hctx->sched_tags, so overwriting/leaking isn't an issue.

Ah. Thanks.
That explains it.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		      zSeries & Storage
hare@suse.de			      +49 911 74053 688
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: J. Hawn, J. Guild, F. Imendörffer, HRB 16746 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCHSET v6] blk-mq scheduling framework
  2017-01-13 16:02               ` Jens Axboe
@ 2017-01-13 21:45                 ` Jens Axboe
  2017-01-16  8:11                 ` Hannes Reinecke
  1 sibling, 0 replies; 62+ messages in thread
From: Jens Axboe @ 2017-01-13 21:45 UTC (permalink / raw)
  To: Hannes Reinecke, linux-kernel, linux-block; +Cc: osandov, bart.vanassche

On 01/13/2017 09:02 AM, Jens Axboe wrote:
> Also, would be great if you could pull:
> 
> git://git.kernel.dk/linux-block blk-mq-sched
> 
> into current 'master' and see if it still reproduces. I expect that it
> will, but just want to ensure that it's a problem in the current code
> base as well.

Hannes, can you try the current branch? I believe your problem should be
fixed now; it would be great if you could verify.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCHSET v6] blk-mq scheduling framework
  2017-01-11 21:39 [PATCHSET v6] blk-mq scheduling framework Jens Axboe
                   ` (12 preceding siblings ...)
  2017-01-13 10:09 ` Hannes Reinecke
@ 2017-01-15 10:12 ` Paolo Valente
  2017-01-15 15:55   ` Jens Axboe
  13 siblings, 1 reply; 62+ messages in thread
From: Paolo Valente @ 2017-01-15 10:12 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Linux-Kernal, linux-block, Omar Sandoval, bart.vanassche


> Il giorno 11 gen 2017, alle ore 22:39, Jens Axboe <axboe@fb.com> ha scritto:
> 
> Another year, another posting of this patchset. The previous posting
> was here:
> 
> https://www.spinics.net/lists/kernel/msg2406106.html
> 
> (yes, I've skipped v5, it was fixes on top of v4, not the rework).
> 
> I've reworked bits of this to get rid of the shadow requests, thanks
> to Bart for the inspiration. The missing piece, for me, was the fact
> that we have the tags->rqs[] indirection array already. I've done this
> somewhat differently, though, by having the internal scheduler tag
> map be allocated/torn down when an IO scheduler is attached or
> detached. This also means that when we run without a scheduler, we
> don't have to do double tag allocations, it'll work like before.
> 
> The patchset applies on top of 4.10-rc3, or can be pulled here:
> 
> git://git.kernel.dk/linux-block blk-mq-sched.6
> 
> 

Hi Jens,
I have checked this new version to find solutions to the apparent
errors, mistakes or just unclear parts (to me) that I have pointed out
before Christmas last year.  But I have found no changes related to
these problems.

As I have already written, I'm willing to try to fix those errors
myself, if they really are errors, but I would first need at least
some minimal initial feedback and guidance.  If needed, tell me how I
can help you get in sync again with these issues (sending my reports
again, sending a digest of them, ...).

Thanks,
Paolo

> block/Kconfig.iosched    |   50 ++++
> block/Makefile           |    3 
> block/blk-core.c         |   19 -
> block/blk-exec.c         |    3 
> block/blk-flush.c        |   15 -
> block/blk-ioc.c          |   12 
> block/blk-merge.c        |    4 
> block/blk-mq-sched.c     |  354 +++++++++++++++++++++++++++++
> block/blk-mq-sched.h     |  157 ++++++++++++
> block/blk-mq-sysfs.c     |   13 +
> block/blk-mq-tag.c       |   58 ++--
> block/blk-mq-tag.h       |    4 
> block/blk-mq.c           |  413 +++++++++++++++++++---------------
> block/blk-mq.h           |   40 +++
> block/blk-tag.c          |    1 
> block/blk.h              |   26 +-
> block/cfq-iosched.c      |    2 
> block/deadline-iosched.c |    2 
> block/elevator.c         |  247 +++++++++++++++-----
> block/mq-deadline.c      |  569 +++++++++++++++++++++++++++++++++++++++++++++++
> block/noop-iosched.c     |    2 
> drivers/nvme/host/pci.c  |    1 
> include/linux/blk-mq.h   |    9 
> include/linux/blkdev.h   |    6 
> include/linux/elevator.h |   36 ++
> 25 files changed, 1732 insertions(+), 314 deletions(-)
> 
> -- 
> Jens Axboe
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-block" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCHSET v6] blk-mq scheduling framework
  2017-01-15 10:12 ` Paolo Valente
@ 2017-01-15 15:55   ` Jens Axboe
  0 siblings, 0 replies; 62+ messages in thread
From: Jens Axboe @ 2017-01-15 15:55 UTC (permalink / raw)
  To: Paolo Valente; +Cc: Linux-Kernal, linux-block, Omar Sandoval, bart.vanassche

On 01/15/2017 03:12 AM, Paolo Valente wrote:
> 
>> Il giorno 11 gen 2017, alle ore 22:39, Jens Axboe <axboe@fb.com> ha scritto:
>>
>> Another year, another posting of this patchset. The previous posting
>> was here:
>>
>> https://www.spinics.net/lists/kernel/msg2406106.html
>>
>> (yes, I've skipped v5, it was fixes on top of v4, not the rework).
>>
>> I've reworked bits of this to get rid of the shadow requests, thanks
>> to Bart for the inspiration. The missing piece, for me, was the fact
>> that we have the tags->rqs[] indirection array already. I've done this
>> somewhat differently, though, by having the internal scheduler tag
>> map be allocated/torn down when an IO scheduler is attached or
>> detached. This also means that when we run without a scheduler, we
>> don't have to do double tag allocations, it'll work like before.
>>
>> The patchset applies on top of 4.10-rc3, or can be pulled here:
>>
>> git://git.kernel.dk/linux-block blk-mq-sched.6
>>
>>
> 
> Hi Jens,
> I have checked this new version to find solutions to the apparent
> errors, mistakes or just unclear parts (to me) that I have pointed out
> before Christmas last year.  But I have found no changes related to
> these problems.
> 
> As I have already written, I'm willing to try to fix those errors
> myself, if they really are errors, but I would first need at least
> some minimal initial feedback and guidance.  If needed, tell me how I
> can help you get in sync again with these issues (sending my reports
> again, sending a digest of them, ...).

Sorry Paolo, but focus has been on getting the framework in both
a mergeable and stable state, which it is now. I'll tend to BFQ-specific
issues next week, so we can get those resolved as well.

Do you have a place where you have posted your in-progress
conversion?

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCHSET v6] blk-mq scheduling framework
  2017-01-13 16:02               ` Jens Axboe
  2017-01-13 21:45                 ` Jens Axboe
@ 2017-01-16  8:11                 ` Hannes Reinecke
  2017-01-16 15:12                   ` Jens Axboe
  1 sibling, 1 reply; 62+ messages in thread
From: Hannes Reinecke @ 2017-01-16  8:11 UTC (permalink / raw)
  To: Jens Axboe, linux-kernel, linux-block; +Cc: osandov, bart.vanassche

On 01/13/2017 05:02 PM, Jens Axboe wrote:
> On 01/13/2017 09:00 AM, Jens Axboe wrote:
>> On 01/13/2017 08:59 AM, Hannes Reinecke wrote:
>>> On 01/13/2017 04:34 PM, Jens Axboe wrote:
>>>> On 01/13/2017 08:33 AM, Hannes Reinecke wrote:
>>> [ .. ]
>>>>> Ah, indeed.
>>>>> There is an ominous udev rule here, trying to switch to 'deadline'.
>>>>>
>>>>> # cat 60-ssd-scheduler.rules
>>>>> # do not edit this file, it will be overwritten on update
>>>>>
>>>>> ACTION!="add", GOTO="ssd_scheduler_end"
>>>>> SUBSYSTEM!="block", GOTO="ssd_scheduler_end"
>>>>>
>>>>> IMPORT{cmdline}="elevator"
>>>>> ENV{elevator}=="*?", GOTO="ssd_scheduler_end"
>>>>>
>>>>> KERNEL=="sd*[!0-9]", ATTR{queue/rotational}=="0",
>>>>> ATTR{queue/scheduler}="deadline"
>>>>>
>>>>> LABEL="ssd_scheduler_end"
>>>>>
>>>>> Still shouldn't crash the kernel, though ...
>>>>
>>>> Of course not, and it's not a given that it does, it could just be
>>>> triggering after the device load and failing like expected. But just in
>>>> case, can you try and disable that rule and see if it still crashes with
>>>> MQ_DEADLINE set as the default?
>>>>
>>> Yes, it does.
>>> Same stacktrace as before.
>>
>> Alright, that's as expected. I've tried with your rule and making
>> everything modular, but it still boots fine for me. Very odd. Can you
>> send me your .config? And are all the SCSI disks hanging off ahci? Or
>> sdb specifically, is that ahci or something else?
> 
> Also, would be great if you could pull:
> 
> git://git.kernel.dk/linux-block blk-mq-sched
> 
> into current 'master' and see if it still reproduces. I expect that it
> will, but just want to ensure that it's a problem in the current code
> base as well.
> 
Actually, it doesn't. Seems to have resolved itself with the latest drop.

However, now I've got a lockdep splat:

Jan 16 09:05:02 lammermuir kernel: ------------[ cut here ]------------
Jan 16 09:05:02 lammermuir kernel: WARNING: CPU: 29 PID: 5860 at
kernel/locking/lockdep.c:3514 lock_release+0x2a7/0x490
Jan 16 09:05:02 lammermuir kernel: DEBUG_LOCKS_WARN_ON(depth <= 0)
Jan 16 09:05:02 lammermuir kernel: Modules linked in: raid0 mpt3sas
raid_class rpcsec_gss_krb5 auth_rpcgss nfsv4 nfs lockd grace fscache e
Jan 16 09:05:02 lammermuir kernel:  fb_sys_fops ahci uhci_hcd ttm
ehci_pci libahci ehci_hcd serio_raw crc32c_intel drm libata usbcore hpsa
Jan 16 09:05:02 lammermuir kernel: CPU: 29 PID: 5860 Comm: fio Not
tainted 4.10.0-rc3+ #540
Jan 16 09:05:02 lammermuir kernel: Hardware name: HP ProLiant ML350p
Gen8, BIOS P72 09/08/2013
Jan 16 09:05:02 lammermuir kernel: Call Trace:
Jan 16 09:05:02 lammermuir kernel:  dump_stack+0x85/0xc9
Jan 16 09:05:02 lammermuir kernel:  __warn+0xd1/0xf0
Jan 16 09:05:02 lammermuir kernel:  ? aio_write+0x118/0x170
Jan 16 09:05:02 lammermuir kernel:  warn_slowpath_fmt+0x4f/0x60
Jan 16 09:05:02 lammermuir kernel:  lock_release+0x2a7/0x490
Jan 16 09:05:02 lammermuir kernel:  ? blkdev_write_iter+0x89/0xd0
Jan 16 09:05:02 lammermuir kernel:  aio_write+0x138/0x170
Jan 16 09:05:02 lammermuir kernel:  do_io_submit+0x4d2/0x8f0
Jan 16 09:05:02 lammermuir kernel:  ? do_io_submit+0x413/0x8f0
Jan 16 09:05:02 lammermuir kernel:  SyS_io_submit+0x10/0x20
Jan 16 09:05:02 lammermuir kernel:  entry_SYSCALL_64_fastpath+0x23/0xc6

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		   Teamlead Storage & Networking
hare@suse.de			               +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 62+ messages in thread
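
For reference, the effect of a rule like the one quoted above is easy to inspect from
sysfs once the device is registered: with blk-mq and this patchset the available
schedulers are listed in queue/scheduler, with the active one shown in brackets, and
the attribute can be rewritten at runtime. A minimal check/override sequence (the
device name and the exact output below are illustrative only, not taken from the
machine above):

  # cat /sys/block/sdb/queue/scheduler
  [mq-deadline] none
  # echo none > /sys/block/sdb/queue/scheduler
  # cat /sys/block/sdb/queue/scheduler
  mq-deadline [none]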

* Re: [PATCHSET v6] blk-mq scheduling framework
  2017-01-16  8:11                 ` Hannes Reinecke
@ 2017-01-16 15:12                   ` Jens Axboe
  2017-01-16 15:16                     ` Jens Axboe
  0 siblings, 1 reply; 62+ messages in thread
From: Jens Axboe @ 2017-01-16 15:12 UTC (permalink / raw)
  To: Hannes Reinecke, linux-kernel, linux-block; +Cc: osandov, bart.vanassche

On 01/16/2017 01:11 AM, Hannes Reinecke wrote:
> On 01/13/2017 05:02 PM, Jens Axboe wrote:
>> On 01/13/2017 09:00 AM, Jens Axboe wrote:
>>> On 01/13/2017 08:59 AM, Hannes Reinecke wrote:
>>>> On 01/13/2017 04:34 PM, Jens Axboe wrote:
>>>>> On 01/13/2017 08:33 AM, Hannes Reinecke wrote:
>>>> [ .. ]
>>>>>> Ah, indeed.
>>>>>> There is an ominous udev rule here, trying to switch to 'deadline'.
>>>>>>
>>>>>> # cat 60-ssd-scheduler.rules
>>>>>> # do not edit this file, it will be overwritten on update
>>>>>>
>>>>>> ACTION!="add", GOTO="ssd_scheduler_end"
>>>>>> SUBSYSTEM!="block", GOTO="ssd_scheduler_end"
>>>>>>
>>>>>> IMPORT{cmdline}="elevator"
>>>>>> ENV{elevator}=="*?", GOTO="ssd_scheduler_end"
>>>>>>
>>>>>> KERNEL=="sd*[!0-9]", ATTR{queue/rotational}=="0",
>>>>>> ATTR{queue/scheduler}="deadline"
>>>>>>
>>>>>> LABEL="ssd_scheduler_end"
>>>>>>
>>>>>> Still shouldn't crash the kernel, though ...
>>>>>
>>>>> Of course not, and it's not a given that it does, it could just be
>>>>> triggering after the device load and failing like expected. But just in
>>>>> case, can you try and disable that rule and see if it still crashes with
>>>>> MQ_DEADLINE set as the default?
>>>>>
>>>> Yes, it does.
>>>> Same stacktrace as before.
>>>
>>> Alright, that's as expected. I've tried with your rule and making
>>> everything modular, but it still boots fine for me. Very odd. Can you
>>> send me your .config? And are all the SCSI disks hanging off ahci? Or
>>> sdb specifically, is that ahci or something else?
>>
>> Also, would be great if you could pull:
>>
>> git://git.kernel.dk/linux-block blk-mq-sched
>>
>> into current 'master' and see if it still reproduces. I expect that it
>> will, but just want to ensure that it's a problem in the current code
>> base as well.
>>
> Actually, it doesn't. Seems to have resolved itself with the latest drop.
> 
> However, not I've got a lockdep splat:
> 
> Jan 16 09:05:02 lammermuir kernel: ------------[ cut here ]------------
> Jan 16 09:05:02 lammermuir kernel: WARNING: CPU: 29 PID: 5860 at
> kernel/locking/lockdep.c:3514 lock_release+0x2a7/0x490
> Jan 16 09:05:02 lammermuir kernel: DEBUG_LOCKS_WARN_ON(depth <= 0)
> Jan 16 09:05:02 lammermuir kernel: Modules linked in: raid0 mpt3sas
> raid_class rpcsec_gss_krb5 auth_rpcgss nfsv4 nfs lockd grace fscache e
> Jan 16 09:05:02 lammermuir kernel:  fb_sys_fops ahci uhci_hcd ttm
> ehci_pci libahci ehci_hcd serio_raw crc32c_intel drm libata usbcore hpsa
> Jan 16 09:05:02 lammermuir kernel: CPU: 29 PID: 5860 Comm: fio Not
> tainted 4.10.0-rc3+ #540
> Jan 16 09:05:02 lammermuir kernel: Hardware name: HP ProLiant ML350p
> Gen8, BIOS P72 09/08/2013
> Jan 16 09:05:02 lammermuir kernel: Call Trace:
> Jan 16 09:05:02 lammermuir kernel:  dump_stack+0x85/0xc9
> Jan 16 09:05:02 lammermuir kernel:  __warn+0xd1/0xf0
> Jan 16 09:05:02 lammermuir kernel:  ? aio_write+0x118/0x170
> Jan 16 09:05:02 lammermuir kernel:  warn_slowpath_fmt+0x4f/0x60
> Jan 16 09:05:02 lammermuir kernel:  lock_release+0x2a7/0x490
> Jan 16 09:05:02 lammermuir kernel:  ? blkdev_write_iter+0x89/0xd0
> Jan 16 09:05:02 lammermuir kernel:  aio_write+0x138/0x170
> Jan 16 09:05:02 lammermuir kernel:  do_io_submit+0x4d2/0x8f0
> Jan 16 09:05:02 lammermuir kernel:  ? do_io_submit+0x413/0x8f0
> Jan 16 09:05:02 lammermuir kernel:  SyS_io_submit+0x10/0x20
> Jan 16 09:05:02 lammermuir kernel:  entry_SYSCALL_64_fastpath+0x23/0xc6

Odd, not sure that's me. What did you pull my branch into? And what is the
sha of the stuff you pulled in?

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 62+ messages in thread

* Re: [PATCHSET v6] blk-mq scheduling framework
  2017-01-16 15:12                   ` Jens Axboe
@ 2017-01-16 15:16                     ` Jens Axboe
  2017-01-16 15:47                       ` Jens Axboe
  0 siblings, 1 reply; 62+ messages in thread
From: Jens Axboe @ 2017-01-16 15:16 UTC (permalink / raw)
  To: Hannes Reinecke, linux-kernel, linux-block; +Cc: osandov, bart.vanassche

On 01/16/2017 08:12 AM, Jens Axboe wrote:
> On 01/16/2017 01:11 AM, Hannes Reinecke wrote:
>> On 01/13/2017 05:02 PM, Jens Axboe wrote:
>>> On 01/13/2017 09:00 AM, Jens Axboe wrote:
>>>> On 01/13/2017 08:59 AM, Hannes Reinecke wrote:
>>>>> On 01/13/2017 04:34 PM, Jens Axboe wrote:
>>>>>> On 01/13/2017 08:33 AM, Hannes Reinecke wrote:
>>>>> [ .. ]
>>>>>>> Ah, indeed.
>>>>>>> There is an ominous udev rule here, trying to switch to 'deadline'.
>>>>>>>
>>>>>>> # cat 60-ssd-scheduler.rules
>>>>>>> # do not edit this file, it will be overwritten on update
>>>>>>>
>>>>>>> ACTION!="add", GOTO="ssd_scheduler_end"
>>>>>>> SUBSYSTEM!="block", GOTO="ssd_scheduler_end"
>>>>>>>
>>>>>>> IMPORT{cmdline}="elevator"
>>>>>>> ENV{elevator}=="*?", GOTO="ssd_scheduler_end"
>>>>>>>
>>>>>>> KERNEL=="sd*[!0-9]", ATTR{queue/rotational}=="0",
>>>>>>> ATTR{queue/scheduler}="deadline"
>>>>>>>
>>>>>>> LABEL="ssd_scheduler_end"
>>>>>>>
>>>>>>> Still shouldn't crash the kernel, though ...
>>>>>>
>>>>>> Of course not, and it's not a given that it does, it could just be
>>>>>> triggering after the device load and failing like expected. But just in
>>>>>> case, can you try and disable that rule and see if it still crashes with
>>>>>> MQ_DEADLINE set as the default?
>>>>>>
>>>>> Yes, it does.
>>>>> Same stacktrace as before.
>>>>
>>>> Alright, that's as expected. I've tried with your rule and making
>>>> everything modular, but it still boots fine for me. Very odd. Can you
>>>> send me your .config? And are all the SCSI disks hanging off ahci? Or
>>>> sdb specifically, is that ahci or something else?
>>>
>>> Also, would be great if you could pull:
>>>
>>> git://git.kernel.dk/linux-block blk-mq-sched
>>>
>>> into current 'master' and see if it still reproduces. I expect that it
>>> will, but just want to ensure that it's a problem in the current code
>>> base as well.
>>>
>> Actually, it doesn't. Seems to have resolved itself with the latest drop.
>>
>> However, now I've got a lockdep splat:
>>
>> Jan 16 09:05:02 lammermuir kernel: ------------[ cut here ]------------
>> Jan 16 09:05:02 lammermuir kernel: WARNING: CPU: 29 PID: 5860 at
>> kernel/locking/lockdep.c:3514 lock_release+0x2a7/0x490
>> Jan 16 09:05:02 lammermuir kernel: DEBUG_LOCKS_WARN_ON(depth <= 0)
>> Jan 16 09:05:02 lammermuir kernel: Modules linked in: raid0 mpt3sas
>> raid_class rpcsec_gss_krb5 auth_rpcgss nfsv4 nfs lockd grace fscache e
>> Jan 16 09:05:02 lammermuir kernel:  fb_sys_fops ahci uhci_hcd ttm
>> ehci_pci libahci ehci_hcd serio_raw crc32c_intel drm libata usbcore hpsa
>> Jan 16 09:05:02 lammermuir kernel: CPU: 29 PID: 5860 Comm: fio Not
>> tainted 4.10.0-rc3+ #540
>> Jan 16 09:05:02 lammermuir kernel: Hardware name: HP ProLiant ML350p
>> Gen8, BIOS P72 09/08/2013
>> Jan 16 09:05:02 lammermuir kernel: Call Trace:
>> Jan 16 09:05:02 lammermuir kernel:  dump_stack+0x85/0xc9
>> Jan 16 09:05:02 lammermuir kernel:  __warn+0xd1/0xf0
>> Jan 16 09:05:02 lammermuir kernel:  ? aio_write+0x118/0x170
>> Jan 16 09:05:02 lammermuir kernel:  warn_slowpath_fmt+0x4f/0x60
>> Jan 16 09:05:02 lammermuir kernel:  lock_release+0x2a7/0x490
>> Jan 16 09:05:02 lammermuir kernel:  ? blkdev_write_iter+0x89/0xd0
>> Jan 16 09:05:02 lammermuir kernel:  aio_write+0x138/0x170
>> Jan 16 09:05:02 lammermuir kernel:  do_io_submit+0x4d2/0x8f0
>> Jan 16 09:05:02 lammermuir kernel:  ? do_io_submit+0x413/0x8f0
>> Jan 16 09:05:02 lammermuir kernel:  SyS_io_submit+0x10/0x20
>> Jan 16 09:05:02 lammermuir kernel:  entry_SYSCALL_64_fastpath+0x23/0xc6
> 
> Odd, not sure that's me. What did you pull my branch into? And what is the
> sha of the stuff you pulled in?

Forgot to ask, please send me the fio job you ran here.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 62+ messages in thread
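
The fio job itself never made it into the thread. For reproducing this kind of splat,
any libaio-based job against the raw device goes through the io_submit() -> aio_write()
-> blkdev_write_iter() path visible in the trace above; a purely illustrative command
line (all parameters are guesses, not Hannes' actual job) would look something like:

  # fio --name=aio-write --filename=/dev/sdb --ioengine=libaio --direct=1 \
        --rw=randwrite --bs=4k --iodepth=32 --numjobs=4 \
        --runtime=60 --time_based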

* Re: [PATCHSET v6] blk-mq scheduling framework
  2017-01-16 15:16                     ` Jens Axboe
@ 2017-01-16 15:47                       ` Jens Axboe
  0 siblings, 0 replies; 62+ messages in thread
From: Jens Axboe @ 2017-01-16 15:47 UTC (permalink / raw)
  To: Hannes Reinecke, linux-kernel, linux-block; +Cc: osandov, bart.vanassche

On 01/16/2017 08:16 AM, Jens Axboe wrote:
> On 01/16/2017 08:12 AM, Jens Axboe wrote:
>> On 01/16/2017 01:11 AM, Hannes Reinecke wrote:
>>> On 01/13/2017 05:02 PM, Jens Axboe wrote:
>>>> On 01/13/2017 09:00 AM, Jens Axboe wrote:
>>>>> On 01/13/2017 08:59 AM, Hannes Reinecke wrote:
>>>>>> On 01/13/2017 04:34 PM, Jens Axboe wrote:
>>>>>>> On 01/13/2017 08:33 AM, Hannes Reinecke wrote:
>>>>>> [ .. ]
>>>>>>>> Ah, indeed.
>>>>>>>> There is an ominous udev rule here, trying to switch to 'deadline'.
>>>>>>>>
>>>>>>>> # cat 60-ssd-scheduler.rules
>>>>>>>> # do not edit this file, it will be overwritten on update
>>>>>>>>
>>>>>>>> ACTION!="add", GOTO="ssd_scheduler_end"
>>>>>>>> SUBSYSTEM!="block", GOTO="ssd_scheduler_end"
>>>>>>>>
>>>>>>>> IMPORT{cmdline}="elevator"
>>>>>>>> ENV{elevator}=="*?", GOTO="ssd_scheduler_end"
>>>>>>>>
>>>>>>>> KERNEL=="sd*[!0-9]", ATTR{queue/rotational}=="0",
>>>>>>>> ATTR{queue/scheduler}="deadline"
>>>>>>>>
>>>>>>>> LABEL="ssd_scheduler_end"
>>>>>>>>
>>>>>>>> Still shouldn't crash the kernel, though ...
>>>>>>>
>>>>>>> Of course not, and it's not a given that it does, it could just be
>>>>>>> triggering after the device load and failing like expected. But just in
>>>>>>> case, can you try and disable that rule and see if it still crashes with
>>>>>>> MQ_DEADLINE set as the default?
>>>>>>>
>>>>>> Yes, it does.
>>>>>> Same stacktrace as before.
>>>>>
>>>>> Alright, that's as expected. I've tried with your rule and making
>>>>> everything modular, but it still boots fine for me. Very odd. Can you
>>>>> send me your .config? And are all the SCSI disks hanging off ahci? Or
>>>>> sdb specifically, is that ahci or something else?
>>>>
>>>> Also, would be great if you could pull:
>>>>
>>>> git://git.kernel.dk/linux-block blk-mq-sched
>>>>
>>>> into current 'master' and see if it still reproduces. I expect that it
>>>> will, but just want to ensure that it's a problem in the current code
>>>> base as well.
>>>>
>>> Actually, it doesn't. Seems to have resolved itself with the latest drop.
>>>
>>> However, now I've got a lockdep splat:
>>>
>>> Jan 16 09:05:02 lammermuir kernel: ------------[ cut here ]------------
>>> Jan 16 09:05:02 lammermuir kernel: WARNING: CPU: 29 PID: 5860 at
>>> kernel/locking/lockdep.c:3514 lock_release+0x2a7/0x490
>>> Jan 16 09:05:02 lammermuir kernel: DEBUG_LOCKS_WARN_ON(depth <= 0)
>>> Jan 16 09:05:02 lammermuir kernel: Modules linked in: raid0 mpt3sas
>>> raid_class rpcsec_gss_krb5 auth_rpcgss nfsv4 nfs lockd grace fscache e
>>> Jan 16 09:05:02 lammermuir kernel:  fb_sys_fops ahci uhci_hcd ttm
>>> ehci_pci libahci ehci_hcd serio_raw crc32c_intel drm libata usbcore hpsa
>>> Jan 16 09:05:02 lammermuir kernel: CPU: 29 PID: 5860 Comm: fio Not
>>> tainted 4.10.0-rc3+ #540
>>> Jan 16 09:05:02 lammermuir kernel: Hardware name: HP ProLiant ML350p
>>> Gen8, BIOS P72 09/08/2013
>>> Jan 16 09:05:02 lammermuir kernel: Call Trace:
>>> Jan 16 09:05:02 lammermuir kernel:  dump_stack+0x85/0xc9
>>> Jan 16 09:05:02 lammermuir kernel:  __warn+0xd1/0xf0
>>> Jan 16 09:05:02 lammermuir kernel:  ? aio_write+0x118/0x170
>>> Jan 16 09:05:02 lammermuir kernel:  warn_slowpath_fmt+0x4f/0x60
>>> Jan 16 09:05:02 lammermuir kernel:  lock_release+0x2a7/0x490
>>> Jan 16 09:05:02 lammermuir kernel:  ? blkdev_write_iter+0x89/0xd0
>>> Jan 16 09:05:02 lammermuir kernel:  aio_write+0x138/0x170
>>> Jan 16 09:05:02 lammermuir kernel:  do_io_submit+0x4d2/0x8f0
>>> Jan 16 09:05:02 lammermuir kernel:  ? do_io_submit+0x413/0x8f0
>>> Jan 16 09:05:02 lammermuir kernel:  SyS_io_submit+0x10/0x20
>>> Jan 16 09:05:02 lammermuir kernel:  entry_SYSCALL_64_fastpath+0x23/0xc6
>>
>> Odd, not sure that's me. What did you pull my branch into? And what is the
>> sha of the stuff you pulled in?
> 
> Forgot to ask, please send me the fio job you ran here.

Nevermind, it's a mainline bug that's fixed in -rc4:

commit a12f1ae61c489076a9aeb90bddca7722bf330df3
Author: Shaohua Li <shli@fb.com>
Date:   Tue Dec 13 12:09:56 2016 -0800

    aio: fix lock dep warning

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 62+ messages in thread
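
A quick way to confirm that a tree under test already contains that fix (using the
commit id quoted above, abbreviated here) is to ask git whether it is an ancestor of
the current HEAD:

  $ git merge-base --is-ancestor a12f1ae61c48 HEAD && echo "aio lockdep fix present"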

end of thread, other threads:[~2017-01-16 15:47 UTC | newest]

Thread overview: 62+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-01-11 21:39 [PATCHSET v6] blk-mq scheduling framework Jens Axboe
2017-01-11 21:39 ` [PATCH 01/10] block: move existing elevator ops to union Jens Axboe
2017-01-12 10:15   ` Johannes Thumshirn
2017-01-12 21:17   ` Bart Van Assche
2017-01-13  8:34   ` Christoph Hellwig
2017-01-13 15:00     ` Jens Axboe
2017-01-11 21:39 ` [PATCH 02/10] blk-mq: make mq_ops a const pointer Jens Axboe
2017-01-12 10:14   ` Johannes Thumshirn
2017-01-13  8:16   ` Christoph Hellwig
2017-01-11 21:39 ` [PATCH 03/10] block: move rq_ioc() to blk.h Jens Axboe
2017-01-12 10:14   ` Johannes Thumshirn
2017-01-12 21:18   ` Bart Van Assche
2017-01-13  8:33   ` Christoph Hellwig
2017-01-11 21:39 ` [PATCH 04/10] blk-mq: un-export blk_mq_free_hctx_request() Jens Axboe
2017-01-12 10:13   ` Johannes Thumshirn
2017-01-12 21:18   ` Bart Van Assche
2017-01-13  8:16   ` Christoph Hellwig
2017-01-11 21:39 ` [PATCH 05/10] blk-mq: export some helpers we need to the scheduling framework Jens Axboe
2017-01-12 10:17   ` Johannes Thumshirn
2017-01-12 21:20   ` Bart Van Assche
2017-01-13  8:17   ` Christoph Hellwig
2017-01-13 15:01     ` Jens Axboe
2017-01-11 21:39 ` [PATCH 06/10] blk-mq-tag: cleanup the normal/reserved tag allocation Jens Axboe
2017-01-12 21:22   ` Bart Van Assche
2017-01-12 22:07     ` Jens Axboe
2017-01-13  8:30   ` Christoph Hellwig
2017-01-13 15:06     ` Jens Axboe
2017-01-11 21:40 ` [PATCH 07/10] blk-mq: abstract out helpers for allocating/freeing tag maps Jens Axboe
2017-01-12 21:29   ` Bart Van Assche
2017-01-12 21:54     ` Jens Axboe
2017-01-13  8:25       ` Johannes Thumshirn
2017-01-11 21:40 ` [PATCH 08/10] blk-mq-sched: add framework for MQ capable IO schedulers Jens Axboe
2017-01-12 21:45   ` Bart Van Assche
2017-01-12 21:59     ` Jens Axboe
2017-01-13 11:15   ` Hannes Reinecke
2017-01-13 16:39     ` Bart Van Assche
2017-01-13 16:41     ` Omar Sandoval
2017-01-13 17:43       ` Hannes Reinecke
2017-01-11 21:40 ` [PATCH 09/10] mq-deadline: add blk-mq adaptation of the deadline IO scheduler Jens Axboe
2017-01-12 21:53   ` Bart Van Assche
2017-01-11 21:40 ` [PATCH 10/10] blk-mq-sched: allow setting of default " Jens Axboe
2017-01-12 21:54   ` Bart Van Assche
2017-01-12 21:16 ` [PATCHSET v6] blk-mq scheduling framework Bart Van Assche
2017-01-13  8:15 ` Hannes Reinecke
2017-01-13 11:04   ` Hannes Reinecke
2017-01-13 12:10     ` Hannes Reinecke
2017-01-13 15:05       ` Jens Axboe
2017-01-13 15:03     ` Jens Axboe
2017-01-13 15:23     ` Jens Axboe
2017-01-13 15:33       ` Hannes Reinecke
2017-01-13 15:34         ` Jens Axboe
2017-01-13 15:59           ` Hannes Reinecke
2017-01-13 16:00             ` Jens Axboe
2017-01-13 16:02               ` Jens Axboe
2017-01-13 21:45                 ` Jens Axboe
2017-01-16  8:11                 ` Hannes Reinecke
2017-01-16 15:12                   ` Jens Axboe
2017-01-16 15:16                     ` Jens Axboe
2017-01-16 15:47                       ` Jens Axboe
2017-01-13 10:09 ` Hannes Reinecke
2017-01-15 10:12 ` Paolo Valente
2017-01-15 15:55   ` Jens Axboe
