* [RFC] blk-mq: support for shared tags
@ 2014-03-31 14:46 Christoph Hellwig
  2014-03-31 14:46 ` [PATCH 1/4] blk-mq: stop pre-initializing req->special Christoph Hellwig
                   ` (4 more replies)
  0 siblings, 5 replies; 20+ messages in thread
From: Christoph Hellwig @ 2014-03-31 14:46 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Matias Bjorling, linux-kernel, linux-scsi

This series adds support for sharing tags (and thus requests) between
multiple request_queues.  We'll need this for SCSI, and I think Martin
also wants something similar for nvme.

Besides the mess with request constructors/destructors, the major RFC question
here is what the blk_mq_alloc_shared_tags API should look like.  For now I've
been lazy and reused struct blk_mq_reg, but that feels a bit cumbersome.
Either a separate blk_mq_tags_reg or just passing the few arguments directly
would work fine for me.
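
For reference, with the prototypes in patch 4 usage would look roughly like
this (my_init_rq, my_exit_rq and drv are placeholder driver bits, not part
of the series):

	struct blk_mq_shared_tags *shared;

	shared = blk_mq_alloc_shared_tags(&reg, my_init_rq, drv);
	if (!shared)
		return -ENOMEM;
	reg.shared_tags = shared;

	/* both queues now allocate requests from the same per-hctx tag maps */
	q1 = blk_mq_init_queue(&reg, drv);
	q2 = blk_mq_init_queue(&reg, drv);

	...

	blk_mq_free_shared_tags(shared, my_exit_rq, drv);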



* [PATCH 1/4] blk-mq: stop pre-initializing req->special
  2014-03-31 14:46 [RFC] blk-mq: support for shared tags Christoph Hellwig
@ 2014-03-31 14:46 ` Christoph Hellwig
  2014-03-31 14:46 ` [PATCH 2/4] blk-mq: initialize request on allocation Christoph Hellwig
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 20+ messages in thread
From: Christoph Hellwig @ 2014-03-31 14:46 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Matias Bjorling, linux-kernel, linux-scsi

We can get at the private data easily using pointer arithmetic.  Do so
instead of initializing req->special so that we don't rely on the
request state in various initialization functions, and shave off another
few instructions in the fast path.
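
For reference, the accessor this relies on is the existing blk_mq_rq_to_pdu()
helper, which simply points past the request structure:

	static inline void *blk_mq_rq_to_pdu(struct request *rq)
	{
		return (void *) rq + sizeof(*rq);
	}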

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-flush.c          |   10 ++--------
 block/blk-mq.c             |   15 ++-------------
 block/blk-mq.h             |    1 -
 drivers/block/null_blk.c   |    4 ++--
 drivers/block/virtio_blk.c |    6 +++---
 5 files changed, 9 insertions(+), 27 deletions(-)

diff --git a/block/blk-flush.c b/block/blk-flush.c
index 43e6b47..9a0c427 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -306,22 +306,16 @@ static bool blk_kick_flush(struct request_queue *q)
 	 */
 	q->flush_pending_idx ^= 1;
 
+	blk_rq_init(q, q->flush_rq);
 	if (q->mq_ops) {
-		struct blk_mq_ctx *ctx = first_rq->mq_ctx;
-		struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu);
-
-		blk_mq_rq_init(hctx, q->flush_rq);
-		q->flush_rq->mq_ctx = ctx;
-
 		/*
 	 * Reuse the tag value from the first waiting request,
 		 * with blk-mq the tag is generated during request
 		 * allocation and drivers can rely on it being inside
 		 * the range they asked for.
 		 */
+		q->flush_rq->mq_ctx = first_rq->mq_ctx;
 		q->flush_rq->tag = first_rq->tag;
-	} else {
-		blk_rq_init(q, q->flush_rq);
 	}
 
 	q->flush_rq->cmd_type = REQ_TYPE_FS;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4274ee0..871acd6 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -248,24 +248,13 @@ struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw,
 }
 EXPORT_SYMBOL(blk_mq_alloc_reserved_request);
 
-/*
- * Re-init and set pdu, if we have it
- */
-void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq)
-{
-	blk_rq_init(hctx->queue, rq);
-
-	if (hctx->cmd_size)
-		rq->special = blk_mq_rq_to_pdu(rq);
-}
-
 static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
 				  struct blk_mq_ctx *ctx, struct request *rq)
 {
 	const int tag = rq->tag;
 	struct request_queue *q = rq->q;
 
-	blk_mq_rq_init(hctx, rq);
+	blk_rq_init(hctx->queue, rq);
 	blk_mq_put_tag(hctx->tags, tag);
 
 	blk_mq_queue_exit(q);
@@ -1139,7 +1128,7 @@ static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx,
 		left -= to_do * rq_size;
 		for (j = 0; j < to_do; j++) {
 			hctx->rqs[i] = p;
-			blk_mq_rq_init(hctx, hctx->rqs[i]);
+			blk_rq_init(hctx->queue, hctx->rqs[i]);
 			p += rq_size;
 			i++;
 		}
diff --git a/block/blk-mq.h b/block/blk-mq.h
index ebbe6ba..238379a 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -27,7 +27,6 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
 void blk_mq_init_flush(struct request_queue *q);
 void blk_mq_drain_queue(struct request_queue *q);
 void blk_mq_free_queue(struct request_queue *q);
-void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq);
 
 /*
  * CPU hotplug helpers
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 091b9ea..71df69d 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -226,7 +226,7 @@ static void null_cmd_end_timer(struct nullb_cmd *cmd)
 
 static void null_softirq_done_fn(struct request *rq)
 {
-	end_cmd(rq->special);
+	end_cmd(blk_mq_rq_to_pdu(rq));
 }
 
 static inline void null_handle_cmd(struct nullb_cmd *cmd)
@@ -311,7 +311,7 @@ static void null_request_fn(struct request_queue *q)
 
 static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
 {
-	struct nullb_cmd *cmd = rq->special;
+	struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq);
 
 	cmd->rq = rq;
 	cmd->nq = hctx->driver_data;
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 0eace43..11e8f4b 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -112,7 +112,7 @@ static int __virtblk_add_req(struct virtqueue *vq,
 
 static inline void virtblk_request_done(struct request *req)
 {
-	struct virtblk_req *vbr = req->special;
+	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
 	int error = virtblk_result(vbr);
 
 	if (req->cmd_type == REQ_TYPE_BLOCK_PC) {
@@ -154,7 +154,7 @@ static void virtblk_done(struct virtqueue *vq)
 static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req)
 {
 	struct virtio_blk *vblk = hctx->queue->queuedata;
-	struct virtblk_req *vbr = req->special;
+	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
 	unsigned long flags;
 	unsigned int num;
 	const bool last = (req->cmd_flags & REQ_END) != 0;
@@ -494,7 +494,7 @@ static int virtblk_init_vbr(void *data, struct blk_mq_hw_ctx *hctx,
 			     struct request *rq, unsigned int nr)
 {
 	struct virtio_blk *vblk = data;
-	struct virtblk_req *vbr = rq->special;
+	struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq);
 
 	sg_init_table(vbr->sg, vblk->sg_elems);
 	return 0;
-- 
1.7.10.4



* [PATCH 2/4] blk-mq: initialize request on allocation
  2014-03-31 14:46 [RFC] blk-mq: support for shared tags Christoph Hellwig
  2014-03-31 14:46 ` [PATCH 1/4] blk-mq: stop pre-initializing req->special Christoph Hellwig
@ 2014-03-31 14:46 ` Christoph Hellwig
  2014-03-31 14:46 ` [PATCH 3/4] blk-mq: move request structures into struct blk_mq_tags Christoph Hellwig
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 20+ messages in thread
From: Christoph Hellwig @ 2014-03-31 14:46 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Matias Bjorling, linux-kernel, linux-scsi

If we want to share tag and request allocation between queues, we cannot
initialize the request at init/free time, but need to initialize it
at allocation time, as it might get used by different queues over its
lifetime.
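
Concretely, the allocation path after this patch (see the blk-mq.c hunk
below), annotated:

	tag = blk_mq_get_tag(hctx->tags, gfp, reserved);
	if (tag != BLK_MQ_TAG_FAIL) {
		rq = hctx->rqs[tag];
		/*
		 * Once tags are shared the request may last have been used
		 * by another queue, so (re)initialize it here rather than
		 * at free time.
		 */
		blk_rq_init(hctx->queue, rq);
		rq->tag = tag;
		return rq;
	}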

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-mq.c |    4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 871acd6..ec0c276 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -82,6 +82,7 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx,
 	tag = blk_mq_get_tag(hctx->tags, gfp, reserved);
 	if (tag != BLK_MQ_TAG_FAIL) {
 		rq = hctx->rqs[tag];
+		blk_rq_init(hctx->queue, rq);
 		rq->tag = tag;
 
 		return rq;
@@ -254,9 +255,7 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
 	const int tag = rq->tag;
 	struct request_queue *q = rq->q;
 
-	blk_rq_init(hctx->queue, rq);
 	blk_mq_put_tag(hctx->tags, tag);
-
 	blk_mq_queue_exit(q);
 }
 
@@ -1128,7 +1127,6 @@ static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx,
 		left -= to_do * rq_size;
 		for (j = 0; j < to_do; j++) {
 			hctx->rqs[i] = p;
-			blk_rq_init(hctx->queue, hctx->rqs[i]);
 			p += rq_size;
 			i++;
 		}
-- 
1.7.10.4



* [PATCH 3/4] blk-mq: move request structures into struct blk_mq_tags
  2014-03-31 14:46 [RFC] blk-mq: support for shared tags Christoph Hellwig
  2014-03-31 14:46 ` [PATCH 1/4] blk-mq: stop pre-initializing req->special Christoph Hellwig
  2014-03-31 14:46 ` [PATCH 2/4] blk-mq: initialize request on allocation Christoph Hellwig
@ 2014-03-31 14:46 ` Christoph Hellwig
  2014-04-09 16:23     ` Jens Axboe
  2014-03-31 14:46 ` [PATCH 4/4] blk-mq: support shared tag maps Christoph Hellwig
  2014-04-02  0:16 ` [RFC] blk-mq: support for shared tags Matias Bjorling
  4 siblings, 1 reply; 20+ messages in thread
From: Christoph Hellwig @ 2014-03-31 14:46 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Matias Bjorling, linux-kernel, linux-scsi

This is in preparation for allowing the tags, and thus request
allocation, to be shared between multiple queues.

Also remove blk_mq_tag_to_rq, as it was unused and thus untestable.  If we
need it back it can easily be re-added as a non-inline function.

Note that we also now outright fail queue initialization if we can't
allocate tags - keeping track of a reduced queue_depth over a more complex
call chain isn't easily possible, and this shouldn't happen on any of today's
systems.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-mq-tag.c     |   13 --------
 block/blk-mq.c         |   84 +++++++++++++++++++++++++-----------------------
 block/blk-mq.h         |   18 +++++++++++
 include/linux/blk-mq.h |    8 -----
 4 files changed, 61 insertions(+), 62 deletions(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 83ae96c..108f82b 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -7,19 +7,6 @@
 #include "blk-mq.h"
 #include "blk-mq-tag.h"
 
-/*
- * Per tagged queue (tag address space) map
- */
-struct blk_mq_tags {
-	unsigned int nr_tags;
-	unsigned int nr_reserved_tags;
-	unsigned int nr_batch_move;
-	unsigned int nr_max_cache;
-
-	struct percpu_ida free_tags;
-	struct percpu_ida reserved_tags;
-};
-
 void blk_mq_wait_for_tags(struct blk_mq_tags *tags)
 {
 	int tag = blk_mq_get_tag(tags, __GFP_WAIT, false);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index ec0c276..f1b5d52 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -81,7 +81,7 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx,
 
 	tag = blk_mq_get_tag(hctx->tags, gfp, reserved);
 	if (tag != BLK_MQ_TAG_FAIL) {
-		rq = hctx->rqs[tag];
+		rq = hctx->tags->rqs[tag];
 		blk_rq_init(hctx->queue, rq);
 		rq->tag = tag;
 
@@ -406,7 +406,9 @@ static void blk_mq_timeout_check(void *__data, unsigned long *free_tags)
 		if (tag >= hctx->queue_depth)
 			break;
 
-		rq = hctx->rqs[tag++];
+		rq = hctx->tags->rqs[tag++];
+		if (rq->q != hctx->queue)
+			continue;
 
 		if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
 			continue;
@@ -993,7 +995,7 @@ static int blk_mq_init_hw_commands(struct blk_mq_hw_ctx *hctx,
 	int ret = 0;
 
 	for (i = 0; i < hctx->queue_depth; i++) {
-		struct request *rq = hctx->rqs[i];
+		struct request *rq = hctx->tags->rqs[i];
 
 		ret = init(data, hctx, rq, i);
 		if (ret)
@@ -1030,7 +1032,7 @@ static void blk_mq_free_hw_commands(struct blk_mq_hw_ctx *hctx,
 	unsigned int i;
 
 	for (i = 0; i < hctx->queue_depth; i++) {
-		struct request *rq = hctx->rqs[i];
+		struct request *rq = hctx->tags->rqs[i];
 
 		free(data, hctx, rq, i);
 	}
@@ -1049,20 +1051,19 @@ void blk_mq_free_commands(struct request_queue *q,
 }
 EXPORT_SYMBOL(blk_mq_free_commands);
 
-static void blk_mq_free_rq_map(struct blk_mq_hw_ctx *hctx)
+static void blk_mq_free_rq_map(struct blk_mq_tags *tags)
 {
 	struct page *page;
 
-	while (!list_empty(&hctx->page_list)) {
-		page = list_first_entry(&hctx->page_list, struct page, lru);
+	while (!list_empty(&tags->page_list)) {
+		page = list_first_entry(&tags->page_list, struct page, lru);
 		list_del_init(&page->lru);
 		__free_pages(page, page->private);
 	}
 
-	kfree(hctx->rqs);
+	kfree(tags->rqs);
 
-	if (hctx->tags)
-		blk_mq_free_tags(hctx->tags);
+	blk_mq_free_tags(tags);
 }
 
 static size_t order_to_size(unsigned int order)
@@ -1075,28 +1076,35 @@ static size_t order_to_size(unsigned int order)
 	return ret;
 }
 
-static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx,
-			      unsigned int reserved_tags, int node)
+static struct blk_mq_tags *blk_mq_init_rq_map(unsigned int total_tags,
+		unsigned int reserved_tags, unsigned int cmd_size, int node)
 {
+	struct blk_mq_tags *tags;
 	unsigned int i, j, entries_per_page, max_order = 4;
 	size_t rq_size, left;
 
-	INIT_LIST_HEAD(&hctx->page_list);
+	tags = blk_mq_init_tags(total_tags, reserved_tags, node);
+	if (!tags)
+		return NULL;
+
+	INIT_LIST_HEAD(&tags->page_list);
 
-	hctx->rqs = kmalloc_node(hctx->queue_depth * sizeof(struct request *),
+	tags->rqs = kmalloc_node(total_tags * sizeof(struct request *),
 					GFP_KERNEL, node);
-	if (!hctx->rqs)
-		return -ENOMEM;
+	if (!tags->rqs) {
+		blk_mq_free_tags(tags);
+		return NULL;
+	}
 
 	/*
 	 * rq_size is the size of the request plus driver payload, rounded
 	 * to the cacheline size
 	 */
-	rq_size = round_up(sizeof(struct request) + hctx->cmd_size,
+	rq_size = round_up(sizeof(struct request) + cmd_size,
 				cache_line_size());
-	left = rq_size * hctx->queue_depth;
+	left = rq_size * total_tags;
 
-	for (i = 0; i < hctx->queue_depth;) {
+	for (i = 0; i < total_tags; ) {
 		int this_order = max_order;
 		struct page *page;
 		int to_do;
@@ -1116,38 +1124,28 @@ static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx,
 		} while (1);
 
 		if (!page)
-			break;
+			goto fail;
 
 		page->private = this_order;
-		list_add_tail(&page->lru, &hctx->page_list);
+		list_add_tail(&page->lru, &tags->page_list);
 
 		p = page_address(page);
 		entries_per_page = order_to_size(this_order) / rq_size;
-		to_do = min(entries_per_page, hctx->queue_depth - i);
+		to_do = min(entries_per_page, total_tags - i);
 		left -= to_do * rq_size;
 		for (j = 0; j < to_do; j++) {
-			hctx->rqs[i] = p;
+			tags->rqs[i] = p;
 			p += rq_size;
 			i++;
 		}
 	}
 
-	if (i < (reserved_tags + BLK_MQ_TAG_MIN))
-		goto err_rq_map;
-	else if (i != hctx->queue_depth) {
-		hctx->queue_depth = i;
-		pr_warn("%s: queue depth set to %u because of low memory\n",
-					__func__, i);
-	}
-
-	hctx->tags = blk_mq_init_tags(hctx->queue_depth, reserved_tags, node);
-	if (!hctx->tags) {
-err_rq_map:
-		blk_mq_free_rq_map(hctx);
-		return -ENOMEM;
-	}
+	return tags;
 
-	return 0;
+fail:
+	pr_warn("%s: failed to allocate requests\n", __func__);
+	blk_mq_free_rq_map(tags);
+	return NULL;
 }
 
 static int blk_mq_init_hw_queues(struct request_queue *q,
@@ -1180,7 +1178,9 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
 						blk_mq_hctx_notify, hctx);
 		blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
 
-		if (blk_mq_init_rq_map(hctx, reg->reserved_tags, node))
+		hctx->tags = blk_mq_init_rq_map(hctx->queue_depth,
+				reg->reserved_tags, reg->cmd_size, node);
+		if (!hctx->tags)
 			break;
 
 		/*
@@ -1220,7 +1220,8 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
 			reg->ops->exit_hctx(hctx, j);
 
 		blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
-		blk_mq_free_rq_map(hctx);
+		if (hctx->tags)
+			blk_mq_free_rq_map(hctx->tags);
 		kfree(hctx->ctxs);
 	}
 
@@ -1397,7 +1398,8 @@ void blk_mq_free_queue(struct request_queue *q)
 	queue_for_each_hw_ctx(q, hctx, i) {
 		kfree(hctx->ctx_map);
 		kfree(hctx->ctxs);
-		blk_mq_free_rq_map(hctx);
+		if (hctx->tags)
+			blk_mq_free_rq_map(hctx->tags);
 		blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
 		if (q->mq_ops->exit_hctx)
 			q->mq_ops->exit_hctx(hctx, i);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 238379a..eca3a47 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -1,6 +1,24 @@
 #ifndef INT_BLK_MQ_H
 #define INT_BLK_MQ_H
 
+#include <linux/percpu_ida.h>
+
+/*
+ * Tag address space map.
+ */
+struct blk_mq_tags {
+	unsigned int nr_tags;
+	unsigned int nr_reserved_tags;
+	unsigned int nr_batch_move;
+	unsigned int nr_max_cache;
+
+	struct percpu_ida free_tags;
+	struct percpu_ida reserved_tags;
+
+	struct request **rqs;
+	struct list_head page_list;
+};
+
 struct blk_mq_ctx {
 	struct {
 		spinlock_t		lock;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 0120451..69aa3ad 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -32,8 +32,6 @@ struct blk_mq_hw_ctx {
 	unsigned int 		nr_ctx_map;
 	unsigned long		*ctx_map;
 
-	struct request		**rqs;
-	struct list_head	page_list;
 	struct blk_mq_tags	*tags;
 
 	unsigned long		queued;
@@ -162,12 +160,6 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq)
 	return (void *) rq + sizeof(*rq);
 }
 
-static inline struct request *blk_mq_tag_to_rq(struct blk_mq_hw_ctx *hctx,
-					       unsigned int tag)
-{
-	return hctx->rqs[tag];
-}
-
 #define queue_for_each_hw_ctx(q, hctx, i)				\
 	for ((i) = 0; (i) < (q)->nr_hw_queues &&			\
 	     ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++)
-- 
1.7.10.4



* [PATCH 4/4] blk-mq: support shared tag maps
  2014-03-31 14:46 [RFC] blk-mq: support for shared tags Christoph Hellwig
                   ` (2 preceding siblings ...)
  2014-03-31 14:46 ` [PATCH 3/4] blk-mq: move request structures into struct blk_mq_tags Christoph Hellwig
@ 2014-03-31 14:46 ` Christoph Hellwig
  2014-04-02  0:16 ` [RFC] blk-mq: support for shared tags Matias Bjorling
  4 siblings, 0 replies; 20+ messages in thread
From: Christoph Hellwig @ 2014-03-31 14:46 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Matias Bjorling, linux-kernel, linux-scsi

---
 block/blk-mq-tag.c     |    2 ++
 block/blk-mq.c         |   83 +++++++++++++++++++++++++++++++++++++++++++-----
 block/blk-mq.h         |    2 ++
 include/linux/blk-mq.h |   12 +++++++
 4 files changed, 91 insertions(+), 8 deletions(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 108f82b..a7b1888 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -121,6 +121,8 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
 	if (!tags)
 		return NULL;
 
+	kref_init(&tags->ref_count);
+
 	nr_tags = total_tags - reserved_tags;
 	nr_cache = nr_tags / num_possible_cpus();
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index f1b5d52..3d63d71 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1051,8 +1051,10 @@ void blk_mq_free_commands(struct request_queue *q,
 }
 EXPORT_SYMBOL(blk_mq_free_commands);
 
-static void blk_mq_free_rq_map(struct blk_mq_tags *tags)
+static void blk_mq_free_rq_map(struct kref *kref)
 {
+	struct blk_mq_tags *tags =
+		container_of(kref, struct blk_mq_tags, ref_count);
 	struct page *page;
 
 	while (!list_empty(&tags->page_list)) {
@@ -1066,6 +1068,17 @@ static void blk_mq_free_rq_map(struct blk_mq_tags *tags)
 	blk_mq_free_tags(tags);
 }
 
+static void blk_mq_put_rq_map(struct blk_mq_tags *tags)
+{
+	kref_put(&tags->ref_count, blk_mq_free_rq_map);
+}
+
+static struct blk_mq_tags *blk_mq_get_rq_map(struct blk_mq_tags *tags)
+{
+	kref_get(&tags->ref_count);
+	return tags;
+}
+
 static size_t order_to_size(unsigned int order)
 {
 	size_t ret = PAGE_SIZE;
@@ -1144,7 +1157,7 @@ static struct blk_mq_tags *blk_mq_init_rq_map(unsigned int total_tags,
 
 fail:
 	pr_warn("%s: failed to allocate requests\n", __func__);
-	blk_mq_free_rq_map(tags);
+	blk_mq_put_rq_map(tags);
 	return NULL;
 }
 
@@ -1178,10 +1191,14 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
 						blk_mq_hctx_notify, hctx);
 		blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
 
-		hctx->tags = blk_mq_init_rq_map(hctx->queue_depth,
-				reg->reserved_tags, reg->cmd_size, node);
-		if (!hctx->tags)
-			break;
+		if (reg->shared_tags) {
+			hctx->tags = blk_mq_get_rq_map(reg->shared_tags->tags[i]);
+		} else {
+			hctx->tags = blk_mq_init_rq_map(hctx->queue_depth,
+					reg->reserved_tags, reg->cmd_size, node);
+			if (!hctx->tags)
+				break;
+		}
 
 		/*
 		 * Allocate space for all possible cpus to avoid allocation in
@@ -1221,7 +1238,7 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
 
 		blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
 		if (hctx->tags)
-			blk_mq_free_rq_map(hctx->tags);
+			blk_mq_put_rq_map(hctx->tags);
 		kfree(hctx->ctxs);
 	}
 
@@ -1399,7 +1416,7 @@ void blk_mq_free_queue(struct request_queue *q)
 		kfree(hctx->ctx_map);
 		kfree(hctx->ctxs);
 		if (hctx->tags)
-			blk_mq_free_rq_map(hctx->tags);
+			blk_mq_put_rq_map(hctx->tags);
 		blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
 		if (q->mq_ops->exit_hctx)
 			q->mq_ops->exit_hctx(hctx, i);
@@ -1459,6 +1476,56 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
 	return NOTIFY_OK;
 }
 
+struct blk_mq_shared_tags *blk_mq_alloc_shared_tags(struct blk_mq_reg *reg,
+		int (*init)(void *, struct request *), void *data)
+{
+	struct blk_mq_shared_tags *shared_tags;
+	int i, j;
+
+	shared_tags = kmalloc_node(sizeof(*shared_tags) +
+				reg->nr_hw_queues * sizeof(struct blk_mq_tags *),
+				GFP_KERNEL, reg->numa_node);
+	if (!shared_tags)
+		goto out;
+
+	shared_tags->nr_hw_queues = reg->nr_hw_queues;
+	shared_tags->queue_depth = reg->queue_depth;
+	for (i = 0; i < reg->nr_hw_queues; i++) {
+		shared_tags->tags[i] = blk_mq_init_rq_map(reg->queue_depth,
+				reg->reserved_tags, reg->cmd_size, reg->numa_node);
+		if (!shared_tags->tags[i])
+			goto out_unwind;
+
+		for (j = 0; j < reg->queue_depth; j++) {
+			struct request *rq = shared_tags->tags[i]->rqs[j];
+			int ret;
+
+			ret = init(data, rq);
+			BUG_ON(ret);
+		}
+	}
+
+	return shared_tags;
+
+out_unwind:
+	while (--i >= 0)
+		blk_mq_put_rq_map(shared_tags->tags[i]);
+out:
+	return NULL;
+}
+
+void blk_mq_free_shared_tags(struct blk_mq_shared_tags *shared_tags,
+		void (*exit)(void *, struct request *), void *data)
+{
+	int i, j;
+
+	for (i = 0; i < shared_tags->nr_hw_queues; i++) {
+		for (j = 0; j < shared_tags->queue_depth; j++)
+			exit(data, shared_tags->tags[i]->rqs[j]);
+		blk_mq_put_rq_map(shared_tags->tags[i]);
+	}
+}
+
 void blk_mq_disable_hotplug(void)
 {
 	mutex_lock(&all_q_mutex);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index eca3a47..d7c753b 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -17,6 +17,8 @@ struct blk_mq_tags {
 
 	struct request **rqs;
 	struct list_head page_list;
+
+	struct kref ref_count;
 };
 
 struct blk_mq_ctx {
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 69aa3ad..a564167 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -47,6 +47,12 @@ struct blk_mq_hw_ctx {
 	struct kobject		kobj;
 };
 
+struct blk_mq_shared_tags {
+	unsigned int		nr_hw_queues;
+	unsigned int		queue_depth;
+	struct blk_mq_tags	*tags[];
+};
+
 struct blk_mq_reg {
 	struct blk_mq_ops	*ops;
 	unsigned int		nr_hw_queues;
@@ -56,6 +62,7 @@ struct blk_mq_reg {
 	int			numa_node;
 	unsigned int		timeout;
 	unsigned int		flags;		/* BLK_MQ_F_* */
+	struct blk_mq_shared_tags *shared_tags;
 };
 
 typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *);
@@ -118,6 +125,11 @@ void blk_mq_unregister_disk(struct gendisk *);
 int blk_mq_init_commands(struct request_queue *, int (*init)(void *data, struct blk_mq_hw_ctx *, struct request *, unsigned int), void *data);
 void blk_mq_free_commands(struct request_queue *, void (*free)(void *data, struct blk_mq_hw_ctx *, struct request *, unsigned int), void *data);
 
+struct blk_mq_shared_tags *blk_mq_alloc_shared_tags(struct blk_mq_reg *reg,
+		int (*init)(void *, struct request *), void *data);
+void blk_mq_free_shared_tags(struct blk_mq_shared_tags *shared_tags,
+		void (*exit)(void *, struct request *), void *data);
+
 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
 
 void blk_mq_insert_request(struct request *, bool, bool, bool);
-- 
1.7.10.4



* Re: [RFC] blk-mq: support for shared tags
  2014-03-31 14:46 [RFC] blk-mq: support for shared tags Christoph Hellwig
                   ` (3 preceding siblings ...)
  2014-03-31 14:46 ` [PATCH 4/4] blk-mq: support shared tag maps Christoph Hellwig
@ 2014-04-02  0:16 ` Matias Bjorling
  2014-04-02  7:46   ` Christoph Hellwig
  4 siblings, 1 reply; 20+ messages in thread
From: Matias Bjorling @ 2014-04-02  0:16 UTC (permalink / raw)
  To: Christoph Hellwig, Jens Axboe; +Cc: linux-kernel, linux-scsi

On 03/31/2014 07:46 AM, Christoph Hellwig wrote:
> This series adds support for sharing tags (and thus requests) between
> multiple request_queues.  We'll need this for SCSI, and I think Martin
> also wants something similar for nvme.
> 
> Besides the mess with request constructors/destructors, the major RFC question
> here is what the blk_mq_alloc_shared_tags API should look like.  For now I've
> been lazy and reused struct blk_mq_reg, but that feels a bit cumbersome.
> Either a separate blk_mq_tags_reg or just passing the few arguments directly
> would work fine for me.
> 

Hi Christoph,

Can you rebase it on top of 3.14? I have trouble applying it for testing.

For nvme, there's a need for two separate types of queues: the admin queue
(before initializing blk-mq) and the actual hardware queues.

Should we allow the driver to get/put tags before initializing blk-mq?
Or let drivers implement their own framework?

Thanks,
Matias


* Re: [RFC] blk-mq: support for shared tags
  2014-04-02  0:16 ` [RFC] blk-mq: support for shared tags Matias Bjorling
@ 2014-04-02  7:46   ` Christoph Hellwig
  2014-04-03  4:10     ` Matias Bjorling
  0 siblings, 1 reply; 20+ messages in thread
From: Christoph Hellwig @ 2014-04-02  7:46 UTC (permalink / raw)
  To: Matias Bjorling; +Cc: Jens Axboe, linux-kernel, linux-scsi

On Tue, Apr 01, 2014 at 05:16:21PM -0700, Matias Bjorling wrote:
> Hi Christoph,
> 
> Can you rebase it on top of 3.14? I have trouble applying it for testing.

Hi Matias,

the series is based on top of Jens' for-next branch.  I've also pushed out a
git tree to the blk-mq-share-tags.2 branch of

	git://git.infradead.org/users/hch/scsi.git

to make testing and reviewing easier.

> For nvme, there's a need for two separate types of queues: the admin queue
> (before initializing blk-mq) and the actual hardware queues.
> 
> Should we allow the driver to get/put tags before initializing blk-mq?
> Or let drivers implement their own framework?

What do you mean by initializing blk-mq?  We need to allocate data
structures for sure, and I don't see much else in terms of initialization
in blk-mq.



* Re: [RFC] blk-mq: support for shared tags
  2014-04-02  7:46   ` Christoph Hellwig
@ 2014-04-03  4:10     ` Matias Bjorling
  2014-04-03  7:36       ` Christoph Hellwig
  0 siblings, 1 reply; 20+ messages in thread
From: Matias Bjorling @ 2014-04-03  4:10 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Jens Axboe, linux-kernel, linux-scsi

On 04/02/2014 12:46 AM, Christoph Hellwig wrote:
> On Tue, Apr 01, 2014 at 05:16:21PM -0700, Matias Bjorling wrote:
>> Hi Christoph,
>>
>> Can you rebase it on top of 3.14? I have trouble applying it for testing.
> 
> Hi Matias,
> 
> the series is based on top of Jens' for-next branch.  I've also pushed out a
> git tree to the blk-mq-share-tags.2 branch of
> 
> 	git://git.infradead.org/users/hch/scsi.git
> 
> to make testing and reviewing easier.
>

Thanks.

Regarding the tags API: I think the best approach is a struct
blk_mq_tags_reg.  That'll make the tag parameters very visible in the
drivers.  I'll send a patch with the change, using the nvme driver as an
example.

>> For nvme, there's a need for two separate types of queues: the admin queue
>> (before initializing blk-mq) and the actual hardware queues.
>>
>> Should we allow the driver to get/put tags before initializing blk-mq?
>> Or let drivers implement their own framework?
> 
> What do you mean by initializing blk-mq?  We need to allocate data
> structures for sure, and I don't see much else in terms of initialization
> in blk-mq.
> 

For the nvme driver, there's a single admin queue, which is outside
blk-mq's control, and the X normal queues. Should we allow the shared
tags structure to be used (get/put) for the admin queue, without
initializing blk-mq? Or should the drivers simply implement their own
tags for their admin queue?





* Re: [RFC] blk-mq: support for shared tags
  2014-04-03  4:10     ` Matias Bjorling
@ 2014-04-03  7:36       ` Christoph Hellwig
  2014-04-03 16:45         ` Matias Bjorling
  0 siblings, 1 reply; 20+ messages in thread
From: Christoph Hellwig @ 2014-04-03  7:36 UTC (permalink / raw)
  To: Matias Bjorling; +Cc: Jens Axboe, linux-kernel, linux-scsi

On Wed, Apr 02, 2014 at 09:10:12PM -0700, Matias Bjorling wrote:
> For the nvme driver, there's a single admin queue, which is outside
> blk-mq's control, and the X normal queues. Should we allow the shared
> tags structure to be used (get/put) for the admin queue, without
> initializing blk-mq? Or should the drivers simply implement their own
> tags for their admin queue?

I'd still create a request_queue for the internal queue, just not register
a block device for it.  For example SCSI sets up queues for each LUN
found, but only a subset actually is exposed as a block device.
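
A minimal sketch of that, assuming a driver-private admin_reg/admin_q (the
names are illustrative, not from this series):

	/* a queue for internal/admin commands: no gendisk, no add_disk() */
	dev->admin_q = blk_mq_init_queue(&admin_reg, dev);
	if (IS_ERR(dev->admin_q))
		return PTR_ERR(dev->admin_q);

	/* internal commands then come straight from request allocation */
	rq = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false);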



* Re: [RFC] blk-mq: support for shared tags
  2014-04-03  7:36       ` Christoph Hellwig
@ 2014-04-03 16:45         ` Matias Bjorling
  2014-04-03 18:01           ` Christoph Hellwig
  0 siblings, 1 reply; 20+ messages in thread
From: Matias Bjorling @ 2014-04-03 16:45 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Jens Axboe, linux-kernel, linux-scsi

On 04/03/2014 12:36 AM, Christoph Hellwig wrote:
> On Wed, Apr 02, 2014 at 09:10:12PM -0700, Matias Bjorling wrote:
>> For the nvme driver, there's a single admin queue, which is outside
>> blk-mq's control, and the X normal queues. Should we allow the shared
>> tags structure to be used (get/put) for the admin queue, without
>> initializing blk-mq? Or should the drivers simply implement their own
>> tags for their admin queue?
> 
> I'd still create a request_queue for the internal queue, just not register
> a block device for it.  For example SCSI sets up queues for each LUN
> found, but only a subset actually is exposed as a block device.
> 

Ok, that is good enough for now, though it's a little heavy on the overhead
side if only the tag logic is needed.

What about the following suggestions for shared tags:

1. Rename it from blk_mq_shared_tags to blk_mq_tag_group. A driver can
have several tag groups that it maintains.
2. Instead of the blk_mq_shared_tags structure in blk_mq_reg, have a function
pointer for getting the tags structure during hctx initialization (rough
sketch below).  This is interesting for nvme, because it has a set of tags
for each hardware queue it exposes.
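
Something like this for 2., hypothetically (get_tags is a made-up name):

	struct blk_mq_reg {
		...
		/* let the driver hand back the tags for a given hw queue */
		struct blk_mq_tags *(*get_tags)(void *driver_data,
						unsigned int hctx_index);
	};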

Thanks,
Matias


* Re: [RFC] blk-mq: support for shared tags
  2014-04-03 16:45         ` Matias Bjorling
@ 2014-04-03 18:01           ` Christoph Hellwig
  2014-04-03 21:47             ` Matias Bjorling
  0 siblings, 1 reply; 20+ messages in thread
From: Christoph Hellwig @ 2014-04-03 18:01 UTC (permalink / raw)
  To: Matias Bjorling; +Cc: Jens Axboe, linux-kernel, linux-scsi

On Thu, Apr 03, 2014 at 09:45:11AM -0700, Matias Bjorling wrote:
> > I'd still create a request_queue for the internal queue, just not register
> > a block device for it.  For example SCSI sets up queues for each LUN
> > found, but only a subset actually is exposed as a block device.
> > 
> 
> Ok. That is good enough for now. A little heavy on the overhead side, if
> only the tag logic is needed.
> 
> What about the following suggestions for shared tags:
> 
> 1. Rename it from blk_mq_shared_tags to blk_mq_tag_group. A driver can
> have several tag groups that it maintains.

I was going to rename it to tag_set, but tag_group sounds fine to me as well.

> 2. Instead of the blk_mq_shared_tags structure in blk_mq_reg, have a function
> pointer for getting the tags structure during hctx initialization.  This
> is interesting for nvme, because it has a set of tags for each hardware
> queue it exposes.

The current code also has an array of blk_mq_tags structures, one for
each queue.  Do you need a more complicated mapping than that?

Btw, I was also going to simply split out the tag allocation from the
queue registration unconditionally.  While this adds a little more
boilerplate to simple drivers it avoids conditional code paths and should
make the model much easier to understand.  I should have a new version
of the patches soon.



* Re: [RFC] blk-mq: support for shared tags
  2014-04-03 18:01           ` Christoph Hellwig
@ 2014-04-03 21:47             ` Matias Bjorling
  2014-04-04 15:19               ` Christoph Hellwig
  0 siblings, 1 reply; 20+ messages in thread
From: Matias Bjorling @ 2014-04-03 21:47 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Jens Axboe, linux-kernel, linux-scsi

On 04/03/2014 11:01 AM, Christoph Hellwig wrote:
> On Thu, Apr 03, 2014 at 09:45:11AM -0700, Matias Bjorling wrote:
>>> I'd still create a request_queue for the internal queue, just not register
>>> a block device for it.  For example SCSI sets up queues for each LUN
>>> found, but only a subset actually is exposed as a block device.
>>>
>>
>> Ok. That is good enough for now. A little heavy on the overhead side, if
>> only the tag logic is needed.
>>
>> What about the following suggestions for shared tags:
>>
>> 1. Rename it from blk_mq_shared_tags to blk_mq_tag_group. A driver can
>> have several tag groups that it maintains.
> 
> I was going to rename it to tag_set, but tag_group sounds fine to me as well.
>

tag_set is shorter. tag_set it is.

>> 2. Instead of the blk_mq_shared_tags structure in blk_mq_reg, have a function
>> pointer for getting the tags structure during hctx initialization.  This
>> is interesting for nvme, because it has a set of tags for each hardware
>> queue it exposes.
> 
> The current code also has an array of blk_mq_tags structures, one for
> each queue.  Do you need a more complicated mapping than that?
> 

No, that's great. I had misinterpreted the arrays; it's clear now that I
look at it again. Thanks

> Btw, I was also going to simply split out the tag allocation from the
> queue registration unconditionally.  While this adds a little more
> boilerplate to simple drivers it avoids conditional code paths and should
> make the model much easier to understand.  I should have a new version
> of the patches soon.
> 

ack, good idea.



* Re: [RFC] blk-mq: support for shared tags
  2014-04-03 21:47             ` Matias Bjorling
@ 2014-04-04 15:19               ` Christoph Hellwig
  0 siblings, 0 replies; 20+ messages in thread
From: Christoph Hellwig @ 2014-04-04 15:19 UTC (permalink / raw)
  To: Matias Bjorling; +Cc: Jens Axboe, linux-kernel, linux-scsi

Hi Matias,

I've pushed out a new version of the shared tag support to the
blk-mq-share-tags.3 branch of

  git://git.infradead.org/users/hch/scsi.git

and I'm fairly happy with how it turned out.  The new blk_mq_tag_set
structure is now allocated by the driver and fully replaces the old
_reg structure, which most drivers used in a very racy way.  blk_mq_init_queue
now only takes the tag_set as its argument and doesn't take any other
parameters by itself, giving a very simple user interface.
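
Condensed from the virtio_blk conversion in the patch below, the driver-side
flow now is:

	memset(&vblk->tag_set, 0, sizeof(vblk->tag_set));
	vblk->tag_set.ops = &virtio_mq_ops;
	vblk->tag_set.nr_hw_queues = 1;
	vblk->tag_set.queue_depth = 64;
	vblk->tag_set.numa_node = NUMA_NO_NODE;
	vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
	vblk->tag_set.cmd_size = sizeof(struct virtblk_req) +
		sizeof(struct scatterlist) * sg_elems;
	vblk->tag_set.driver_data = vblk;

	err = blk_mq_alloc_tag_set(&vblk->tag_set);
	if (err)
		goto out_put_disk;

	q = vblk->disk->queue = blk_mq_init_queue(&vblk->tag_set);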

I've attached the actual shared tags patch below; besides the patches
I already sent to Jens, the only other remaining one is the unchanged
patch to initialize requests on allocation.

---
From: Christoph Hellwig <hch@lst.de>
Subject: blk-mq: split out tag initialization, support shared tags

Add a new blk_mq_tag_set structure that gets set up before we initialize
the queue.  A single blk_mq_tag_set structure can be shared by multiple
queues.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-mq-cpumap.c      |    6 +-
 block/blk-mq-tag.c         |   13 ---
 block/blk-mq.c             |  241 ++++++++++++++++++++++++--------------------
 block/blk-mq.h             |   23 ++++-
 drivers/block/null_blk.c   |   92 ++++++++++-------
 drivers/block/virtio_blk.c |   39 ++++---
 include/linux/blk-mq.h     |   34 +++----
 7 files changed, 253 insertions(+), 195 deletions(-)

diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 0979213..5d0f93c 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -80,17 +80,17 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
 	return 0;
 }
 
-unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg)
+unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set)
 {
 	unsigned int *map;
 
 	/* If cpus are offline, map them to first hctx */
 	map = kzalloc_node(sizeof(*map) * num_possible_cpus(), GFP_KERNEL,
-				reg->numa_node);
+				set->numa_node);
 	if (!map)
 		return NULL;
 
-	if (!blk_mq_update_queue_map(map, reg->nr_hw_queues))
+	if (!blk_mq_update_queue_map(map, set->nr_hw_queues))
 		return map;
 
 	kfree(map);
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 83ae96c..108f82b 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -7,19 +7,6 @@
 #include "blk-mq.h"
 #include "blk-mq-tag.h"
 
-/*
- * Per tagged queue (tag address space) map
- */
-struct blk_mq_tags {
-	unsigned int nr_tags;
-	unsigned int nr_reserved_tags;
-	unsigned int nr_batch_move;
-	unsigned int nr_max_cache;
-
-	struct percpu_ida free_tags;
-	struct percpu_ida reserved_tags;
-};
-
 void blk_mq_wait_for_tags(struct blk_mq_tags *tags)
 {
 	int tag = blk_mq_get_tag(tags, __GFP_WAIT, false);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index ab8e347..2972855 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -81,7 +81,7 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx,
 
 	tag = blk_mq_get_tag(hctx->tags, gfp, reserved);
 	if (tag != BLK_MQ_TAG_FAIL) {
-		rq = hctx->rqs[tag];
+		rq = hctx->tags->rqs[tag];
 		blk_rq_init(hctx->queue, rq);
 		rq->tag = tag;
 
@@ -401,6 +401,11 @@ static void blk_mq_requeue_request(struct request *rq)
 		rq->nr_phys_segments--;
 }
 
+struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
+{
+	return tags->rqs[tag];
+}
+
 struct blk_mq_timeout_data {
 	struct blk_mq_hw_ctx *hctx;
 	unsigned long *next;
@@ -422,12 +427,13 @@ static void blk_mq_timeout_check(void *__data, unsigned long *free_tags)
 	do {
 		struct request *rq;
 
-		tag = find_next_zero_bit(free_tags, hctx->queue_depth, tag);
-		if (tag >= hctx->queue_depth)
+		tag = find_next_zero_bit(free_tags, hctx->tags->nr_tags, tag);
+		if (tag >= hctx->tags->nr_tags)
 			break;
 
-		rq = hctx->rqs[tag++];
-
+		rq = blk_mq_tag_to_rq(hctx->tags, tag++);
+		if (rq->q != hctx->queue)
+			continue;
 		if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
 			continue;
 
@@ -947,11 +953,11 @@ struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu)
 }
 EXPORT_SYMBOL(blk_mq_map_queue);
 
-struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *reg,
+struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *set,
 						   unsigned int hctx_index)
 {
 	return kmalloc_node(sizeof(struct blk_mq_hw_ctx),
-				GFP_KERNEL | __GFP_ZERO, reg->numa_node);
+				GFP_KERNEL | __GFP_ZERO, set->numa_node);
 }
 EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue);
 
@@ -1004,31 +1010,31 @@ static void blk_mq_hctx_notify(void *data, unsigned long action,
 	blk_mq_put_ctx(ctx);
 }
 
-static void blk_mq_free_rq_map(struct blk_mq_hw_ctx *hctx, void *driver_data)
+static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
+		struct blk_mq_tags *tags, unsigned int hctx_idx)
 {
 	struct page *page;
 
-	if (hctx->rqs && hctx->queue->mq_ops->exit_request) {
+	if (tags->rqs && set->ops->exit_request) {
 		int i;
 
-		for (i = 0; i < hctx->queue_depth; i++) {
-			if (!hctx->rqs[i])
+		for (i = 0; i < tags->nr_tags; i++) {
+			if (!tags->rqs[i])
 				continue;
-			hctx->queue->mq_ops->exit_request(driver_data, hctx,
-							  hctx->rqs[i], i);
+			set->ops->exit_request(set->driver_data, tags->rqs[i],
+						hctx_idx, i);
 		}
 	}
 
-	while (!list_empty(&hctx->page_list)) {
-		page = list_first_entry(&hctx->page_list, struct page, lru);
+	while (!list_empty(&tags->page_list)) {
+		page = list_first_entry(&tags->page_list, struct page, lru);
 		list_del_init(&page->lru);
 		__free_pages(page, page->private);
 	}
 
-	kfree(hctx->rqs);
+	kfree(tags->rqs);
 
-	if (hctx->tags)
-		blk_mq_free_tags(hctx->tags);
+	blk_mq_free_tags(tags);
 }
 
 static size_t order_to_size(unsigned int order)
@@ -1041,30 +1047,36 @@ static size_t order_to_size(unsigned int order)
 	return ret;
 }
 
-static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx,
-		struct blk_mq_reg *reg, void *driver_data, int node)
+static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
+		unsigned int hctx_idx)
 {
-	unsigned int reserved_tags = reg->reserved_tags;
+	struct blk_mq_tags *tags;
 	unsigned int i, j, entries_per_page, max_order = 4;
 	size_t rq_size, left;
-	int error;
 
-	INIT_LIST_HEAD(&hctx->page_list);
+	tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags,
+				set->numa_node);
+	if (!tags)
+		return NULL;
 
-	hctx->rqs = kmalloc_node(hctx->queue_depth * sizeof(struct request *),
-					GFP_KERNEL, node);
-	if (!hctx->rqs)
-		return -ENOMEM;
+	INIT_LIST_HEAD(&tags->page_list);
+
+	tags->rqs = kmalloc_node(set->queue_depth * sizeof(struct request *),
+					GFP_KERNEL, set->numa_node);
+	if (!tags->rqs) {
+		blk_mq_free_tags(tags);
+		return NULL;
+	}
 
 	/*
 	 * rq_size is the size of the request plus driver payload, rounded
 	 * to the cacheline size
 	 */
-	rq_size = round_up(sizeof(struct request) + hctx->cmd_size,
+	rq_size = round_up(sizeof(struct request) + set->cmd_size,
 				cache_line_size());
-	left = rq_size * hctx->queue_depth;
+	left = rq_size * set->queue_depth;
 
-	for (i = 0; i < hctx->queue_depth;) {
+	for (i = 0; i < set->queue_depth; ) {
 		int this_order = max_order;
 		struct page *page;
 		int to_do;
@@ -1074,7 +1086,8 @@ static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx,
 			this_order--;
 
 		do {
-			page = alloc_pages_node(node, GFP_KERNEL, this_order);
+			page = alloc_pages_node(set->numa_node, GFP_KERNEL,
+						this_order);
 			if (page)
 				break;
 			if (!this_order--)
@@ -1084,22 +1097,22 @@ static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx,
 		} while (1);
 
 		if (!page)
-			break;
+			goto fail;
 
 		page->private = this_order;
-		list_add_tail(&page->lru, &hctx->page_list);
+		list_add_tail(&page->lru, &tags->page_list);
 
 		p = page_address(page);
 		entries_per_page = order_to_size(this_order) / rq_size;
-		to_do = min(entries_per_page, hctx->queue_depth - i);
+		to_do = min(entries_per_page, set->queue_depth - i);
 		left -= to_do * rq_size;
 		for (j = 0; j < to_do; j++) {
-			hctx->rqs[i] = p;
-			if (reg->ops->init_request) {
-				error = reg->ops->init_request(driver_data,
-						hctx, hctx->rqs[i], i);
-				if (error)
-					goto err_rq_map;
+			tags->rqs[i] = p;
+			if (set->ops->init_request) {
+				if (set->ops->init_request(set->driver_data,
+						tags->rqs[i], hctx_idx, i,
+						set->numa_node))
+					goto fail;
 			}
 
 			p += rq_size;
@@ -1107,30 +1120,16 @@ static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx,
 		}
 	}
 
-	if (i < (reserved_tags + BLK_MQ_TAG_MIN)) {
-		error = -ENOMEM;
-		goto err_rq_map;
-	}
-	if (i != hctx->queue_depth) {
-		hctx->queue_depth = i;
-		pr_warn("%s: queue depth set to %u because of low memory\n",
-					__func__, i);
-	}
+	return tags;
 
-	hctx->tags = blk_mq_init_tags(hctx->queue_depth, reserved_tags, node);
-	if (!hctx->tags) {
-		error = -ENOMEM;
-		goto err_rq_map;
-	}
-
-	return 0;
-err_rq_map:
-	blk_mq_free_rq_map(hctx, driver_data);
-	return error;
+fail:
+	pr_warn("%s: failed to allocate requests\n", __func__);
+	blk_mq_free_rq_map(set, tags, hctx_idx);
+	return NULL;
 }
 
 static int blk_mq_init_hw_queues(struct request_queue *q,
-				 struct blk_mq_reg *reg, void *driver_data)
+		struct blk_mq_tag_set *set)
 {
 	struct blk_mq_hw_ctx *hctx;
 	unsigned int i, j;
@@ -1144,23 +1143,21 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
 
 		node = hctx->numa_node;
 		if (node == NUMA_NO_NODE)
-			node = hctx->numa_node = reg->numa_node;
+			node = hctx->numa_node = set->numa_node;
 
 		INIT_DELAYED_WORK(&hctx->delayed_work, blk_mq_work_fn);
 		spin_lock_init(&hctx->lock);
 		INIT_LIST_HEAD(&hctx->dispatch);
 		hctx->queue = q;
 		hctx->queue_num = i;
-		hctx->flags = reg->flags;
-		hctx->queue_depth = reg->queue_depth;
-		hctx->cmd_size = reg->cmd_size;
+		hctx->flags = set->flags;
+		hctx->cmd_size = set->cmd_size;
 
 		blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
 						blk_mq_hctx_notify, hctx);
 		blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
 
-		if (blk_mq_init_rq_map(hctx, reg, driver_data, node))
-			break;
+		hctx->tags = set->tags[i];
 
 		/*
 		 * Allocate space for all possible cpus to avoid allocation in
@@ -1180,8 +1177,8 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
 		hctx->nr_ctx_map = num_maps;
 		hctx->nr_ctx = 0;
 
-		if (reg->ops->init_hctx &&
-		    reg->ops->init_hctx(hctx, driver_data, i))
+		if (set->ops->init_hctx &&
+		    set->ops->init_hctx(hctx, set->driver_data, i))
 			break;
 	}
 
@@ -1195,11 +1192,10 @@ static int blk_mq_init_hw_queues(struct request_queue *q,
 		if (i == j)
 			break;
 
-		if (reg->ops->exit_hctx)
-			reg->ops->exit_hctx(hctx, j);
+		if (set->ops->exit_hctx)
+			set->ops->exit_hctx(hctx, j);
 
 		blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
-		blk_mq_free_rq_map(hctx, driver_data);
 		kfree(hctx->ctxs);
 	}
 
@@ -1258,41 +1254,25 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 	}
 }
 
-struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg,
-					void *driver_data)
+struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
 {
 	struct blk_mq_hw_ctx **hctxs;
 	struct blk_mq_ctx *ctx;
 	struct request_queue *q;
 	int i;
 
-	if (!reg->nr_hw_queues ||
-	    !reg->ops->queue_rq || !reg->ops->map_queue ||
-	    !reg->ops->alloc_hctx || !reg->ops->free_hctx)
-		return ERR_PTR(-EINVAL);
-
-	if (!reg->queue_depth)
-		reg->queue_depth = BLK_MQ_MAX_DEPTH;
-	else if (reg->queue_depth > BLK_MQ_MAX_DEPTH) {
-		pr_err("blk-mq: queuedepth too large (%u)\n", reg->queue_depth);
-		reg->queue_depth = BLK_MQ_MAX_DEPTH;
-	}
-
-	if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN))
-		return ERR_PTR(-EINVAL);
-
 	ctx = alloc_percpu(struct blk_mq_ctx);
 	if (!ctx)
 		return ERR_PTR(-ENOMEM);
 
-	hctxs = kmalloc_node(reg->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
-			reg->numa_node);
+	hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
+			set->numa_node);
 
 	if (!hctxs)
 		goto err_percpu;
 
-	for (i = 0; i < reg->nr_hw_queues; i++) {
-		hctxs[i] = reg->ops->alloc_hctx(reg, i);
+	for (i = 0; i < set->nr_hw_queues; i++) {
+		hctxs[i] = set->ops->alloc_hctx(set, i);
 		if (!hctxs[i])
 			goto err_hctxs;
 
@@ -1300,11 +1280,11 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg,
 		hctxs[i]->queue_num = i;
 	}
 
-	q = blk_alloc_queue_node(GFP_KERNEL, reg->numa_node);
+	q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
 	if (!q)
 		goto err_hctxs;
 
-	q->mq_map = blk_mq_make_queue_map(reg);
+	q->mq_map = blk_mq_make_queue_map(set);
 	if (!q->mq_map)
 		goto err_map;
 
@@ -1312,33 +1292,34 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg,
 	blk_queue_rq_timeout(q, 30000);
 
 	q->nr_queues = nr_cpu_ids;
-	q->nr_hw_queues = reg->nr_hw_queues;
+	q->nr_hw_queues = set->nr_hw_queues;
 
 	q->queue_ctx = ctx;
 	q->queue_hw_ctx = hctxs;
 
-	q->mq_ops = reg->ops;
+	q->mq_ops = set->ops;
 	q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
 
 	q->sg_reserved_size = INT_MAX;
 
 	blk_queue_make_request(q, blk_mq_make_request);
-	blk_queue_rq_timed_out(q, reg->ops->timeout);
-	if (reg->timeout)
-		blk_queue_rq_timeout(q, reg->timeout);
+	blk_queue_rq_timed_out(q, set->ops->timeout);
+	if (set->timeout)
+		blk_queue_rq_timeout(q, set->timeout);
 
-	if (reg->ops->complete)
-		blk_queue_softirq_done(q, reg->ops->complete);
+	if (set->ops->complete)
+		blk_queue_softirq_done(q, set->ops->complete);
 
 	blk_mq_init_flush(q);
-	blk_mq_init_cpu_queues(q, reg->nr_hw_queues);
+	blk_mq_init_cpu_queues(q, set->nr_hw_queues);
 
-	q->flush_rq = kzalloc(round_up(sizeof(struct request) + reg->cmd_size,
-				cache_line_size()), GFP_KERNEL);
+	q->flush_rq = kzalloc(round_up(sizeof(struct request) +
+				set->cmd_size, cache_line_size()),
+				GFP_KERNEL);
 	if (!q->flush_rq)
 		goto err_hw;
 
-	if (blk_mq_init_hw_queues(q, reg, driver_data))
+	if (blk_mq_init_hw_queues(q, set))
 		goto err_flush_rq;
 
 	blk_mq_map_swqueue(q);
@@ -1356,10 +1337,10 @@ err_hw:
 err_map:
 	blk_cleanup_queue(q);
 err_hctxs:
-	for (i = 0; i < reg->nr_hw_queues; i++) {
+	for (i = 0; i < set->nr_hw_queues; i++) {
 		if (!hctxs[i])
 			break;
-		reg->ops->free_hctx(hctxs[i], i);
+		set->ops->free_hctx(hctxs[i], i);
 	}
 	kfree(hctxs);
 err_percpu:
@@ -1376,7 +1357,6 @@ void blk_mq_free_queue(struct request_queue *q)
 	queue_for_each_hw_ctx(q, hctx, i) {
 		kfree(hctx->ctx_map);
 		kfree(hctx->ctxs);
-		blk_mq_free_rq_map(hctx, q->queuedata);
 		blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
 		if (q->mq_ops->exit_hctx)
 			q->mq_ops->exit_hctx(hctx, i);
@@ -1436,6 +1416,51 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
 	return NOTIFY_OK;
 }
 
+int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
+{
+	int i;
+
+	if (!set->nr_hw_queues)
+		return -EINVAL;
+	if (!set->queue_depth || set->queue_depth > BLK_MQ_MAX_DEPTH)
+		return -EINVAL;
+	if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
+		return -EINVAL;
+
+	if (!set->nr_hw_queues ||
+	    !set->ops->queue_rq || !set->ops->map_queue ||
+	    !set->ops->alloc_hctx || !set->ops->free_hctx)
+		return -EINVAL;
+
+
+	set->tags = kmalloc_node(set->nr_hw_queues * sizeof(struct blk_mq_tags *),
+				 GFP_KERNEL, set->numa_node);
+	if (!set->tags)
+		goto out;
+
+	for (i = 0; i < set->nr_hw_queues; i++) {
+		set->tags[i] = blk_mq_init_rq_map(set, i);
+		if (!set->tags[i])
+			goto out_unwind;
+	}
+
+	return 0;
+
+out_unwind:
+	while (--i >= 0)
+		blk_mq_free_rq_map(set, set->tags[i], i);
+out:
+	return -ENOMEM;
+}
+
+void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
+{
+	int i;
+
+	for (i = 0; i < set->nr_hw_queues; i++)
+		blk_mq_free_rq_map(set, set->tags[i], i);
+}
+
 void blk_mq_disable_hotplug(void)
 {
 	mutex_lock(&all_q_mutex);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 7964dad..355366e 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -1,6 +1,26 @@
 #ifndef INT_BLK_MQ_H
 #define INT_BLK_MQ_H
 
+#include <linux/percpu_ida.h>
+
+struct blk_mq_tag_set;
+
+/*
+ * Tag address space map.
+ */
+struct blk_mq_tags {
+	unsigned int nr_tags;
+	unsigned int nr_reserved_tags;
+	unsigned int nr_batch_move;
+	unsigned int nr_max_cache;
+
+	struct percpu_ida free_tags;
+	struct percpu_ida reserved_tags;
+
+	struct request **rqs;
+	struct list_head page_list;
+};
+
 struct blk_mq_ctx {
 	struct {
 		spinlock_t		lock;
@@ -46,8 +66,7 @@ void blk_mq_disable_hotplug(void);
 /*
  * CPU -> queue mappings
  */
-struct blk_mq_reg;
-extern unsigned int *blk_mq_make_queue_map(struct blk_mq_reg *reg);
+extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set);
 extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues);
 
 void blk_mq_add_timer(struct request *rq);
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 71df69d..8e7e3a0 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -32,6 +32,7 @@ struct nullb {
 	unsigned int index;
 	struct request_queue *q;
 	struct gendisk *disk;
+	struct blk_mq_tag_set tag_set;
 	struct hrtimer timer;
 	unsigned int queue_depth;
 	spinlock_t lock;
@@ -320,10 +321,11 @@ static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
 	return BLK_MQ_RQ_QUEUE_OK;
 }
 
-static struct blk_mq_hw_ctx *null_alloc_hctx(struct blk_mq_reg *reg, unsigned int hctx_index)
+static struct blk_mq_hw_ctx *null_alloc_hctx(struct blk_mq_tag_set *set,
+		unsigned int hctx_index)
 {
-	int b_size = DIV_ROUND_UP(reg->nr_hw_queues, nr_online_nodes);
-	int tip = (reg->nr_hw_queues % nr_online_nodes);
+	int b_size = DIV_ROUND_UP(set->nr_hw_queues, nr_online_nodes);
+	int tip = (set->nr_hw_queues % nr_online_nodes);
 	int node = 0, i, n;
 
 	/*
@@ -338,7 +340,7 @@ static struct blk_mq_hw_ctx *null_alloc_hctx(struct blk_mq_reg *reg, unsigned in
 
 			tip--;
 			if (!tip)
-				b_size = reg->nr_hw_queues / nr_online_nodes;
+				b_size = set->nr_hw_queues / nr_online_nodes;
 		}
 	}
 
@@ -387,13 +389,17 @@ static struct blk_mq_ops null_mq_ops = {
 	.map_queue      = blk_mq_map_queue,
 	.init_hctx	= null_init_hctx,
 	.complete	= null_softirq_done_fn,
+	.alloc_hctx	= blk_mq_alloc_single_hw_queue,
+	.free_hctx	= blk_mq_free_single_hw_queue,
 };
 
-static struct blk_mq_reg null_mq_reg = {
-	.ops		= &null_mq_ops,
-	.queue_depth	= 64,
-	.cmd_size	= sizeof(struct nullb_cmd),
-	.flags		= BLK_MQ_F_SHOULD_MERGE,
+static struct blk_mq_ops null_mq_ops_pernode = {
+	.queue_rq       = null_queue_rq,
+	.map_queue      = blk_mq_map_queue,
+	.init_hctx	= null_init_hctx,
+	.complete	= null_softirq_done_fn,
+	.alloc_hctx	= null_alloc_hctx,
+	.free_hctx	= null_free_hctx,
 };
 
 static void null_del_dev(struct nullb *nullb)
@@ -402,6 +408,8 @@ static void null_del_dev(struct nullb *nullb)
 
 	del_gendisk(nullb->disk);
 	blk_cleanup_queue(nullb->q);
+	if (queue_mode == NULL_Q_MQ)
+		blk_mq_free_tag_set(&nullb->tag_set);
 	put_disk(nullb->disk);
 	kfree(nullb);
 }
@@ -506,7 +514,7 @@ static int null_add_dev(void)
 
 	nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, home_node);
 	if (!nullb)
-		return -ENOMEM;
+		goto out;
 
 	spin_lock_init(&nullb->lock);
 
@@ -514,49 +522,47 @@ static int null_add_dev(void)
 		submit_queues = nr_online_nodes;
 
 	if (setup_queues(nullb))
-		goto err;
+		goto out_free_nullb;
 
 	if (queue_mode == NULL_Q_MQ) {
-		null_mq_reg.numa_node = home_node;
-		null_mq_reg.queue_depth = hw_queue_depth;
-		null_mq_reg.nr_hw_queues = submit_queues;
-
-		if (use_per_node_hctx) {
-			null_mq_reg.ops->alloc_hctx = null_alloc_hctx;
-			null_mq_reg.ops->free_hctx = null_free_hctx;
-		} else {
-			null_mq_reg.ops->alloc_hctx = blk_mq_alloc_single_hw_queue;
-			null_mq_reg.ops->free_hctx = blk_mq_free_single_hw_queue;
-		}
-
-		nullb->q = blk_mq_init_queue(&null_mq_reg, nullb);
+		if (use_per_node_hctx)
+			nullb->tag_set.ops = &null_mq_ops_pernode;
+		else
+			nullb->tag_set.ops = &null_mq_ops;
+		nullb->tag_set.nr_hw_queues = submit_queues;
+		nullb->tag_set.queue_depth = hw_queue_depth;
+		nullb->tag_set.numa_node = home_node;
+		nullb->tag_set.cmd_size	= sizeof(struct nullb_cmd);
+		nullb->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+		nullb->tag_set.driver_data = nullb;
+
+		if (blk_mq_alloc_tag_set(&nullb->tag_set))
+			goto out_cleanup_queues;
+
+		nullb->q = blk_mq_init_queue(&nullb->tag_set);
+		if (!nullb->q)
+			goto out_cleanup_tags;
 	} else if (queue_mode == NULL_Q_BIO) {
 		nullb->q = blk_alloc_queue_node(GFP_KERNEL, home_node);
+		if (!nullb->q)
+			goto out_cleanup_queues;
 		blk_queue_make_request(nullb->q, null_queue_bio);
 		init_driver_queues(nullb);
 	} else {
 		nullb->q = blk_init_queue_node(null_request_fn, &nullb->lock, home_node);
+		if (!nullb->q)
+			goto out_cleanup_queues;
 		blk_queue_prep_rq(nullb->q, null_rq_prep_fn);
-		if (nullb->q)
-			blk_queue_softirq_done(nullb->q, null_softirq_done_fn);
+		blk_queue_softirq_done(nullb->q, null_softirq_done_fn);
 		init_driver_queues(nullb);
 	}
 
-	if (!nullb->q)
-		goto queue_fail;
-
 	nullb->q->queuedata = nullb;
 	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, nullb->q);
 
 	disk = nullb->disk = alloc_disk_node(1, home_node);
-	if (!disk) {
-queue_fail:
-		blk_cleanup_queue(nullb->q);
-		cleanup_queues(nullb);
-err:
-		kfree(nullb);
-		return -ENOMEM;
-	}
+	if (!disk)
+		goto out_cleanup_blk_queue;
 
 	mutex_lock(&lock);
 	list_add_tail(&nullb->list, &nullb_list);
@@ -579,6 +585,18 @@ err:
 	sprintf(disk->disk_name, "nullb%d", nullb->index);
 	add_disk(disk);
 	return 0;
+
+out_cleanup_blk_queue:
+	blk_cleanup_queue(nullb->q);
+out_cleanup_tags:
+	if (queue_mode == NULL_Q_MQ)
+		blk_mq_free_tag_set(&nullb->tag_set);
+out_cleanup_queues:
+	cleanup_queues(nullb);
+out_free_nullb:
+	kfree(nullb);
+out:
+	return -ENOMEM;
 }
 
 static int __init null_init(void)
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 87447c1f..0ee66be 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -30,6 +30,9 @@ struct virtio_blk
 	/* The disk structure for the kernel. */
 	struct gendisk *disk;
 
+	/* Block layer tags. */
+	struct blk_mq_tag_set tag_set;
+
 	/* Process context for config space updates */
 	struct work_struct config_work;
 
@@ -474,8 +477,9 @@ static const struct device_attribute dev_attr_cache_type_rw =
 	__ATTR(cache_type, S_IRUGO|S_IWUSR,
 	       virtblk_cache_type_show, virtblk_cache_type_store);
 
-static int virtblk_init_request(void *data, struct blk_mq_hw_ctx *hctx,
-		struct request *rq, unsigned int nr)
+static int virtblk_init_request(void *data, struct request *rq,
+		unsigned int hctx_idx, unsigned int request_idx,
+		unsigned int numa_node)
 {
 	struct virtio_blk *vblk = data;
 	struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq);
@@ -489,16 +493,8 @@ static struct blk_mq_ops virtio_mq_ops = {
 	.map_queue	= blk_mq_map_queue,
 	.alloc_hctx	= blk_mq_alloc_single_hw_queue,
 	.free_hctx	= blk_mq_free_single_hw_queue,
-	.init_request	= virtblk_init_request,
 	.complete	= virtblk_request_done,
-};
-
-static struct blk_mq_reg virtio_mq_reg = {
-	.ops		= &virtio_mq_ops,
-	.nr_hw_queues	= 1,
-	.queue_depth	= 64,
-	.numa_node	= NUMA_NO_NODE,
-	.flags		= BLK_MQ_F_SHOULD_MERGE,
+	.init_request	= virtblk_init_request,
 };
 
 static int virtblk_probe(struct virtio_device *vdev)
@@ -554,14 +550,25 @@ static int virtblk_probe(struct virtio_device *vdev)
 		goto out_free_vq;
 	}
 
-	virtio_mq_reg.cmd_size =
+	memset(&vblk->tag_set, 0, sizeof(vblk->tag_set));
+	vblk->tag_set.ops = &virtio_mq_ops;
+	vblk->tag_set.nr_hw_queues = 1;
+	vblk->tag_set.queue_depth = 64;
+	vblk->tag_set.numa_node = NUMA_NO_NODE;
+	vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+	vblk->tag_set.cmd_size =
 		sizeof(struct virtblk_req) +
 		sizeof(struct scatterlist) * sg_elems;
+	vblk->tag_set.driver_data = vblk;
 
-	q = vblk->disk->queue = blk_mq_init_queue(&virtio_mq_reg, vblk);
+	err = blk_mq_alloc_tag_set(&vblk->tag_set);
+	if (err)
+		goto out_put_disk;
+
+	q = vblk->disk->queue = blk_mq_init_queue(&vblk->tag_set);
 	if (!q) {
 		err = -ENOMEM;
-		goto out_put_disk;
+		goto out_free_tags;
 	}
 
 	q->queuedata = vblk;
@@ -664,6 +671,8 @@ static int virtblk_probe(struct virtio_device *vdev)
 out_del_disk:
 	del_gendisk(vblk->disk);
 	blk_cleanup_queue(vblk->disk->queue);
+out_free_tags:
+	blk_mq_free_tag_set(&vblk->tag_set);
 out_put_disk:
 	put_disk(vblk->disk);
 out_free_vq:
@@ -690,6 +699,8 @@ static void virtblk_remove(struct virtio_device *vdev)
 	del_gendisk(vblk->disk);
 	blk_cleanup_queue(vblk->disk->queue);
 
+	blk_mq_free_tag_set(&vblk->tag_set);
+
 	/* Stop all the virtqueues. */
 	vdev->config->reset(vdev);
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 897ca1a..e3e1f41 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -32,8 +32,6 @@ struct blk_mq_hw_ctx {
 	unsigned int 		nr_ctx_map;
 	unsigned long		*ctx_map;
 
-	struct request		**rqs;
-	struct list_head	page_list;
 	struct blk_mq_tags	*tags;
 
 	unsigned long		queued;
@@ -41,7 +39,6 @@ struct blk_mq_hw_ctx {
 #define BLK_MQ_MAX_DISPATCH_ORDER	10
 	unsigned long		dispatched[BLK_MQ_MAX_DISPATCH_ORDER];
 
-	unsigned int		queue_depth;
 	unsigned int		numa_node;
 	unsigned int		cmd_size;	/* per-request extra data */
 
@@ -49,7 +46,7 @@ struct blk_mq_hw_ctx {
 	struct kobject		kobj;
 };
 
-struct blk_mq_reg {
+struct blk_mq_tag_set {
 	struct blk_mq_ops	*ops;
 	unsigned int		nr_hw_queues;
 	unsigned int		queue_depth;
@@ -58,18 +55,22 @@ struct blk_mq_reg {
 	int			numa_node;
 	unsigned int		timeout;
 	unsigned int		flags;		/* BLK_MQ_F_* */
+	void			*driver_data;
+
+	struct blk_mq_tags	**tags;
 };
 
 typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *);
 typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int);
-typedef struct blk_mq_hw_ctx *(alloc_hctx_fn)(struct blk_mq_reg *,unsigned int);
+typedef struct blk_mq_hw_ctx *(alloc_hctx_fn)(struct blk_mq_tag_set *,
+		unsigned int);
 typedef void (free_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
 typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int);
 typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
-typedef int (init_request_fn)(void *, struct blk_mq_hw_ctx *,
-		struct request *, unsigned int);
-typedef void (exit_request_fn)(void *, struct blk_mq_hw_ctx *,
-		struct request *, unsigned int);
+typedef int (init_request_fn)(void *, struct request *, unsigned int,
+		unsigned int, unsigned int);
+typedef void (exit_request_fn)(void *, struct request *, unsigned int,
+		unsigned int);
 
 struct blk_mq_ops {
 	/*
@@ -126,10 +127,13 @@ enum {
 	BLK_MQ_MAX_DEPTH	= 2048,
 };
 
-struct request_queue *blk_mq_init_queue(struct blk_mq_reg *, void *);
+struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
 int blk_mq_register_disk(struct gendisk *);
 void blk_mq_unregister_disk(struct gendisk *);
 
+int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set);
+void blk_mq_free_tag_set(struct blk_mq_tag_set *set);
+
 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
 
 void blk_mq_insert_request(struct request *, bool, bool, bool);
@@ -138,10 +142,10 @@ void blk_mq_free_request(struct request *rq);
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
 struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp);
 struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw, gfp_t gfp);
-struct request *blk_mq_rq_from_tag(struct request_queue *q, unsigned int tag);
+struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag);
 
 struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index);
-struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *, unsigned int);
+struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int);
 void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *, unsigned int);
 
 bool blk_mq_end_io_partial(struct request *rq, int error,
@@ -172,12 +176,6 @@ static inline void *blk_mq_rq_to_pdu(struct request *rq)
 	return (void *) rq + sizeof(*rq);
 }
 
-static inline struct request *blk_mq_tag_to_rq(struct blk_mq_hw_ctx *hctx,
-					       unsigned int tag)
-{
-	return hctx->rqs[tag];
-}
-
 #define queue_for_each_hw_ctx(q, hctx, i)				\
 	for ((i) = 0; (i) < (q)->nr_hw_queues &&			\
 	     ({ hctx = (q)->queue_hw_ctx[i]; 1; }); (i)++)
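
For reviewers, the driver-side conversion pattern the hunks above imply
boils down to the following sketch (the my_* names are hypothetical
stand-ins, not part of the patch; error handling trimmed):

	struct my_dev {
		struct request_queue	*q;
		struct blk_mq_tag_set	tag_set;
	};

	static int my_probe(struct my_dev *dev)
	{
		memset(&dev->tag_set, 0, sizeof(dev->tag_set));
		dev->tag_set.ops = &my_mq_ops;	/* the driver's blk_mq_ops */
		dev->tag_set.nr_hw_queues = 1;
		dev->tag_set.queue_depth = 64;
		dev->tag_set.numa_node = NUMA_NO_NODE;
		dev->tag_set.cmd_size = sizeof(struct my_cmd);
		dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
		dev->tag_set.driver_data = dev;

		if (blk_mq_alloc_tag_set(&dev->tag_set))
			return -ENOMEM;

		/*
		 * Per the cover letter, the tag_set is what gets shared:
		 * building more than one request_queue from the same set
		 * lets those queues share one set of tags and requests.
		 */
		dev->q = blk_mq_init_queue(&dev->tag_set);
		if (!dev->q) {
			blk_mq_free_tag_set(&dev->tag_set);
			return -ENOMEM;
		}
		return 0;
	}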
-- 
1.7.10.4


^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH 3/4] blk-mq: move request structures into struct blk_mq_tags
  2014-03-31 14:46 ` [PATCH 3/4] blk-mq: move request structures into struct blk_mq_tags Christoph Hellwig
@ 2014-04-09 16:23     ` Jens Axboe
  0 siblings, 0 replies; 20+ messages in thread
From: Jens Axboe @ 2014-04-09 16:23 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Matias Bjorling, linux-kernel, linux-scsi

On 03/31/2014 08:46 AM, Christoph Hellwig wrote:
> diff --git a/block/blk-mq.h b/block/blk-mq.h
> index 238379a..eca3a47 100644
> --- a/block/blk-mq.h
> +++ b/block/blk-mq.h
> @@ -1,6 +1,24 @@
>   #ifndef INT_BLK_MQ_H
>   #define INT_BLK_MQ_H
>
> +#include <linux/percpu_ida.h>
> +
> +/*
> + * Tag address space map.
> + */
> +struct blk_mq_tags {
> +	unsigned int nr_tags;
> +	unsigned int nr_reserved_tags;
> +	unsigned int nr_batch_move;
> +	unsigned int nr_max_cache;
> +
> +	struct percpu_ida free_tags;
> +	struct percpu_ida reserved_tags;
> +
> +	struct request **rqs;
> +	struct list_head page_list;
> +};
> +

This should go into block/blk-mq-tag.h.
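
Roughly like this, I'd think (just a sketch -- the guard name is a
placeholder, the struct body is as quoted above):

#ifndef INT_BLK_MQ_TAG_H
#define INT_BLK_MQ_TAG_H

#include <linux/percpu_ida.h>

/*
 * Tag address space map.
 */
struct blk_mq_tags {
	unsigned int nr_tags;
	unsigned int nr_reserved_tags;
	unsigned int nr_batch_move;
	unsigned int nr_max_cache;

	struct percpu_ida free_tags;
	struct percpu_ida reserved_tags;

	struct request **rqs;
	struct list_head page_list;
};

#endif /* INT_BLK_MQ_TAG_H */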


> -static inline struct request *blk_mq_tag_to_rq(struct blk_mq_hw_ctx *hctx,
> -					       unsigned int tag)
> -{
> -	return hctx->rqs[tag];
> -}

We might as well leave this; the mtip32xx conversion ends up using it.
So if we pull it now, it'll just be reintroduced shortly.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 3/4] blk-mq: move request structures into struct blk_mq_tags
  2014-04-09 16:23     ` Jens Axboe
@ 2014-04-10 10:01     ` Christoph Hellwig
  2014-04-11  2:42         ` Jens Axboe
  0 siblings, 1 reply; 20+ messages in thread
From: Christoph Hellwig @ 2014-04-10 10:01 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Matias Bjorling, linux-kernel, linux-scsi

On Wed, Apr 09, 2014 at 10:23:32AM -0600, Jens Axboe wrote:
> This should go into block/blk-mq-tag.h.

Ok.

> We might as well leave this; the mtip32xx conversion ends up using it. So
> if we pull it now, it'll just be reintroduced shortly.

It's back in the latest revision of the patch, just taking a
struct blk_mq_tags pointer now so that it can be used by SCSI as well.

I've also changed an open-coded variant of it to use the helper.
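
For reference, it now reads roughly like this (a sketch; the final
version is in the commit below):

static inline struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags,
					       unsigned int tag)
{
	return tags->rqs[tag];
}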

Pointer: http://git.infradead.org/users/hch/scsi.git/commitdiff/b0f1ed35bbeb6d0177fc0cc0bf5c880c3c5d1817


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 3/4] blk-mq: move request structures into struct blk_mq_tags
  2014-04-10 10:01     ` Christoph Hellwig
@ 2014-04-11  2:42         ` Jens Axboe
  0 siblings, 0 replies; 20+ messages in thread
From: Jens Axboe @ 2014-04-11  2:42 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Matias Bjorling, linux-kernel, linux-scsi

On 2014-04-10 04:01, Christoph Hellwig wrote:
> On Wed, Apr 09, 2014 at 10:23:32AM -0600, Jens Axboe wrote:
>> This should go into block/blk-mq-tag.h.
>
> Ok.
>
>> We might as well leave this; the mtip32xx conversion ends up using it. So
>> if we pull it now, it'll just be reintroduced shortly.
>
> It's back in the latest revision of the patch, just taking a
> struct blk_mq_tags pointer now so that it can be used by SCSI as well.
>
> I've also changed an open-coded variant of it to use the helper.

Great. Will you send out an updated patchset?

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 3/4] blk-mq: move request structures into struct blk_mq_tags
  2014-04-11  2:42         ` Jens Axboe
@ 2014-04-11  7:33         ` Christoph Hellwig
  2014-04-11 13:56           ` Jens Axboe
  0 siblings, 1 reply; 20+ messages in thread
From: Christoph Hellwig @ 2014-04-11  7:33 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Christoph Hellwig, Matias Bjorling, linux-kernel, linux-scsi

On Thu, Apr 10, 2014 at 08:42:27PM -0600, Jens Axboe wrote:
> Great. Will you send out an updated patchset?

I was waiting for you to apply the previous series, but I'll just resend my
whole set of outstanding patches; it'll need a bit of rebasing against
your updated tree anyway.


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 3/4] blk-mq: move request structures into struct blk_mq_tags
  2014-04-11  7:33         ` Christoph Hellwig
@ 2014-04-11 13:56           ` Jens Axboe
  0 siblings, 0 replies; 20+ messages in thread
From: Jens Axboe @ 2014-04-11 13:56 UTC (permalink / raw)
  To: Christoph Hellwig, Jens Axboe; +Cc: Matias Bjorling, linux-kernel, linux-scsi

On 04/11/2014 01:33 AM, Christoph Hellwig wrote:
> On Thu, Apr 10, 2014 at 08:42:27PM -0600, Jens Axboe wrote:
>> Great. Will you send out an updated patchset?
>
> I was waiting for you to apply the previous series, but I'll just resend my
> whole set of outstanding patches; it'll need a bit of rebasing against
> your updated tree anyway.

I'll take a look at the previous series and apply/comment.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2014-04-11 13:56 UTC | newest]

Thread overview: 20+ messages
-- links below jump to the message on this page --
2014-03-31 14:46 [RFC] blk-mq: support for shared tags Christoph Hellwig
2014-03-31 14:46 ` [PATCH 1/4] blk->mq: stop pre-initializing req->special Christoph Hellwig
2014-03-31 14:46 ` [PATCH 2/4] blk-mq: initialize request on allocation Christoph Hellwig
2014-03-31 14:46 ` [PATCH 3/4] blk-mq: move request structures into struct blk_mq_tags Christoph Hellwig
2014-04-09 16:23   ` Jens Axboe
2014-04-10 10:01     ` Christoph Hellwig
2014-04-11  2:42       ` Jens Axboe
2014-04-11  7:33         ` Christoph Hellwig
2014-04-11 13:56           ` Jens Axboe
2014-03-31 14:46 ` [PATCH 4/4] blk-mq: support shared tag maps Christoph Hellwig
2014-04-02  0:16 ` [RFC] blk-mq: support for shared tags Matias Bjorling
2014-04-02  7:46   ` Christoph Hellwig
2014-04-03  4:10     ` Matias Bjorling
2014-04-03  7:36       ` Christoph Hellwig
2014-04-03 16:45         ` Matias Bjorling
2014-04-03 18:01           ` Christoph Hellwig
2014-04-03 21:47             ` Matias Bjorling
2014-04-04 15:19               ` Christoph Hellwig
