All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCHv2 1/2] blk-mq: Shared tag enhancements
@ 2015-06-01 15:29 Keith Busch
  2015-06-01 15:29 ` [PATCHv2 2/2] NVMe: Remove hctx reliance for multi-namespace Keith Busch
                   ` (2 more replies)
  0 siblings, 3 replies; 5+ messages in thread
From: Keith Busch @ 2015-06-01 15:29 UTC (permalink / raw)


Storage controllers may expose multiple block devices that share hardware
resources managed by blk-mq. This patch enhances the shared tags so a
low-level driver can access the shared resources not tied to the unshared
h/w contexts. This way the LLD can dynamically add and delete disks and
request queues without having to track all the request_queue hctx's to
iterate outstanding tags.

Signed-off-by: Keith Busch <keith.busch@intel.com>
---
v1 -> v2:

  Removed additional blk-mq ops for tag init/exit. The driver can obtain
  references to these through its tagset instead.

 block/blk-mq-tag.c     |   38 ++++++++++++++++++++++++++++++++++++++
 block/blk-mq-tag.h     |    1 +
 block/blk-mq.c         |   12 ++++++++++--
 include/linux/blk-mq.h |    4 ++++
 4 files changed, 53 insertions(+), 2 deletions(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index be3290c..9b6e288 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -438,6 +438,39 @@ static void bt_for_each(struct blk_mq_hw_ctx *hctx,
 	}
 }
 
+static void bt_tags_for_each(struct blk_mq_tags *tags,
+		struct blk_mq_bitmap_tags *bt, unsigned int off,
+		busy_tag_iter_fn *fn, void *data, bool reserved)
+{
+	struct request *rq;
+	int bit, i;
+
+	if (!tags->rqs)
+		return;
+	for (i = 0; i < bt->map_nr; i++) {
+		struct blk_align_bitmap *bm = &bt->map[i];
+
+		for (bit = find_first_bit(&bm->word, bm->depth);
+		     bit < bm->depth;
+		     bit = find_next_bit(&bm->word, bm->depth, bit + 1)) {
+			rq = blk_mq_tag_to_rq(tags, off + bit);
+			fn(rq, data, reserved);
+		}
+
+		off += (1 << bt->bits_per_word);
+	}
+}
+
+void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
+		void *priv)
+{
+	if (tags->nr_reserved_tags)
+		bt_tags_for_each(tags, &tags->breserved_tags, 0, fn, priv, true);
+	bt_tags_for_each(tags, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv,
+			false);
+}
+EXPORT_SYMBOL(blk_mq_all_tag_busy_iter);
+
 void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn,
 		void *priv)
 {
@@ -580,6 +613,11 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
 	if (!tags)
 		return NULL;
 
+	if (!zalloc_cpumask_var(&tags->cpumask, GFP_KERNEL)) {
+		kfree(tags);
+		return NULL;
+	}
+
 	tags->nr_tags = total_tags;
 	tags->nr_reserved_tags = reserved_tags;
 
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 90767b3..75893a3 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -44,6 +44,7 @@ struct blk_mq_tags {
 	struct list_head page_list;
 
 	int alloc_policy;
+	cpumask_var_t cpumask;
 };
 
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index c382a34..ef100fd 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1525,7 +1525,6 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
 			i++;
 		}
 	}
-
 	return tags;
 
 fail:
@@ -1821,6 +1820,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 
 		hctx = q->mq_ops->map_queue(q, i);
 		cpumask_set_cpu(i, hctx->cpumask);
+		cpumask_set_cpu(i, hctx->tags->cpumask);
 		ctx->index_hw = hctx->nr_ctx;
 		hctx->ctxs[hctx->nr_ctx++] = ctx;
 	}
@@ -2187,6 +2187,12 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
 	return 0;
 }
 
+struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags)
+{
+	return tags->cpumask;
+}
+EXPORT_SYMBOL_GPL(blk_mq_tags_cpumask);
+
 /*
  * Alloc a tag set to be associated with one or more request queues.
  * May fail with EINVAL for various error conditions. May adjust the
@@ -2248,8 +2254,10 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
 	int i;
 
 	for (i = 0; i < set->nr_hw_queues; i++) {
-		if (set->tags[i])
+		if (set->tags[i]) {
 			blk_mq_free_rq_map(set, set->tags[i], i);
+			free_cpumask_var(set->tags[i]->cpumask);
+		}
 	}
 
 	kfree(set->tags);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 2056a99..37d1602 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -96,6 +96,7 @@ typedef void (exit_request_fn)(void *, struct request *, unsigned int,
 
 typedef void (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *,
 		bool);
+typedef void (busy_tag_iter_fn)(struct request *, void *, bool);
 
 struct blk_mq_ops {
 	/*
@@ -182,6 +183,7 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
 struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
 		gfp_t gfp, bool reserved);
 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag);
+struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags);
 
 enum {
 	BLK_MQ_UNIQUE_TAG_BITS = 16,
@@ -224,6 +226,8 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async);
 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
 void blk_mq_tag_busy_iter(struct blk_mq_hw_ctx *hctx, busy_iter_fn *fn,
 		void *priv);
+void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
+		void *priv);
 void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_unfreeze_queue(struct request_queue *q);
 void blk_mq_freeze_queue_start(struct request_queue *q);
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCHv2 2/2] NVMe: Remove hctx reliance for multi-namespace
  2015-06-01 15:29 [PATCHv2 1/2] blk-mq: Shared tag enhancements Keith Busch
@ 2015-06-01 15:29 ` Keith Busch
  2015-06-01 20:35 ` [PATCHv2 1/2] blk-mq: Shared tag enhancements Jens Axboe
  2015-06-03  7:00 ` Christoph Hellwig
  2 siblings, 0 replies; 5+ messages in thread
From: Keith Busch @ 2015-06-01 15:29 UTC (permalink / raw)


The driver needs to track shared tags to support multiple namespaces
that may be dynamically allocated or deleted. Relying on the first
request_queue's hctx's is not appropriate as we cannot clear outstanding
tags for all namespaces using this handle, nor can the driver easily track
all request_queue's hctx as namespaces are attached/detached. Instead,
this patch uses the nvme_dev's tagset to get the shared tag resources
instead of through a request_queue hctx.

Signed-off-by: Keith Busch <keith.busch@intel.com>
---
v1 -> v2:

  Obtain reference to blk_mq_tags through the device's tagset instead
  of requiring additional blk-mq ops.

 drivers/block/nvme-core.c |   52 +++++++++++++++++----------------------------
 1 file changed, 19 insertions(+), 33 deletions(-)

diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index c42bc53..4899c45 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -102,6 +102,7 @@ struct nvme_queue {
 	spinlock_t q_lock;
 	struct nvme_command *sq_cmds;
 	volatile struct nvme_completion *cqes;
+	struct blk_mq_tags **tags;
 	dma_addr_t sq_dma_addr;
 	dma_addr_t cq_dma_addr;
 	u32 __iomem *q_db;
@@ -114,7 +115,6 @@ struct nvme_queue {
 	u8 cq_phase;
 	u8 cqe_seen;
 	struct async_cmd_info cmdinfo;
-	struct blk_mq_hw_ctx *hctx;
 };
 
 /*
@@ -182,9 +182,12 @@ static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
 	struct nvme_dev *dev = data;
 	struct nvme_queue *nvmeq = dev->queues[0];
 
-	WARN_ON(nvmeq->hctx);
-	nvmeq->hctx = hctx;
+	WARN_ON(hctx_idx != 0);
+	WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
+	WARN_ON(nvmeq->tags);
+
 	hctx->driver_data = nvmeq;
+	nvmeq->tags = &dev->admin_tagset.tags[0];
 	return 0;
 }
 
@@ -201,27 +204,16 @@ static int nvme_admin_init_request(void *data, struct request *req,
 	return 0;
 }
 
-static void nvme_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
-{
-	struct nvme_queue *nvmeq = hctx->driver_data;
-
-	nvmeq->hctx = NULL;
-}
-
 static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
 			  unsigned int hctx_idx)
 {
 	struct nvme_dev *dev = data;
-	struct nvme_queue *nvmeq = dev->queues[
-					(hctx_idx % dev->queue_count) + 1];
-
-	if (!nvmeq->hctx)
-		nvmeq->hctx = hctx;
+	struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];
 
-	/* nvmeq queues are shared between namespaces. We assume here that
-	 * blk-mq map the tags so they match up with the nvme queue tags. */
-	WARN_ON(nvmeq->hctx->tags != hctx->tags);
+	if (!nvmeq->tags)
+		nvmeq->tags = &dev->tagset.tags[hctx_idx];
 
+	WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
 	hctx->driver_data = nvmeq;
 	return 0;
 }
@@ -320,7 +312,7 @@ static void abort_completion(struct nvme_queue *nvmeq, void *ctx,
 	u16 status = le16_to_cpup(&cqe->status) >> 1;
 	u32 result = le32_to_cpup(&cqe->result);
 
-	blk_mq_free_hctx_request(nvmeq->hctx, req);
+	blk_mq_free_request(req);
 
 	dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result);
 	++nvmeq->dev->abort_limit;
@@ -333,14 +325,13 @@ static void async_completion(struct nvme_queue *nvmeq, void *ctx,
 	cmdinfo->result = le32_to_cpup(&cqe->result);
 	cmdinfo->status = le16_to_cpup(&cqe->status) >> 1;
 	queue_kthread_work(cmdinfo->worker, &cmdinfo->work);
-	blk_mq_free_hctx_request(nvmeq->hctx, cmdinfo->req);
+	blk_mq_free_request(cmdinfo->req);
 }
 
 static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq,
 				  unsigned int tag)
 {
-	struct blk_mq_hw_ctx *hctx = nvmeq->hctx;
-	struct request *req = blk_mq_tag_to_rq(hctx->tags, tag);
+	struct request *req = blk_mq_tag_to_rq(*nvmeq->tags, tag);
 
 	return blk_mq_rq_to_pdu(req);
 }
@@ -1067,7 +1058,7 @@ static int nvme_submit_async_admin_req(struct nvme_dev *dev)
 	c.common.opcode = nvme_admin_async_event;
 	c.common.command_id = req->tag;
 
-	blk_mq_free_hctx_request(nvmeq->hctx, req);
+	blk_mq_free_request(req);
 	return __nvme_submit_cmd(nvmeq, &c);
 }
 
@@ -1309,8 +1300,7 @@ static void nvme_abort_req(struct request *req)
 	}
 }
 
-static void nvme_cancel_queue_ios(struct blk_mq_hw_ctx *hctx,
-				struct request *req, void *data, bool reserved)
+static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved)
 {
 	struct nvme_queue *nvmeq = data;
 	void *ctx;
@@ -1407,11 +1397,9 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq)
 
 static void nvme_clear_queue(struct nvme_queue *nvmeq)
 {
-	struct blk_mq_hw_ctx *hctx = nvmeq->hctx;
-
 	spin_lock_irq(&nvmeq->q_lock);
-	if (hctx && hctx->tags)
-		blk_mq_tag_busy_iter(hctx, nvme_cancel_queue_ios, nvmeq);
+	if (nvmeq->tags && *nvmeq->tags)
+		blk_mq_all_tag_busy_iter(*nvmeq->tags, nvme_cancel_queue_ios, nvmeq);
 	spin_unlock_irq(&nvmeq->q_lock);
 }
 
@@ -1604,7 +1592,6 @@ static struct blk_mq_ops nvme_mq_admin_ops = {
 	.queue_rq	= nvme_queue_rq,
 	.map_queue	= blk_mq_map_queue,
 	.init_hctx	= nvme_admin_init_hctx,
-	.exit_hctx	= nvme_exit_hctx,
 	.init_request	= nvme_admin_init_request,
 	.timeout	= nvme_timeout,
 };
@@ -1613,7 +1600,6 @@ static struct blk_mq_ops nvme_mq_ops = {
 	.queue_rq	= nvme_queue_rq,
 	.map_queue	= blk_mq_map_queue,
 	.init_hctx	= nvme_init_hctx,
-	.exit_hctx	= nvme_exit_hctx,
 	.init_request	= nvme_init_request,
 	.timeout	= nvme_timeout,
 };
@@ -2723,11 +2709,11 @@ static void nvme_set_irq_hints(struct nvme_dev *dev)
 	for (i = 0; i < dev->online_queues; i++) {
 		nvmeq = dev->queues[i];
 
-		if (!nvmeq->hctx)
+		if (!nvmeq->tags || !(*nvmeq->tags))
 			continue;
 
 		irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector,
-							nvmeq->hctx->cpumask);
+					blk_mq_tags_cpumask(*nvmeq->tags));
 	}
 }
 
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCHv2 1/2] blk-mq: Shared tag enhancements
  2015-06-01 15:29 [PATCHv2 1/2] blk-mq: Shared tag enhancements Keith Busch
  2015-06-01 15:29 ` [PATCHv2 2/2] NVMe: Remove hctx reliance for multi-namespace Keith Busch
@ 2015-06-01 20:35 ` Jens Axboe
  2015-06-03  7:00 ` Christoph Hellwig
  2 siblings, 0 replies; 5+ messages in thread
From: Jens Axboe @ 2015-06-01 20:35 UTC (permalink / raw)


On 06/01/2015 09:29 AM, Keith Busch wrote:
> Storage controllers may expose multiple block devices that share hardware
> resources managed by blk-mq. This patch enhances the shared tags so a
> low-level driver can access the shared resources not tied to the unshared
> h/w contexts. This way the LLD can dynamically add and delete disks and
> request queues without having to track all the request_queue hctx's to
> iterate outstanding tags.

v2 looks good to me. I'll apply 1/2 to core, and 2/2 to drivers for 4.2.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCHv2 1/2] blk-mq: Shared tag enhancements
  2015-06-01 15:29 [PATCHv2 1/2] blk-mq: Shared tag enhancements Keith Busch
  2015-06-01 15:29 ` [PATCHv2 2/2] NVMe: Remove hctx reliance for multi-namespace Keith Busch
  2015-06-01 20:35 ` [PATCHv2 1/2] blk-mq: Shared tag enhancements Jens Axboe
@ 2015-06-03  7:00 ` Christoph Hellwig
  2015-06-03 14:14   ` Keith Busch
  2 siblings, 1 reply; 5+ messages in thread
From: Christoph Hellwig @ 2015-06-03  7:00 UTC (permalink / raw)


On Mon, Jun 01, 2015 at 09:29:53AM -0600, Keith Busch wrote:
> Storage controllers may expose multiple block devices that share hardware
> resources managed by blk-mq. This patch enhances the shared tags so a
> low-level driver can access the shared resources not tied to the unshared
> h/w contexts. This way the LLD can dynamically add and delete disks and
> request queues without having to track all the request_queue hctx's to
> iterate outstanding tags.

Why do you add a new function instead of fully replacing the old
blk_mq_tag_busy_iter?  The only other user is the timeout handler,
and I think it would be fine with your version as well.  And
blk_mq_tag_busy_iter is a much better name for your new function anyway :)

Also the patch is missing a description for the cpumask changes, which
look like they should be a separate patch, or probably just moved into
the driver as the blk core doesn't make use of it.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCHv2 1/2] blk-mq: Shared tag enhancements
  2015-06-03  7:00 ` Christoph Hellwig
@ 2015-06-03 14:14   ` Keith Busch
  0 siblings, 0 replies; 5+ messages in thread
From: Keith Busch @ 2015-06-03 14:14 UTC (permalink / raw)


On Wed, 3 Jun 2015, Christoph Hellwig wrote:
> On Mon, Jun 01, 2015 at 09:29:53AM -0600, Keith Busch wrote:
>> Storage controllers may expose multiple block devices that share hardware
>> resources managed by blk-mq. This patch enhances the shared tags so a
>> low-level driver can access the shared resources not tied to the unshared
>> h/w contexts. This way the LLD can dynamically add and delete disks and
>> request queues without having to track all the request_queue hctx's to
>> iterate outstanding tags.
>
> Why do you add a new function instead of fully replacing the old
> blk_mq_tag_busy_iter?  The only other user is the timeout handler,
> and I think it would be fine with your version as well.  And
> blk_mq_tag_busy_iter is a much better name for your new function anyway :)

I thought about that, but looks like trouble. Each namespace has its
own request_queue and each request_queue its own timer to iterate their
busy tags. Letting any timer iterate another's tags could expire the
same command multiple times.

Maybe we could move the timer from the request_queue to the tagset
instead? There seems to be some duplicate information propagating in
the request_queue anyway that might be better suited to the common tagset.

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2015-06-03 14:14 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-06-01 15:29 [PATCHv2 1/2] blk-mq: Shared tag enhancements Keith Busch
2015-06-01 15:29 ` [PATCHv2 2/2] NVMe: Remove hctx reliance for multi-namespace Keith Busch
2015-06-01 20:35 ` [PATCHv2 1/2] blk-mq: Shared tag enhancements Jens Axboe
2015-06-03  7:00 ` Christoph Hellwig
2015-06-03 14:14   ` Keith Busch

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.