* [PATCH 0/4] blk-mq: support to use hw tag for scheduling
@ 2017-04-28 15:15 Ming Lei
  2017-04-28 15:15 ` [PATCH 1/4] blk-mq: introduce BLK_MQ_F_SCHED_USE_HW_TAG Ming Lei
                   ` (4 more replies)
  0 siblings, 5 replies; 57+ messages in thread
From: Ming Lei @ 2017-04-28 15:15 UTC (permalink / raw)
  To: Jens Axboe, linux-block; +Cc: Christoph Hellwig, Omar Sandoval, Ming Lei

Hi,

This patchset introduces the flag BLK_MQ_F_SCHED_USE_HW_TAG and
allows the hardware tag to be used directly for I/O scheduling if the
queue's depth is big enough. In this way we avoid allocating extra
tags and a request pool for the I/O scheduler, and the scheduler tag
allocation/release is removed from the I/O submit path.
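
For a rough picture of what is saved, the submit path today allocates
two tags per request, while with the new flag it allocates only one.
A simplified sketch (get_sched_tag()/get_driver_tag() are just
stand-ins for the real allocation helpers, not actual functions):

	/* with an I/O scheduler attached today */
	rq->internal_tag = get_sched_tag();	/* at submission */
	rq->tag = get_driver_tag();		/* later, at dispatch */

	/* with BLK_MQ_F_SCHED_USE_HW_TAG */
	rq->internal_tag = get_driver_tag();	/* one hw tag, taken at submission */
	rq->tag = rq->internal_tag;		/* reused at dispatch */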

Thanks,
Ming

Ming Lei (4):
  blk-mq: introduce BLK_MQ_F_SCHED_USE_HW_TAG
  blk-mq: introduce blk_mq_get_queue_depth()
  blk-mq: use hw tag for scheduling if hw tag space is big enough
  blk-mq: dump new introduced flag of BLK_MQ_F_SCHED_USE_HW_TAG

 block/blk-mq-debugfs.c |  1 +
 block/blk-mq-sched.c   | 18 +++++++++++++-
 block/blk-mq-sched.h   | 15 ++++++++++++
 block/blk-mq.c         | 65 ++++++++++++++++++++++++++++++++++++++++++++------
 block/blk-mq.h         |  1 +
 include/linux/blk-mq.h |  1 +
 6 files changed, 93 insertions(+), 8 deletions(-)

-- 
2.9.3


* [PATCH 1/4] blk-mq: introduce BLK_MQ_F_SCHED_USE_HW_TAG
  2017-04-28 15:15 [PATCH 0/4] blk-mq: support to use hw tag for scheduling Ming Lei
@ 2017-04-28 15:15 ` Ming Lei
  2017-05-03 16:21   ` Omar Sandoval
  2017-05-03 16:46   ` Omar Sandoval
  2017-04-28 15:15 ` [PATCH 2/4] blk-mq: introduce blk_mq_get_queue_depth() Ming Lei
                   ` (3 subsequent siblings)
  4 siblings, 2 replies; 57+ messages in thread
From: Ming Lei @ 2017-04-28 15:15 UTC (permalink / raw)
  To: Jens Axboe, linux-block; +Cc: Christoph Hellwig, Omar Sandoval, Ming Lei

When a blk-mq I/O scheduler is used, we need two tags to submit
one request: a scheduler tag, used to allocate the request and
schedule the I/O, and a driver tag, used to dispatch the I/O to
the hardware/driver. This introduces one extra per-queue
allocation for both the tags and the request pool, and may not be
as efficient as the none scheduler.

Also, we currently put a default per-hctx limit on schedulable
requests, and this limit may become a bottleneck for some devices,
especially those with a quite big tag space.

This patch introduces BLK_MQ_F_SCHED_USE_HW_TAG so that the
hardware/driver tags can be used directly for I/O scheduling when
the device's hardware tag space is big enough. Then we can avoid
the extra resource allocation and make I/O submission more
efficient.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-mq-sched.c   | 10 +++++++++-
 block/blk-mq.c         | 35 +++++++++++++++++++++++++++++------
 include/linux/blk-mq.h |  1 +
 3 files changed, 39 insertions(+), 7 deletions(-)

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 27c67465f856..45a675f07b8b 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -83,7 +83,12 @@ struct request *blk_mq_sched_get_request(struct request_queue *q,
 		data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
 
 	if (e) {
-		data->flags |= BLK_MQ_REQ_INTERNAL;
+		/*
+		 * If BLK_MQ_F_SCHED_USE_HW_TAG is set, we use hardware
+		 * tag for IO scheduler directly.
+		 */
+		if (!(data->hctx->flags & BLK_MQ_F_SCHED_USE_HW_TAG))
+			data->flags |= BLK_MQ_REQ_INTERNAL;
 
 		/*
 		 * Flush requests are special and go directly to the
@@ -431,6 +436,9 @@ static int blk_mq_sched_alloc_tags(struct request_queue *q,
 	struct blk_mq_tag_set *set = q->tag_set;
 	int ret;
 
+	if (hctx->flags & BLK_MQ_F_SCHED_USE_HW_TAG)
+		return 0;
+
 	hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
 					       set->reserved_tags);
 	if (!hctx->sched_tags)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 0168b27469cb..e530bc54f0d9 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -263,9 +263,19 @@ struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data,
 				rq->rq_flags = RQF_MQ_INFLIGHT;
 				atomic_inc(&data->hctx->nr_active);
 			}
-			rq->tag = tag;
-			rq->internal_tag = -1;
-			data->hctx->tags->rqs[rq->tag] = rq;
+			data->hctx->tags->rqs[tag] = rq;
+
+			/*
+			 * If we use hw tag for scheduling, postpone setting
+			 * rq->tag in blk_mq_get_driver_tag().
+			 */
+			if (data->hctx->flags & BLK_MQ_F_SCHED_USE_HW_TAG) {
+				rq->tag = -1;
+				rq->internal_tag = tag;
+			} else {
+				rq->tag = tag;
+				rq->internal_tag = -1;
+			}
 		}
 
 		if (data->flags & BLK_MQ_REQ_RESERVED)
@@ -368,7 +378,7 @@ void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
 	clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
 	if (rq->tag != -1)
 		blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
-	if (sched_tag != -1)
+	if (sched_tag != -1 && !(hctx->flags & BLK_MQ_F_SCHED_USE_HW_TAG))
 		blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag);
 	blk_mq_sched_restart(hctx);
 	blk_queue_exit(q);
@@ -872,6 +882,12 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
 	if (rq->tag != -1)
 		goto done;
 
+	/* we buffered driver tag in rq->internal_tag */
+	if (data.hctx->flags & BLK_MQ_F_SCHED_USE_HW_TAG) {
+		rq->tag = rq->internal_tag;
+		goto done;
+	}
+
 	if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
 		data.flags |= BLK_MQ_REQ_RESERVED;
 
@@ -893,9 +909,15 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
 static void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
 				    struct request *rq)
 {
-	blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag);
+	unsigned tag = rq->tag;
+
 	rq->tag = -1;
 
+	if (hctx->flags & BLK_MQ_F_SCHED_USE_HW_TAG)
+		return;
+
+	blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, tag);
+
 	if (rq->rq_flags & RQF_MQ_INFLIGHT) {
 		rq->rq_flags &= ~RQF_MQ_INFLIGHT;
 		atomic_dec(&hctx->nr_active);
@@ -2865,7 +2887,8 @@ bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
 		blk_flush_plug_list(plug, false);
 
 	hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
-	if (!blk_qc_t_is_internal(cookie))
+	if (!blk_qc_t_is_internal(cookie) || (hctx->flags &
+			BLK_MQ_F_SCHED_USE_HW_TAG))
 		rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
 	else {
 		rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie));
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 32bd8eb5ba67..53f24df91a05 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -162,6 +162,7 @@ enum {
 	BLK_MQ_F_SG_MERGE	= 1 << 2,
 	BLK_MQ_F_BLOCKING	= 1 << 5,
 	BLK_MQ_F_NO_SCHED	= 1 << 6,
+	BLK_MQ_F_SCHED_USE_HW_TAG	= 1 << 7,
 	BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
 	BLK_MQ_F_ALLOC_POLICY_BITS = 1,
 
-- 
2.9.3


* [PATCH 2/4] blk-mq: introduce blk_mq_get_queue_depth()
  2017-04-28 15:15 [PATCH 0/4] blk-mq: support to use hw tag for scheduling Ming Lei
  2017-04-28 15:15 ` [PATCH 1/4] blk-mq: introduce BLK_MQ_F_SCHED_USE_HW_TAG Ming Lei
@ 2017-04-28 15:15 ` Ming Lei
  2017-04-28 18:23   ` Jens Axboe
  2017-05-03 16:55   ` Omar Sandoval
  2017-04-28 15:15 ` [PATCH 3/4] blk-mq: use hw tag for scheduling if hw tag space is big enough Ming Lei
                   ` (2 subsequent siblings)
  4 siblings, 2 replies; 57+ messages in thread
From: Ming Lei @ 2017-04-28 15:15 UTC (permalink / raw)
  To: Jens Axboe, linux-block; +Cc: Christoph Hellwig, Omar Sandoval, Ming Lei

The hardware queue depth can be resized via blk_mq_update_nr_requests(),
so introduce this helper for easily retrieving the queue's current depth.
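
Patch 3 in this series uses it roughly like this when deciding whether
scheduling can be done with hw tags (simplified excerpt):

	if (blk_mq_get_queue_depth(q) < q->nr_requests)
		return false;	/* not enough hw tags to schedule with */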

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-mq.c | 12 ++++++++++++
 block/blk-mq.h |  1 +
 2 files changed, 13 insertions(+)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index e530bc54f0d9..04761fb76ab4 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2637,6 +2637,18 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
 }
 EXPORT_SYMBOL(blk_mq_free_tag_set);
 
+/*
+ * Queue depth can be changed via blk_mq_update_nr_requests(),
+ * so use this helper to retrieve queue's depth.
+ */
+int blk_mq_get_queue_depth(struct request_queue *q)
+{
+	/* All queues have same queue depth */
+	struct blk_mq_tags	*tags = q->tag_set->tags[0];
+
+	return tags->bitmap_tags.sb.depth;
+}
+
 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
 {
 	struct blk_mq_tag_set *set = q->tag_set;
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 2814a14e529c..8085d5989cf5 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -166,6 +166,7 @@ void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
 void blk_mq_finish_request(struct request *rq);
 struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data,
 					unsigned int op);
+int blk_mq_get_queue_depth(struct request_queue *q);
 
 static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx)
 {
-- 
2.9.3


* [PATCH 3/4] blk-mq: use hw tag for scheduling if hw tag space is big enough
  2017-04-28 15:15 [PATCH 0/4] blk-mq: support to use hw tag for scheduling Ming Lei
  2017-04-28 15:15 ` [PATCH 1/4] blk-mq: introduce BLK_MQ_F_SCHED_USE_HW_TAG Ming Lei
  2017-04-28 15:15 ` [PATCH 2/4] blk-mq: introduce blk_mq_get_queue_depth() Ming Lei
@ 2017-04-28 15:15 ` Ming Lei
  2017-04-28 18:09   ` Bart Van Assche
                     ` (2 more replies)
  2017-04-28 15:15 ` [PATCH 4/4] blk-mq: dump new introduced flag of BLK_MQ_F_SCHED_USE_HW_TAG Ming Lei
  2017-04-28 20:29 ` [PATCH 0/4] blk-mq: support to use hw tag for scheduling Jens Axboe
  4 siblings, 3 replies; 57+ messages in thread
From: Ming Lei @ 2017-04-28 15:15 UTC (permalink / raw)
  To: Jens Axboe, linux-block; +Cc: Christoph Hellwig, Omar Sandoval, Ming Lei

When a device's tag space is big enough, use the hw tag directly
for I/O scheduling.

For now, the decision is made only if the hw queue depth is not
less than q->nr_requests and the tag set isn't shared.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-mq-sched.c |  8 ++++++++
 block/blk-mq-sched.h | 15 +++++++++++++++
 block/blk-mq.c       | 18 +++++++++++++++++-
 3 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 45a675f07b8b..4681e27c127e 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -507,6 +507,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
 	struct elevator_queue *eq;
 	unsigned int i;
 	int ret;
+	bool auto_hw_tag;
 
 	if (!e) {
 		q->elevator = NULL;
@@ -519,7 +520,14 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
 	 */
 	q->nr_requests = 2 * BLKDEV_MAX_RQ;
 
+	auto_hw_tag = blk_mq_sched_may_use_hw_tag(q);
+
 	queue_for_each_hw_ctx(q, hctx, i) {
+		if (auto_hw_tag)
+			hctx->flags |= BLK_MQ_F_SCHED_USE_HW_TAG;
+		else
+			hctx->flags &= ~BLK_MQ_F_SCHED_USE_HW_TAG;
+
 		ret = blk_mq_sched_alloc_tags(q, hctx, i);
 		if (ret)
 			goto err;
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index edafb5383b7b..22a19c118044 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -129,4 +129,19 @@ static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx)
 	return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
 }
 
+/*
+ * If this queue has enough hardware tags and doesn't share tags with
+ * other queues, just use hw tag directly for scheduling.
+ */
+static inline bool blk_mq_sched_may_use_hw_tag(struct request_queue *q)
+{
+	if (q->tag_set->flags & BLK_MQ_F_TAG_SHARED)
+		return false;
+
+	if (blk_mq_get_queue_depth(q) < q->nr_requests)
+		return false;
+
+	return true;
+}
+
 #endif
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 04761fb76ab4..b0bd1fb4b0f8 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2649,6 +2649,19 @@ int blk_mq_get_queue_depth(struct request_queue *q)
 	return tags->bitmap_tags.sb.depth;
 }
 
+static void blk_mq_update_sched_flag(struct request_queue *q)
+{
+	struct blk_mq_hw_ctx *hctx;
+	int i;
+
+	if (!blk_mq_sched_may_use_hw_tag(q))
+		queue_for_each_hw_ctx(q, hctx, i)
+			hctx->flags &= ~BLK_MQ_F_SCHED_USE_HW_TAG;
+	else
+		queue_for_each_hw_ctx(q, hctx, i)
+			hctx->flags |= BLK_MQ_F_SCHED_USE_HW_TAG;
+}
+
 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
 {
 	struct blk_mq_tag_set *set = q->tag_set;
@@ -2681,9 +2694,12 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
 			break;
 	}
 
-	if (!ret)
+	if (!ret) {
 		q->nr_requests = nr;
 
+		blk_mq_update_sched_flag(q);
+	}
+
 	blk_mq_unfreeze_queue(q);
 	blk_mq_start_stopped_hw_queues(q, true);
 
-- 
2.9.3


* [PATCH 4/4] blk-mq: dump new introduced flag of BLK_MQ_F_SCHED_USE_HW_TAG
  2017-04-28 15:15 [PATCH 0/4] blk-mq: support to use hw tag for scheduling Ming Lei
                   ` (2 preceding siblings ...)
  2017-04-28 15:15 ` [PATCH 3/4] blk-mq: use hw tag for scheduling if hw tag space is big enough Ming Lei
@ 2017-04-28 15:15 ` Ming Lei
  2017-04-28 18:10   ` Bart Van Assche
  2017-04-28 20:29 ` [PATCH 0/4] blk-mq: support to use hw tag for scheduling Jens Axboe
  4 siblings, 1 reply; 57+ messages in thread
From: Ming Lei @ 2017-04-28 15:15 UTC (permalink / raw)
  To: Jens Axboe, linux-block; +Cc: Christoph Hellwig, Omar Sandoval, Ming Lei

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-mq-debugfs.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index bcd2a7d4a3a5..bc390847a60d 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -220,6 +220,7 @@ static const char *const hctx_flag_name[] = {
 	[ilog2(BLK_MQ_F_SG_MERGE)]	= "SG_MERGE",
 	[ilog2(BLK_MQ_F_BLOCKING)]	= "BLOCKING",
 	[ilog2(BLK_MQ_F_NO_SCHED)]	= "NO_SCHED",
+	[ilog2(BLK_MQ_F_SCHED_USE_HW_TAG)]	= "SCHED_USE_HW_TAG",
 };
 
 static int hctx_flags_show(struct seq_file *m, void *v)
-- 
2.9.3


* Re: [PATCH 3/4] blk-mq: use hw tag for scheduling if hw tag space is big enough
  2017-04-28 15:15 ` [PATCH 3/4] blk-mq: use hw tag for scheduling if hw tag space is big enough Ming Lei
@ 2017-04-28 18:09   ` Bart Van Assche
  2017-04-29 10:35     ` Ming Lei
  2017-04-28 18:22   ` Jens Axboe
  2017-05-03 16:29   ` Omar Sandoval
  2 siblings, 1 reply; 57+ messages in thread
From: Bart Van Assche @ 2017-04-28 18:09 UTC (permalink / raw)
  To: linux-block, axboe, ming.lei; +Cc: hch, osandov

On Fri, 2017-04-28 at 23:15 +0800, Ming Lei wrote:
> +static inline bool blk_mq_sched_may_use_hw_tag(struct request_queue *q)
> +{
> +	if (q->tag_set->flags & BLK_MQ_F_TAG_SHARED)
> +		return false;
> +
> +	if (blk_mq_get_queue_depth(q) < q->nr_requests)
> +		return false;
> +
> +	return true;
> +}

The only user of shared tag sets I know of is scsi-mq. I think it's really
unfortunate that this patch systematically disables BLK_MQ_F_SCHED_USE_HW_TAG
for scsi-mq.

>  int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
>  {
>  	struct blk_mq_tag_set *set = q->tag_set;
> @@ -2681,9 +2694,12 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
>  			break;
>  	}
> 
> -	if (!ret)
> +	if (!ret) {
>  		q->nr_requests = nr;
> 
> +		blk_mq_update_sched_flag(q);
> +	}
> +
>  	blk_mq_unfreeze_queue(q);
>  	blk_mq_start_stopped_hw_queues(q, true);

If a queue is created with a low value of nr_requests that will cause
blk_mq_sched_alloc_tags() to skip allocation of .sched_tags. If nr_requests
is increased, can that cause this function to clear BLK_MQ_F_SCHED_USE_HW_TAG
while keeping hctx->sched_tags == NULL and hence trigger a NULL pointer
dereference?
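
(For reference, the dereference would presumably come from the
allocation path, roughly:

	blk_mq_update_nr_requests()	/* clears USE_HW_TAG, sched_tags still NULL */
	blk_mq_sched_get_request()	/* sets BLK_MQ_REQ_INTERNAL again */
	  -> blk_mq_get_tag()		/* picks hctx->sched_tags, which is NULL */

assuming the tag is taken from hctx->sched_tags whenever
BLK_MQ_REQ_INTERNAL is set.)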

Bart.


* Re: [PATCH 4/4] blk-mq: dump new introduced flag of BLK_MQ_F_SCHED_USE_HW_TAG
  2017-04-28 15:15 ` [PATCH 4/4] blk-mq: dump new introduced flag of BLK_MQ_F_SCHED_USE_HW_TAG Ming Lei
@ 2017-04-28 18:10   ` Bart Van Assche
  2017-04-29 11:00     ` Ming Lei
  0 siblings, 1 reply; 57+ messages in thread
From: Bart Van Assche @ 2017-04-28 18:10 UTC (permalink / raw)
  To: linux-block, axboe, ming.lei; +Cc: hch, osandov

On Fri, 2017-04-28 at 23:15 +0800, Ming Lei wrote:
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
>  block/blk-mq-debugfs.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
> index bcd2a7d4a3a5..bc390847a60d 100644
> --- a/block/blk-mq-debugfs.c
> +++ b/block/blk-mq-debugfs.c
> @@ -220,6 +220,7 @@ static const char *const hctx_flag_name[] = {
>  	[ilog2(BLK_MQ_F_SG_MERGE)]	= "SG_MERGE",
>  	[ilog2(BLK_MQ_F_BLOCKING)]	= "BLOCKING",
>  	[ilog2(BLK_MQ_F_NO_SCHED)]	= "NO_SCHED",
> +	[ilog2(BLK_MQ_F_SCHED_USE_HW_TAG)]	= "SCHED_USE_HW_TAG",
>  };
> 
>  static int hctx_flags_show(struct seq_file *m, void *v)

Hello Ming,

Please combine patches 1/4 and 4/4.

Thanks,

Bart.


* Re: [PATCH 3/4] blk-mq: use hw tag for scheduling if hw tag space is big enough
  2017-04-28 15:15 ` [PATCH 3/4] blk-mq: use hw tag for scheduling if hw tag space is big enough Ming Lei
  2017-04-28 18:09   ` Bart Van Assche
@ 2017-04-28 18:22   ` Jens Axboe
  2017-04-28 20:11     ` Bart Van Assche
  2017-04-29 10:59     ` Ming Lei
  2017-05-03 16:29   ` Omar Sandoval
  2 siblings, 2 replies; 57+ messages in thread
From: Jens Axboe @ 2017-04-28 18:22 UTC (permalink / raw)
  To: Ming Lei, linux-block; +Cc: Christoph Hellwig, Omar Sandoval

On 04/28/2017 09:15 AM, Ming Lei wrote:  
> +/*
> + * If this queue has enough hardware tags and doesn't share tags with
> + * other queues, just use hw tag directly for scheduling.
> + */
> +static inline bool blk_mq_sched_may_use_hw_tag(struct request_queue *q)
> +{
> +	if (q->tag_set->flags & BLK_MQ_F_TAG_SHARED)
> +		return false;
> +
> +	if (blk_mq_get_queue_depth(q) < q->nr_requests)
> +		return false;

I think we should leave a bigger gap. Ideally, for scheduling, we should
have a hw queue depth that's around half of what the scheduler has to
work with. That will always leave us something to schedule with, if the
hw can't deplete the whole pool.

-- 
Jens Axboe


* Re: [PATCH 2/4] blk-mq: introduce blk_mq_get_queue_depth()
  2017-04-28 15:15 ` [PATCH 2/4] blk-mq: introduce blk_mq_get_queue_depth() Ming Lei
@ 2017-04-28 18:23   ` Jens Axboe
  2017-04-29  9:55     ` Ming Lei
  2017-05-03 16:55   ` Omar Sandoval
  1 sibling, 1 reply; 57+ messages in thread
From: Jens Axboe @ 2017-04-28 18:23 UTC (permalink / raw)
  To: Ming Lei, linux-block; +Cc: Christoph Hellwig, Omar Sandoval

On 04/28/2017 09:15 AM, Ming Lei wrote:
> The hardware queue depth can be resized via blk_mq_update_nr_requests(),
> so introduce this helper for retrieving queue's depth easily.
> 
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
>  block/blk-mq.c | 12 ++++++++++++
>  block/blk-mq.h |  1 +
>  2 files changed, 13 insertions(+)
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index e530bc54f0d9..04761fb76ab4 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -2637,6 +2637,18 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
>  }
>  EXPORT_SYMBOL(blk_mq_free_tag_set);
>  
> +/*
> + * Queue depth can be changed via blk_mq_update_nr_requests(),
> + * so use this helper to retrieve queue's depth.
> + */
> +int blk_mq_get_queue_depth(struct request_queue *q)
> +{
> +	/* All queues have same queue depth */
> +	struct blk_mq_tags	*tags = q->tag_set->tags[0];
> +
> +	return tags->bitmap_tags.sb.depth;
> +}

What about the per-hw queue tag space? q->nr_requests is device side,
this might not be.

-- 
Jens Axboe


* Re: [PATCH 3/4] blk-mq: use hw tag for scheduling if hw tag space is big enough
  2017-04-28 18:22   ` Jens Axboe
@ 2017-04-28 20:11     ` Bart Van Assche
  2017-04-29 10:59     ` Ming Lei
  1 sibling, 0 replies; 57+ messages in thread
From: Bart Van Assche @ 2017-04-28 20:11 UTC (permalink / raw)
  To: linux-block, axboe, ming.lei; +Cc: hch, osandov

On Fri, 2017-04-28 at 12:22 -0600, Jens Axboe wrote:
> On 04/28/2017 09:15 AM, Ming Lei wrote:
> > +/*
> > + * If this queue has enough hardware tags and doesn't share tags with
> > + * other queues, just use hw tag directly for scheduling.
> > + */
> > +static inline bool blk_mq_sched_may_use_hw_tag(struct request_queue *q)
> > +{
> > +	if (q->tag_set->flags & BLK_MQ_F_TAG_SHARED)
> > +		return false;
> > +
> > +	if (blk_mq_get_queue_depth(q) < q->nr_requests)
> > +		return false;
> 
> I think we should leave a bigger gap. Ideally, for scheduling, we should
> have a hw queue depth that's around half of what the scheduler has to
> work with. That will always leave us something to schedule with, if the
> hw can't deplete the whole pool.

Hello Jens,

The scsi-mq core allocates exactly the same number of tags per hardware
queue as the SCSI queue depth. Requiring that there is a gap would cause
BLK_MQ_F_SCHED_USE_HW_TAG not to be enabled for any scsi-mq LLD. I'm not
sure that changing the tag allocation strategy in scsi-mq would be the best
solution. How about changing blk_mq_sched_may_use_hw_tag() into something
like the below to guarantee that the scheduler has sufficient tags available?

static bool blk_mq_sched_may_use_hw_tag(struct request_queue *q)
{
	return blk_mq_get_queue_depth(q) >= max(q->nr_requests, 16);
}

Thanks,

Bart.


* Re: [PATCH 0/4] blk-mq: support to use hw tag for scheduling
  2017-04-28 15:15 [PATCH 0/4] blk-mq: support to use hw tag for scheduling Ming Lei
                   ` (3 preceding siblings ...)
  2017-04-28 15:15 ` [PATCH 4/4] blk-mq: dump new introduced flag of BLK_MQ_F_SCHED_USE_HW_TAG Ming Lei
@ 2017-04-28 20:29 ` Jens Axboe
  2017-05-03  4:03   ` Ming Lei
  4 siblings, 1 reply; 57+ messages in thread
From: Jens Axboe @ 2017-04-28 20:29 UTC (permalink / raw)
  To: Ming Lei, linux-block; +Cc: Christoph Hellwig, Omar Sandoval

On 04/28/2017 09:15 AM, Ming Lei wrote:
> Hi,
> 
> This patchset introduces flag of BLK_MQ_F_SCHED_USE_HW_TAG and
> allows to use hardware tag directly for IO scheduling if the queue's
> depth is big enough. In this way, we can avoid to allocate extra tags
> and request pool for IO schedule, and the schedule tag allocation/release
> can be saved in I/O submit path.

Ming, I like this approach, it's pretty clean. It'd be nice to have a
bit of performance data to back up that it's useful to add this code,
though.  Have you run anything on eg kyber on nvme that shows a
reduction in overhead when getting rid of separate scheduler tags?

-- 
Jens Axboe


* Re: [PATCH 2/4] blk-mq: introduce blk_mq_get_queue_depth()
  2017-04-28 18:23   ` Jens Axboe
@ 2017-04-29  9:55     ` Ming Lei
  0 siblings, 0 replies; 57+ messages in thread
From: Ming Lei @ 2017-04-29  9:55 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block, Christoph Hellwig, Omar Sandoval

On Fri, Apr 28, 2017 at 12:23:38PM -0600, Jens Axboe wrote:
> On 04/28/2017 09:15 AM, Ming Lei wrote:
> > The hardware queue depth can be resized via blk_mq_update_nr_requests(),
> > so introduce this helper for retrieving queue's depth easily.
> > 
> > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> > ---
> >  block/blk-mq.c | 12 ++++++++++++
> >  block/blk-mq.h |  1 +
> >  2 files changed, 13 insertions(+)
> > 
> > diff --git a/block/blk-mq.c b/block/blk-mq.c
> > index e530bc54f0d9..04761fb76ab4 100644
> > --- a/block/blk-mq.c
> > +++ b/block/blk-mq.c
> > @@ -2637,6 +2637,18 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
> >  }
> >  EXPORT_SYMBOL(blk_mq_free_tag_set);
> >  
> > +/*
> > + * Queue depth can be changed via blk_mq_update_nr_requests(),
> > + * so use this helper to retrieve queue's depth.
> > + */
> > +int blk_mq_get_queue_depth(struct request_queue *q)
> > +{
> > +	/* All queues have same queue depth */
> > +	struct blk_mq_tags	*tags = q->tag_set->tags[0];
> > +
> > +	return tags->bitmap_tags.sb.depth;
> > +}
> 
> What about the per-hw queue tag space? q->nr_requests is device side,
> this might not be.

This helper returns the hw queue's actual queue depth, so only the hw
tag space is involved.

We don't have a per-hw-queue tag space yet; the tag space actually
belongs to the tag set, though the number of hw queues can be one. And
right now all hw queues have the same queue depth: either it is
set->queue_depth, or the new value set via blk_mq_update_nr_requests().
The tags' depth is always the latest value.

A similar comment can be found in kyber_sched_tags_shift() too.

BTW, this patch missed reserved tags, will fix it in V1.

Thanks,
Ming


* Re: [PATCH 3/4] blk-mq: use hw tag for scheduling if hw tag space is big enough
  2017-04-28 18:09   ` Bart Van Assche
@ 2017-04-29 10:35     ` Ming Lei
  2017-05-01 15:06       ` Bart Van Assche
  0 siblings, 1 reply; 57+ messages in thread
From: Ming Lei @ 2017-04-29 10:35 UTC (permalink / raw)
  To: Bart Van Assche; +Cc: linux-block, axboe, hch, osandov

On Fri, Apr 28, 2017 at 06:09:40PM +0000, Bart Van Assche wrote:
> On Fri, 2017-04-28 at 23:15 +0800, Ming Lei wrote:
> > +static inline bool blk_mq_sched_may_use_hw_tag(struct request_queue *q)
> > +{
> > +	if (q->tag_set->flags & BLK_MQ_F_TAG_SHARED)
> > +		return false;
> > +
> > +	if (blk_mq_get_queue_depth(q) < q->nr_requests)
> > +		return false;
> > +
> > +	return true;
> > +}
> 
> The only user of shared tag sets I know of is scsi-mq. I think it's really
> unfortunate that this patch systematically disables BLK_MQ_F_SCHED_USE_HW_TAG
> for scsi-mq.

In previous patch, I actually allow driver to pass this flag, but this
feature is dropped in this post, just for making it simple & clean.
If you think we need it for shared tag set, I can add it in v1.

For shared tag sets, I suggest to not enable it at default, because
scheduler is per request queue now, and generally the more requests are
available, the better it performs.  When tags are shared among several request
queues, one of them may use tags up for its own scheduling, then
starve others. But it should be possible and not difficult to allocate
requests fairly for scheduling in this case if we switch to per-hctx
scheduling.

> 
> >  int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
> >  {
> >  	struct blk_mq_tag_set *set = q->tag_set;
> > @@ -2681,9 +2694,12 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
> >  			break;
> >  	}
> >  
> > -	if (!ret)
> > +	if (!ret) {
> >  		q->nr_requests = nr;
> >  
> > +		blk_mq_update_sched_flag(q);
> > +	}
> > +
> >  	blk_mq_unfreeze_queue(q);
> >  	blk_mq_start_stopped_hw_queues(q, true);
> 
> If a queue is created with a low value of nr_requests that will cause
> blk_mq_sched_alloc_tags() to skip allocation of .sched_tags. If nr_requests
> is increased, can that cause this function to clear BLK_MQ_F_SCHED_USE_HW_TAG
> while keeping hctx->sched_tags == NULL and hence trigger a NULL pointer
> dereference?

Good catch, will fix it in V1.

Given both scheduler switch and updating requests number are not
frequent actions, we can allocate/deallocate .sched_tags just
in need.


Thanks,
Ming


* Re: [PATCH 3/4] blk-mq: use hw tag for scheduling if hw tag space is big enough
  2017-04-28 18:22   ` Jens Axboe
  2017-04-28 20:11     ` Bart Van Assche
@ 2017-04-29 10:59     ` Ming Lei
  1 sibling, 0 replies; 57+ messages in thread
From: Ming Lei @ 2017-04-29 10:59 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block, Christoph Hellwig, Omar Sandoval

On Fri, Apr 28, 2017 at 12:22:45PM -0600, Jens Axboe wrote:
> On 04/28/2017 09:15 AM, Ming Lei wrote:  
> > +/*
> > + * If this queue has enough hardware tags and doesn't share tags with
> > + * other queues, just use hw tag directly for scheduling.
> > + */
> > +static inline bool blk_mq_sched_may_use_hw_tag(struct request_queue *q)
> > +{
> > +	if (q->tag_set->flags & BLK_MQ_F_TAG_SHARED)
> > +		return false;
> > +
> > +	if (blk_mq_get_queue_depth(q) < q->nr_requests)
> > +		return false;
> 
> I think we should leave a bigger gap. Ideally, for scheduling, we should
> have a hw queue depth that's around half of what the scheduler has to
> work with. That will always leave us something to schedule with, if the
> hw can't deplete the whole pool.

When .sched_tags and .tags are different, it makes sense to make
nr_requests two times the queue_depth.

When we switch to scheduling with hw tags directly, that equation
can't hold at all. The simple policy in this patch can't be worse
than standalone .sched_tags, because the lifetime of one sched tag is
actually the same as the request's (from its allocation <io submission>
to its freeing <io completion>). And with a bigger queue depth we can
even schedule more.


Thanks,
Ming


* Re: [PATCH 4/4] blk-mq: dump new introduced flag of BLK_MQ_F_SCHED_USE_HW_TAG
  2017-04-28 18:10   ` Bart Van Assche
@ 2017-04-29 11:00     ` Ming Lei
  0 siblings, 0 replies; 57+ messages in thread
From: Ming Lei @ 2017-04-29 11:00 UTC (permalink / raw)
  To: Bart Van Assche; +Cc: linux-block, axboe, hch, osandov

On Fri, Apr 28, 2017 at 06:10:22PM +0000, Bart Van Assche wrote:
> On Fri, 2017-04-28 at 23:15 +0800, Ming Lei wrote:
> > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> > ---
> >  block/blk-mq-debugfs.c | 1 +
> >  1 file changed, 1 insertion(+)
> > 
> > diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
> > index bcd2a7d4a3a5..bc390847a60d 100644
> > --- a/block/blk-mq-debugfs.c
> > +++ b/block/blk-mq-debugfs.c
> > @@ -220,6 +220,7 @@ static const char *const hctx_flag_name[] = {
> >  	[ilog2(BLK_MQ_F_SG_MERGE)]	= "SG_MERGE",
> >  	[ilog2(BLK_MQ_F_BLOCKING)]	= "BLOCKING",
> >  	[ilog2(BLK_MQ_F_NO_SCHED)]	= "NO_SCHED",
> > +	[ilog2(BLK_MQ_F_SCHED_USE_HW_TAG)]	= "SCHED_USE_HW_TAG",
> >  };
> >  
> >  static int hctx_flags_show(struct seq_file *m, void *v)
> 
> Hello Ming,
> 
> Please combine patches 1/4 and 4/4.

OK!


Thanks,
Ming


* Re: [PATCH 3/4] blk-mq: use hw tag for scheduling if hw tag space is big enough
  2017-04-29 10:35     ` Ming Lei
@ 2017-05-01 15:06       ` Bart Van Assche
  2017-05-02  3:49         ` Omar Sandoval
  2017-05-02  8:46         ` Ming Lei
  0 siblings, 2 replies; 57+ messages in thread
From: Bart Van Assche @ 2017-05-01 15:06 UTC (permalink / raw)
  To: ming.lei; +Cc: hch, linux-block, osandov, axboe

On Sat, 2017-04-29 at 18:35 +0800, Ming Lei wrote:
> On Fri, Apr 28, 2017 at 06:09:40PM +0000, Bart Van Assche wrote:
> > On Fri, 2017-04-28 at 23:15 +0800, Ming Lei wrote:
> > > +static inline bool blk_mq_sched_may_use_hw_tag(struct request_queue *q)
> > > +{
> > > +	if (q->tag_set->flags & BLK_MQ_F_TAG_SHARED)
> > > +		return false;
> > > +
> > > +	if (blk_mq_get_queue_depth(q) < q->nr_requests)
> > > +		return false;
> > > +
> > > +	return true;
> > > +}
> > 
> > The only user of shared tag sets I know of is scsi-mq. I think it's really
> > unfortunate that this patch systematically disables BLK_MQ_F_SCHED_USE_HW_TAG
> > for scsi-mq.
> 
> In previous patch, I actually allow driver to pass this flag, but this
> feature is dropped in this post, just for making it simple & clean.
> If you think we need it for shared tag set, I can add it in v1.
> 
> For shared tag sets, I suggest to not enable it at default, because
> scheduler is per request queue now, and generaly more requests available,
> better it performs.  When tags are shared among several request
> queues, one of them may use tags up for its own scheduling, then
> starve others. But it should be possible and not difficult to allocate
> requests fairly for scheduling in this case if we switch to per-hctx
> scheduling.

Hello Ming,

Have you noticed that there is already a mechanism in the block layer to
avoid starvation if a tag set is shared? The hctx_may_queue() function
guarantees that each user that shares a tag set gets at least some tags.
The .active_queues counter keeps track of the number of hardware queues
that share a tag set.

Bart.


* Re: [PATCH 3/4] blk-mq: use hw tag for scheduling if hw tag space is big enough
  2017-05-01 15:06       ` Bart Van Assche
@ 2017-05-02  3:49         ` Omar Sandoval
  2017-05-02  8:46         ` Ming Lei
  1 sibling, 0 replies; 57+ messages in thread
From: Omar Sandoval @ 2017-05-02  3:49 UTC (permalink / raw)
  To: Bart Van Assche; +Cc: ming.lei, hch, linux-block, osandov, axboe

On Mon, May 01, 2017 at 03:06:16PM +0000, Bart Van Assche wrote:
> On Sat, 2017-04-29 at 18:35 +0800, Ming Lei wrote:
> > On Fri, Apr 28, 2017 at 06:09:40PM +0000, Bart Van Assche wrote:
> > > On Fri, 2017-04-28 at 23:15 +0800, Ming Lei wrote:
> > > > +static inline bool blk_mq_sched_may_use_hw_tag(struct request_queue *q)
> > > > +{
> > > > +	if (q->tag_set->flags & BLK_MQ_F_TAG_SHARED)
> > > > +		return false;
> > > > +
> > > > +	if (blk_mq_get_queue_depth(q) < q->nr_requests)
> > > > +		return false;
> > > > +
> > > > +	return true;
> > > > +}
> > > 
> > > The only user of shared tag sets I know of is scsi-mq. I think it's really
> > > unfortunate that this patch systematically disables BLK_MQ_F_SCHED_USE_HW_TAG
> > > for scsi-mq.
> > 
> > In previous patch, I actually allow driver to pass this flag, but this
> > feature is dropped in this post, just for making it simple & clean.
> > If you think we need it for shared tag set, I can add it in v1.
> > 
> > For shared tag sets, I suggest to not enable it at default, because
> > scheduler is per request queue now, and generaly more requests available,
> > better it performs.  When tags are shared among several request
> > queues, one of them may use tags up for its own scheduling, then
> > starve others. But it should be possible and not difficult to allocate
> > requests fairly for scheduling in this case if we switch to per-hctx
> > scheduling.
> 
> Hello Ming,
> 
> Have you noticed that there is already a mechanism in the block layer to
> avoid starvation if a tag set is shared? The hctx_may_queue() function
> guarantees that each user that shares a tag set gets at least some tags.
> The .active_queues counter keeps track of the number of hardware queues
> that share a tag set.
> 
> Bart.

The scheduler tags are there to abstract away the hardware, and
USE_HW_TAG should just be an optimization for when that abstraction is a
noop. That's not the case when there are shared tags, and I doubt that
the overhead of the scheduler tags is significant for scsi-mq. Let's
stick with the behavior Ming had here.


* Re: [PATCH 3/4] blk-mq: use hw tag for scheduling if hw tag space is big enough
  2017-05-01 15:06       ` Bart Van Assche
  2017-05-02  3:49         ` Omar Sandoval
@ 2017-05-02  8:46         ` Ming Lei
  1 sibling, 0 replies; 57+ messages in thread
From: Ming Lei @ 2017-05-02  8:46 UTC (permalink / raw)
  To: Bart Van Assche; +Cc: hch, linux-block, osandov, axboe

On Mon, May 01, 2017 at 03:06:16PM +0000, Bart Van Assche wrote:
> On Sat, 2017-04-29 at 18:35 +0800, Ming Lei wrote:
> > On Fri, Apr 28, 2017 at 06:09:40PM +0000, Bart Van Assche wrote:
> > > On Fri, 2017-04-28 at 23:15 +0800, Ming Lei wrote:
> > > > +static inline bool blk_mq_sched_may_use_hw_tag(struct request_queue *q)
> > > > +{
> > > > +	if (q->tag_set->flags & BLK_MQ_F_TAG_SHARED)
> > > > +		return false;
> > > > +
> > > > +	if (blk_mq_get_queue_depth(q) < q->nr_requests)
> > > > +		return false;
> > > > +
> > > > +	return true;
> > > > +}
> > > 
> > > The only user of shared tag sets I know of is scsi-mq. I think it's really
> > > unfortunate that this patch systematically disables BLK_MQ_F_SCHED_USE_HW_TAG
> > > for scsi-mq.
> > 
> > In previous patch, I actually allow driver to pass this flag, but this
> > feature is dropped in this post, just for making it simple & clean.
> > If you think we need it for shared tag set, I can add it in v1.
> > 
> > For shared tag sets, I suggest to not enable it at default, because
> > scheduler is per request queue now, and generaly more requests available,
> > better it performs.  When tags are shared among several request
> > queues, one of them may use tags up for its own scheduling, then
> > starve others. But it should be possible and not difficult to allocate
> > requests fairly for scheduling in this case if we switch to per-hctx
> > scheduling.
> 
> Hello Ming,
> 
> Have you noticed that there is already a mechanism in the block layer to
> avoid starvation if a tag set is shared? The hctx_may_queue() function
> guarantees that each user that shares a tag set gets at least some tags.
> The .active_queues counter keeps track of the number of hardware queues
> that share a tag set.

OK, from hctx_may_queue(), each hw queue can be allocated at most
(queue depth / nr_hw_queues) tags, so we can allow the hw tag to be
used for the scheduler too if the following condition is true for
shared tags:

	q->nr_requests <= (blk_mq_get_queue_depth(q) / nr_hw_queues)

It should often be true for some SCSI devices (such as virtio-scsi)
in some configurations.
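
For instance, with made-up numbers: a tag-set depth of 256 spread over
4 hw queues gives each queue at most

	256 / 4 = 64

tags via hctx_may_queue(), so hw-tag scheduling would only be safe
there if q->nr_requests <= 64.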


thanks,
Ming


* Re: [PATCH 0/4] blk-mq: support to use hw tag for scheduling
  2017-04-28 20:29 ` [PATCH 0/4] blk-mq: support to use hw tag for scheduling Jens Axboe
@ 2017-05-03  4:03   ` Ming Lei
  2017-05-03 14:08     ` Jens Axboe
  0 siblings, 1 reply; 57+ messages in thread
From: Ming Lei @ 2017-05-03  4:03 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block, Christoph Hellwig, Omar Sandoval

On Fri, Apr 28, 2017 at 02:29:18PM -0600, Jens Axboe wrote:
> On 04/28/2017 09:15 AM, Ming Lei wrote:
> > Hi,
> > 
> > This patchset introduces flag of BLK_MQ_F_SCHED_USE_HW_TAG and
> > allows to use hardware tag directly for IO scheduling if the queue's
> > depth is big enough. In this way, we can avoid to allocate extra tags
> > and request pool for IO schedule, and the schedule tag allocation/release
> > can be saved in I/O submit path.
> 
> Ming, I like this approach, it's pretty clean. It'd be nice to have a
> bit of performance data to back up that it's useful to add this code,
> though.  Have you run anything on eg kyber on nvme that shows a
> reduction in overhead when getting rid of separate scheduler tags?

I can observe small improvement in the following tests:

1) fio script
# io scheduler: kyber

RWS="randread read randwrite write"
for RW in $RWS; do
        echo "Running test $RW"
        sudo echo 3 > /proc/sys/vm/drop_caches
        sudo fio --direct=1 --size=128G --bsrange=4k-4k --runtime=20 --numjobs=1 --ioengine=libaio --iodepth=10240 --group_reporting=1 --filename=$DISK --name=$DISK-test-$RW --rw=$RW --output-format=json
done

2) results

---------------------------------------------------------
			|sched tag(iops/lat)	| use hw tag to sched(iops/lat)
----------------------------------------------------------
randread	|188940/54107			| 193865/52734
----------------------------------------------------------
read		|192646/53069			| 199738/51188
----------------------------------------------------------
randwrite	|171048/59777			| 179038/57112
----------------------------------------------------------
write		|171886/59492			| 181029/56491
----------------------------------------------------------

I guess it may be a bit more obvious when running the test on one slow
NVMe device, and will try to find one and run the test again.


thanks,
Ming


* Re: [PATCH 0/4] blk-mq: support to use hw tag for scheduling
  2017-05-03  4:03   ` Ming Lei
@ 2017-05-03 14:08     ` Jens Axboe
  2017-05-03 14:10       ` Jens Axboe
  0 siblings, 1 reply; 57+ messages in thread
From: Jens Axboe @ 2017-05-03 14:08 UTC (permalink / raw)
  To: Ming Lei; +Cc: linux-block, Christoph Hellwig, Omar Sandoval

On 05/02/2017 10:03 PM, Ming Lei wrote:
> On Fri, Apr 28, 2017 at 02:29:18PM -0600, Jens Axboe wrote:
>> On 04/28/2017 09:15 AM, Ming Lei wrote:
>>> Hi,
>>>
>>> This patchset introduces flag of BLK_MQ_F_SCHED_USE_HW_TAG and
>>> allows to use hardware tag directly for IO scheduling if the queue's
>>> depth is big enough. In this way, we can avoid to allocate extra tags
>>> and request pool for IO schedule, and the schedule tag allocation/release
>>> can be saved in I/O submit path.
>>
>> Ming, I like this approach, it's pretty clean. It'd be nice to have a
>> bit of performance data to back up that it's useful to add this code,
>> though.  Have you run anything on eg kyber on nvme that shows a
>> reduction in overhead when getting rid of separate scheduler tags?
> 
> I can observe small improvement in the following tests:
> 
> 1) fio script
> # io scheduler: kyber
> 
> RWS="randread read randwrite write"
> for RW in $RWS; do
>         echo "Running test $RW"
>         sudo echo 3 > /proc/sys/vm/drop_caches
>         sudo fio --direct=1 --size=128G --bsrange=4k-4k --runtime=20 --numjobs=1 --ioengine=libaio --iodepth=10240 --group_reporting=1 --filename=$DISK --name=$DISK-test-$RW --rw=$RW --output-format=json
> done
> 
> 2) results
> 
> ---------------------------------------------------------
> 			|sched tag(iops/lat)	| use hw tag to sched(iops/lat)
> ----------------------------------------------------------
> randread	|188940/54107			| 193865/52734
> ----------------------------------------------------------
> read		|192646/53069			| 199738/51188
> ----------------------------------------------------------
> randwrite	|171048/59777			| 179038/57112
> ----------------------------------------------------------
> write		|171886/59492			| 181029/56491
> ----------------------------------------------------------
> 
> I guess it may be a bit more obvious when running the test on one slow
> NVMe device, and will try to find one and run the test again.

Thanks for running that. As I said in my original reply, I think this
is a good optimization, and the implementation is clean. I'm fine with
the current limitations of when to enable it, and it's not like we
can't extend this later, if we want.

I do agree with Bart that patch 1+4 should be combined. I'll do that.

-- 
Jens Axboe


* Re: [PATCH 0/4] blk-mq: support to use hw tag for scheduling
  2017-05-03 14:08     ` Jens Axboe
@ 2017-05-03 14:10       ` Jens Axboe
  2017-05-03 15:03         ` Ming Lei
  0 siblings, 1 reply; 57+ messages in thread
From: Jens Axboe @ 2017-05-03 14:10 UTC (permalink / raw)
  To: Ming Lei; +Cc: linux-block, Christoph Hellwig, Omar Sandoval

On 05/03/2017 08:08 AM, Jens Axboe wrote:
> On 05/02/2017 10:03 PM, Ming Lei wrote:
>> On Fri, Apr 28, 2017 at 02:29:18PM -0600, Jens Axboe wrote:
>>> On 04/28/2017 09:15 AM, Ming Lei wrote:
>>>> Hi,
>>>>
>>>> This patchset introduces flag of BLK_MQ_F_SCHED_USE_HW_TAG and
>>>> allows to use hardware tag directly for IO scheduling if the queue's
>>>> depth is big enough. In this way, we can avoid to allocate extra tags
>>>> and request pool for IO schedule, and the schedule tag allocation/release
>>>> can be saved in I/O submit path.
>>>
>>> Ming, I like this approach, it's pretty clean. It'd be nice to have a
>>> bit of performance data to back up that it's useful to add this code,
>>> though.  Have you run anything on eg kyber on nvme that shows a
>>> reduction in overhead when getting rid of separate scheduler tags?
>>
>> I can observe small improvement in the following tests:
>>
>> 1) fio script
>> # io scheduler: kyber
>>
>> RWS="randread read randwrite write"
>> for RW in $RWS; do
>>         echo "Running test $RW"
>>         sudo echo 3 > /proc/sys/vm/drop_caches
>>         sudo fio --direct=1 --size=128G --bsrange=4k-4k --runtime=20 --numjobs=1 --ioengine=libaio --iodepth=10240 --group_reporting=1 --filename=$DISK --name=$DISK-test-$RW --rw=$RW --output-format=json
>> done
>>
>> 2) results
>>
>> ---------------------------------------------------------
>> 			|sched tag(iops/lat)	| use hw tag to sched(iops/lat)
>> ----------------------------------------------------------
>> randread	|188940/54107			| 193865/52734
>> ----------------------------------------------------------
>> read		|192646/53069			| 199738/51188
>> ----------------------------------------------------------
>> randwrite	|171048/59777			| 179038/57112
>> ----------------------------------------------------------
>> write		|171886/59492			| 181029/56491
>> ----------------------------------------------------------
>>
>> I guess it may be a bit more obvious when running the test on one slow
>> NVMe device, and will try to find one and run the test again.
> 
> Thanks for running that. As I said in my original reply, I think this
> is a good optimization, and the implementation is clean. I'm fine with
> the current limitations of when to enable it, and it's not like we
> can't extend this later, if we want.
> 
> I do agree with Bart that patch 1+4 should be combined. I'll do that.

Actually, can you do that when reposting? Looks like you needed to
do that anyway.

-- 
Jens Axboe


* Re: [PATCH 0/4] blk-mq: support to use hw tag for scheduling
  2017-05-03 14:10       ` Jens Axboe
@ 2017-05-03 15:03         ` Ming Lei
  2017-05-03 15:08           ` Jens Axboe
  0 siblings, 1 reply; 57+ messages in thread
From: Ming Lei @ 2017-05-03 15:03 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block, Christoph Hellwig, Omar Sandoval

On Wed, May 03, 2017 at 08:10:58AM -0600, Jens Axboe wrote:
> On 05/03/2017 08:08 AM, Jens Axboe wrote:
> > On 05/02/2017 10:03 PM, Ming Lei wrote:
> >> On Fri, Apr 28, 2017 at 02:29:18PM -0600, Jens Axboe wrote:
> >>> On 04/28/2017 09:15 AM, Ming Lei wrote:
> >>>> Hi,
> >>>>
> >>>> This patchset introduces flag of BLK_MQ_F_SCHED_USE_HW_TAG and
> >>>> allows to use hardware tag directly for IO scheduling if the queue's
> >>>> depth is big enough. In this way, we can avoid to allocate extra tags
> >>>> and request pool for IO schedule, and the schedule tag allocation/release
> >>>> can be saved in I/O submit path.
> >>>
> >>> Ming, I like this approach, it's pretty clean. It'd be nice to have a
> >>> bit of performance data to back up that it's useful to add this code,
> >>> though.  Have you run anything on eg kyber on nvme that shows a
> >>> reduction in overhead when getting rid of separate scheduler tags?
> >>
> >> I can observe small improvement in the following tests:
> >>
> >> 1) fio script
> >> # io scheduler: kyber
> >>
> >> RWS="randread read randwrite write"
> >> for RW in $RWS; do
> >>         echo "Running test $RW"
> >>         sudo echo 3 > /proc/sys/vm/drop_caches
> >>         sudo fio --direct=1 --size=128G --bsrange=4k-4k --runtime=20 --numjobs=1 --ioengine=libaio --iodepth=10240 --group_reporting=1 --filename=$DISK --name=$DISK-test-$RW --rw=$RW --output-format=json
> >> done
> >>
> >> 2) results
> >>
> >> ---------------------------------------------------------
> >> 			|sched tag(iops/lat)	| use hw tag to sched(iops/lat)
> >> ----------------------------------------------------------
> >> randread	|188940/54107			| 193865/52734
> >> ----------------------------------------------------------
> >> read		|192646/53069			| 199738/51188
> >> ----------------------------------------------------------
> >> randwrite	|171048/59777			| 179038/57112
> >> ----------------------------------------------------------
> >> write		|171886/59492			| 181029/56491
> >> ----------------------------------------------------------
> >>
> >> I guess it may be a bit more obvious when running the test on one slow
> >> NVMe device, and will try to find one and run the test again.
> > 
> > Thanks for running that. As I said in my original reply, I think this
> > is a good optimization, and the implementation is clean. I'm fine with
> > the current limitations of when to enable it, and it's not like we
> > can't extend this later, if we want.
> > 
> > I do agree with Bart that patch 1+4 should be combined. I'll do that.
> 
> Actually, can you do that when reposting? Looks like you needed to
> do that anyway.

Yeah, I will do that in V1.

Thanks,
Ming


* Re: [PATCH 0/4] blk-mq: support to use hw tag for scheduling
  2017-05-03 15:03         ` Ming Lei
@ 2017-05-03 15:08           ` Jens Axboe
  2017-05-03 15:38             ` Ming Lei
  0 siblings, 1 reply; 57+ messages in thread
From: Jens Axboe @ 2017-05-03 15:08 UTC (permalink / raw)
  To: Ming Lei; +Cc: linux-block, Christoph Hellwig, Omar Sandoval

On 05/03/2017 09:03 AM, Ming Lei wrote:
> On Wed, May 03, 2017 at 08:10:58AM -0600, Jens Axboe wrote:
>> On 05/03/2017 08:08 AM, Jens Axboe wrote:
>>> On 05/02/2017 10:03 PM, Ming Lei wrote:
>>>> On Fri, Apr 28, 2017 at 02:29:18PM -0600, Jens Axboe wrote:
>>>>> On 04/28/2017 09:15 AM, Ming Lei wrote:
>>>>>> Hi,
>>>>>>
>>>>>> This patchset introduces flag of BLK_MQ_F_SCHED_USE_HW_TAG and
>>>>>> allows to use hardware tag directly for IO scheduling if the queue's
>>>>>> depth is big enough. In this way, we can avoid to allocate extra tags
>>>>>> and request pool for IO schedule, and the schedule tag allocation/release
>>>>>> can be saved in I/O submit path.
>>>>>
>>>>> Ming, I like this approach, it's pretty clean. It'd be nice to have a
>>>>> bit of performance data to back up that it's useful to add this code,
>>>>> though.  Have you run anything on eg kyber on nvme that shows a
>>>>> reduction in overhead when getting rid of separate scheduler tags?
>>>>
>>>> I can observe small improvement in the following tests:
>>>>
>>>> 1) fio script
>>>> # io scheduler: kyber
>>>>
>>>> RWS="randread read randwrite write"
>>>> for RW in $RWS; do
>>>>         echo "Running test $RW"
>>>>         sudo echo 3 > /proc/sys/vm/drop_caches
>>>>         sudo fio --direct=1 --size=128G --bsrange=4k-4k --runtime=20 --numjobs=1 --ioengine=libaio --iodepth=10240 --group_reporting=1 --filename=$DISK --name=$DISK-test-$RW --rw=$RW --output-format=json
>>>> done
>>>>
>>>> 2) results
>>>>
>>>> ---------------------------------------------------------
>>>> 			|sched tag(iops/lat)	| use hw tag to sched(iops/lat)
>>>> ----------------------------------------------------------
>>>> randread	|188940/54107			| 193865/52734
>>>> ----------------------------------------------------------
>>>> read		|192646/53069			| 199738/51188
>>>> ----------------------------------------------------------
>>>> randwrite	|171048/59777			| 179038/57112
>>>> ----------------------------------------------------------
>>>> write		|171886/59492			| 181029/56491
>>>> ----------------------------------------------------------
>>>>
>>>> I guess it may be a bit more obvious when running the test on one slow
>>>> NVMe device, and will try to find one and run the test again.
>>>
>>> Thanks for running that. As I said in my original reply, I think this
>>> is a good optimization, and the implementation is clean. I'm fine with
>>> the current limitations of when to enable it, and it's not like we
>>> can't extend this later, if we want.
>>>
>>> I do agree with Bart that patch 1+4 should be combined. I'll do that.
>>
>> Actually, can you do that when reposting? Looks like you needed to
>> do that anyway.
> 
> Yeah, I will do that in V1.

V2? :-)

Sounds good. I just wanted to check the numbers here, but with the series
applied on top of for-linus it crashes when switching to kyber. A few hunks
threw fuzz, but it looked fine to me. But I bet I fat fingered
something.  So it'd be great if you could respin against my for-linus
branch.

-- 
Jens Axboe


* Re: [PATCH 0/4] blk-mq: support to use hw tag for scheduling
  2017-05-03 15:08           ` Jens Axboe
@ 2017-05-03 15:38             ` Ming Lei
  2017-05-03 16:06               ` Omar Sandoval
  2017-05-03 16:52               ` Ming Lei
  0 siblings, 2 replies; 57+ messages in thread
From: Ming Lei @ 2017-05-03 15:38 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block, Christoph Hellwig, Omar Sandoval

On Wed, May 03, 2017 at 09:08:34AM -0600, Jens Axboe wrote:
> On 05/03/2017 09:03 AM, Ming Lei wrote:
> > On Wed, May 03, 2017 at 08:10:58AM -0600, Jens Axboe wrote:
> >> On 05/03/2017 08:08 AM, Jens Axboe wrote:
> >>> On 05/02/2017 10:03 PM, Ming Lei wrote:
> >>>> On Fri, Apr 28, 2017 at 02:29:18PM -0600, Jens Axboe wrote:
> >>>>> On 04/28/2017 09:15 AM, Ming Lei wrote:
> >>>>>> Hi,
> >>>>>>
> >>>>>> This patchset introduces flag of BLK_MQ_F_SCHED_USE_HW_TAG and
> >>>>>> allows to use hardware tag directly for IO scheduling if the queue's
> >>>>>> depth is big enough. In this way, we can avoid to allocate extra tags
> >>>>>> and request pool for IO schedule, and the schedule tag allocation/release
> >>>>>> can be saved in I/O submit path.
> >>>>>
> >>>>> Ming, I like this approach, it's pretty clean. It'd be nice to have a
> >>>>> bit of performance data to back up that it's useful to add this code,
> >>>>> though.  Have you run anything on eg kyber on nvme that shows a
> >>>>> reduction in overhead when getting rid of separate scheduler tags?
> >>>>
> >>>> I can observe small improvement in the following tests:
> >>>>
> >>>> 1) fio script
> >>>> # io scheduler: kyber
> >>>>
> >>>> RWS="randread read randwrite write"
> >>>> for RW in $RWS; do
> >>>>         echo "Running test $RW"
> >>>>         sudo echo 3 > /proc/sys/vm/drop_caches
> >>>>         sudo fio --direct=1 --size=128G --bsrange=4k-4k --runtime=20 --numjobs=1 --ioengine=libaio --iodepth=10240 --group_reporting=1 --filename=$DISK --name=$DISK-test-$RW --rw=$RW --output-format=json
> >>>> done
> >>>>
> >>>> 2) results
> >>>>
> >>>> ---------------------------------------------------------
> >>>> 			|sched tag(iops/lat)	| use hw tag to sched(iops/lat)
> >>>> ----------------------------------------------------------
> >>>> randread	|188940/54107			| 193865/52734
> >>>> ----------------------------------------------------------
> >>>> read		|192646/53069			| 199738/51188
> >>>> ----------------------------------------------------------
> >>>> randwrite	|171048/59777			| 179038/57112
> >>>> ----------------------------------------------------------
> >>>> write		|171886/59492			| 181029/56491
> >>>> ----------------------------------------------------------
> >>>>
> >>>> I guess it may be a bit more obvious when running the test on one slow
> >>>> NVMe device, and will try to find one and run the test again.
> >>>
> >>> Thanks for running that. As I said in my original reply, I think this
> >>> is a good optimization, and the implementation is clean. I'm fine with
> >>> the current limitations of when to enable it, and it's not like we
> >>> can't extend this later, if we want.
> >>>
> >>> I do agree with Bart that patch 1+4 should be combined. I'll do that.
> >>
> >> Actually, can you do that when reposting? Looks like you needed to
> >> do that anyway.
> > 
> > Yeah, I will do that in V1.
> 
> V2? :-)
> 
> Sounds good. I just wanted to check the numbers here, with the series
> applied on top of for-linus crashes when switching to kyber. A few hunks

Yeah, I saw that too, it has been fixed in my local tree, :-)

> threw fuzz, but it looked fine to me. But I bet I fat fingered
> something.  So it'd be great if you could respin against my for-linus
> branch.

Actually, that is exactly what I am doing, :-)

Thanks,
Ming


* Re: [PATCH 0/4] blk-mq: support to use hw tag for scheduling
  2017-05-03 15:38             ` Ming Lei
@ 2017-05-03 16:06               ` Omar Sandoval
  2017-05-03 16:21                 ` Ming Lei
  2017-05-03 16:52               ` Ming Lei
  1 sibling, 1 reply; 57+ messages in thread
From: Omar Sandoval @ 2017-05-03 16:06 UTC (permalink / raw)
  To: Ming Lei; +Cc: Jens Axboe, linux-block, Christoph Hellwig, Omar Sandoval

On Wed, May 03, 2017 at 11:38:09PM +0800, Ming Lei wrote:
> On Wed, May 03, 2017 at 09:08:34AM -0600, Jens Axboe wrote:
> > Sounds good. I just wanted to check the numbers here, with the series
> > applied on top of for-linus crashes when switching to kyber. A few hunks
> 
> Yeah, I saw that too, it has been fixed in my local tree, :-)

I'm guessing that was this?

static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd)
{
	/*
	 * All of the hardware queues have the same depth, so we can just grab
	 * the shift of the first one.
	 */
	return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
}

If not, that'll need to be fixed up.

> > threw fuzz, but it looked fine to me. But I bet I fat fingered
> > something.  So it'd be great if you could respin against my for-linus
> > branch.
> 
> Actually, that is exactly what I am doing, :-)
> 
> Thanks,
> Ming

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 0/4] blk-mq: support to use hw tag for scheduling
  2017-05-03 16:06               ` Omar Sandoval
@ 2017-05-03 16:21                 ` Ming Lei
  0 siblings, 0 replies; 57+ messages in thread
From: Ming Lei @ 2017-05-03 16:21 UTC (permalink / raw)
  To: Omar Sandoval; +Cc: Jens Axboe, linux-block, Christoph Hellwig, Omar Sandoval

On Wed, May 03, 2017 at 09:06:27AM -0700, Omar Sandoval wrote:
> On Wed, May 03, 2017 at 11:38:09PM +0800, Ming Lei wrote:
> > On Wed, May 03, 2017 at 09:08:34AM -0600, Jens Axboe wrote:
> > > Sounds good. I just wanted to check the numbers here, with the series
> > > applied on top of for-linus crashes when switching to kyber. A few hunks
> > 
> > Yeah, I saw that too, it has been fixed in my local tree, :-)
> 
> I'm guessing that was this?
> 
> static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd)
> {
> 	/*
> 	 * All of the hardware queues have the same depth, so we can just grab
> 	 * the shift of the first one.
> 	 */
> 	return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
> }

Yes, that is it, :-)

Now we need to check .sched_tags here.
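
Something along these lines, for example (an untested sketch; the NULL
check and the fallback to hctx->tags are only meant to illustrate the
idea, not the exact fix in my tree):

static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd)
{
	struct blk_mq_hw_ctx *hctx = kqd->q->queue_hw_ctx[0];

	/*
	 * All of the hardware queues have the same depth, so we can just
	 * grab the shift of the first one. When hw tags are used for
	 * scheduling, sched_tags is NULL, so fall back to the driver tags.
	 */
	if (hctx->sched_tags)
		return hctx->sched_tags->bitmap_tags.sb.shift;
	return hctx->tags->bitmap_tags.sb.shift;
}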

Thanks,
Ming

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 1/4] blk-mq: introduce BLK_MQ_F_SCHED_USE_HW_TAG
  2017-04-28 15:15 ` [PATCH 1/4] blk-mq: introduce BLK_MQ_F_SCHED_USE_HW_TAG Ming Lei
@ 2017-05-03 16:21   ` Omar Sandoval
  2017-05-03 16:46   ` Omar Sandoval
  1 sibling, 0 replies; 57+ messages in thread
From: Omar Sandoval @ 2017-05-03 16:21 UTC (permalink / raw)
  To: Ming Lei; +Cc: Jens Axboe, linux-block, Christoph Hellwig, Omar Sandoval

On Fri, Apr 28, 2017 at 11:15:36PM +0800, Ming Lei wrote:
> When blk-mq I/O scheduler is used, we need two tags for
> submitting one request. One is called scheduler tag for
> allocating request and scheduling I/O, another one is called
> driver tag, which is used for dispatching IO to hardware/driver.
> This way introduces one extra per-queue allocation for both tags
> and request pool, and may not be as efficient as case of none
> scheduler.
> 
> Also currently we put a default per-hctx limit on schedulable
> requests, and this limit may be a bottleneck for some devices,
> especialy when these devices have a quite big tag space.
> 
> This patch introduces BLK_MQ_F_SCHED_USE_HW_TAG so that we can
> allow to use hardware/driver tags directly for IO scheduling if
> devices's hardware tag space is big enough. Then we can avoid
> the extra resource allocation and make IO submission more
> efficient.
> 
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
>  block/blk-mq-sched.c   | 10 +++++++++-
>  block/blk-mq.c         | 35 +++++++++++++++++++++++++++++------
>  include/linux/blk-mq.h |  1 +
>  3 files changed, 39 insertions(+), 7 deletions(-)
> 
> diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> index 27c67465f856..45a675f07b8b 100644
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 0168b27469cb..e530bc54f0d9 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -263,9 +263,19 @@ struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data,
>  				rq->rq_flags = RQF_MQ_INFLIGHT;
>  				atomic_inc(&data->hctx->nr_active);
>  			}
> -			rq->tag = tag;
> -			rq->internal_tag = -1;
> -			data->hctx->tags->rqs[rq->tag] = rq;
> +			data->hctx->tags->rqs[tag] = rq;
> +
> +			/*
> +			 * If we use hw tag for scheduling, postpone setting
> +			 * rq->tag in blk_mq_get_driver_tag().
> +			 */
> +			if (data->hctx->flags & BLK_MQ_F_SCHED_USE_HW_TAG) {
> +				rq->tag = -1;
> +				rq->internal_tag = tag;
> +			} else {
> +				rq->tag = tag;
> +				rq->internal_tag = -1;
> +			}

I'm guessing you did it this way because we currently check rq->tag to
decide whether this is a flush that needs to be bypassed? Makes sense,
but I'm adding it to my list of reasons why the flush stuff sucks.

> @@ -893,9 +909,15 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
>  static void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
>  				    struct request *rq)
>  {
> -	blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag);
> +	unsigned tag = rq->tag;

The pickiest of all nits, but we mostly spell out `unsigned int` in this
file, it'd be nice to stay consistent.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 3/4] blk-mq: use hw tag for scheduling if hw tag space is big enough
  2017-04-28 15:15 ` [PATCH 3/4] blk-mq: use hw tag for scheduling if hw tag space is big enough Ming Lei
  2017-04-28 18:09   ` Bart Van Assche
  2017-04-28 18:22   ` Jens Axboe
@ 2017-05-03 16:29   ` Omar Sandoval
  2017-05-03 16:55     ` Ming Lei
  2 siblings, 1 reply; 57+ messages in thread
From: Omar Sandoval @ 2017-05-03 16:29 UTC (permalink / raw)
  To: Ming Lei; +Cc: Jens Axboe, linux-block, Christoph Hellwig, Omar Sandoval

On Fri, Apr 28, 2017 at 11:15:38PM +0800, Ming Lei wrote:
> When tag space of one device is big enough, we use hw tag
> directly for I/O scheduling.
> 
> Now the decision is made if hw queue depth is not less than
> q->nr_requests and the tag set isn't shared.
> 
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
>  block/blk-mq-sched.c |  8 ++++++++
>  block/blk-mq-sched.h | 15 +++++++++++++++
>  block/blk-mq.c       | 18 +++++++++++++++++-
>  3 files changed, 40 insertions(+), 1 deletion(-)
> 
> diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> index 45a675f07b8b..4681e27c127e 100644
> --- a/block/blk-mq-sched.c
> +++ b/block/blk-mq-sched.c
> @@ -507,6 +507,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
>  	struct elevator_queue *eq;
>  	unsigned int i;
>  	int ret;
> +	bool auto_hw_tag;
>  
>  	if (!e) {
>  		q->elevator = NULL;
> @@ -519,7 +520,14 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
>  	 */
>  	q->nr_requests = 2 * BLKDEV_MAX_RQ;
>  
> +	auto_hw_tag = blk_mq_sched_may_use_hw_tag(q);
> +
>  	queue_for_each_hw_ctx(q, hctx, i) {
> +		if (auto_hw_tag)
> +			hctx->flags |= BLK_MQ_F_SCHED_USE_HW_TAG;
> +		else
> +			hctx->flags &= ~BLK_MQ_F_SCHED_USE_HW_TAG;
> +
>  		ret = blk_mq_sched_alloc_tags(q, hctx, i);
>  		if (ret)
>  			goto err;

I think you should also clear the BLK_MQ_F_SCHED_USE_HW_TAG flag in
blk_mq_exit_sched()?

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 1/4] blk-mq: introduce BLK_MQ_F_SCHED_USE_HW_TAG
  2017-04-28 15:15 ` [PATCH 1/4] blk-mq: introduce BLK_MQ_F_SCHED_USE_HW_TAG Ming Lei
  2017-05-03 16:21   ` Omar Sandoval
@ 2017-05-03 16:46   ` Omar Sandoval
  2017-05-03 20:13     ` Ming Lei
  1 sibling, 1 reply; 57+ messages in thread
From: Omar Sandoval @ 2017-05-03 16:46 UTC (permalink / raw)
  To: Ming Lei; +Cc: Jens Axboe, linux-block, Christoph Hellwig, Omar Sandoval

On Fri, Apr 28, 2017 at 11:15:36PM +0800, Ming Lei wrote:
> When blk-mq I/O scheduler is used, we need two tags for
> submitting one request. One is called scheduler tag for
> allocating request and scheduling I/O, another one is called
> driver tag, which is used for dispatching IO to hardware/driver.
> This way introduces one extra per-queue allocation for both tags
> and request pool, and may not be as efficient as case of none
> scheduler.
> 
> Also currently we put a default per-hctx limit on schedulable
> requests, and this limit may be a bottleneck for some devices,
> especialy when these devices have a quite big tag space.
> 
> This patch introduces BLK_MQ_F_SCHED_USE_HW_TAG so that we can
> allow to use hardware/driver tags directly for IO scheduling if
> devices's hardware tag space is big enough. Then we can avoid
> the extra resource allocation and make IO submission more
> efficient.
> 
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
>  block/blk-mq-sched.c   | 10 +++++++++-
>  block/blk-mq.c         | 35 +++++++++++++++++++++++++++++------
>  include/linux/blk-mq.h |  1 +
>  3 files changed, 39 insertions(+), 7 deletions(-)

One more note on this: if we're using the hardware tags directly, then
we are no longer limited to q->nr_requests requests in-flight. Instead,
we're limited to the hw queue depth. We probably want to maintain the
original behavior, so I think we need to resize the hw tags in
blk_mq_init_sched() if we're using hardware tags.
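
I.e., something like the below in blk_mq_init_sched() when the flag is
set (rough sketch; I'm assuming the existing resize helper
blk_mq_tag_update_depth() can be pointed at hctx->tags here, which
would need double-checking):

	queue_for_each_hw_ctx(q, hctx, i) {
		if (!(hctx->flags & BLK_MQ_F_SCHED_USE_HW_TAG))
			continue;
		/* keep the usual q->nr_requests limit when sched uses hw tags */
		ret = blk_mq_tag_update_depth(hctx, &hctx->tags,
					      q->nr_requests, false);
		if (ret)
			goto err;
	}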

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 0/4] blk-mq: support to use hw tag for scheduling
  2017-05-03 15:38             ` Ming Lei
  2017-05-03 16:06               ` Omar Sandoval
@ 2017-05-03 16:52               ` Ming Lei
  2017-05-03 17:03                 ` Ming Lei
  2017-05-03 17:08                 ` Bart Van Assche
  1 sibling, 2 replies; 57+ messages in thread
From: Ming Lei @ 2017-05-03 16:52 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block, Christoph Hellwig, Omar Sandoval

On Wed, May 03, 2017 at 11:38:09PM +0800, Ming Lei wrote:
> On Wed, May 03, 2017 at 09:08:34AM -0600, Jens Axboe wrote:
> > On 05/03/2017 09:03 AM, Ming Lei wrote:
> > > On Wed, May 03, 2017 at 08:10:58AM -0600, Jens Axboe wrote:
> > >> On 05/03/2017 08:08 AM, Jens Axboe wrote:
> > >>> On 05/02/2017 10:03 PM, Ming Lei wrote:
> > >>>> On Fri, Apr 28, 2017 at 02:29:18PM -0600, Jens Axboe wrote:
> > >>>>> On 04/28/2017 09:15 AM, Ming Lei wrote:
> > >>>>>> Hi,
> > >>>>>>
> > >>>>>> This patchset introduces flag of BLK_MQ_F_SCHED_USE_HW_TAG and
> > >>>>>> allows to use hardware tag directly for IO scheduling if the queue's
> > >>>>>> depth is big enough. In this way, we can avoid to allocate extra tags
> > >>>>>> and request pool for IO schedule, and the schedule tag allocation/release
> > >>>>>> can be saved in I/O submit path.
> > >>>>>
> > >>>>> Ming, I like this approach, it's pretty clean. It'd be nice to have a
> > >>>>> bit of performance data to back up that it's useful to add this code,
> > >>>>> though.  Have you run anything on eg kyber on nvme that shows a
> > >>>>> reduction in overhead when getting rid of separate scheduler tags?
> > >>>>
> > >>>> I can observe small improvement in the following tests:
> > >>>>
> > >>>> 1) fio script
> > >>>> # io scheduler: kyber
> > >>>>
> > >>>> RWS="randread read randwrite write"
> > >>>> for RW in $RWS; do
> > >>>>         echo "Running test $RW"
> > >>>>         sudo echo 3 > /proc/sys/vm/drop_caches
> > >>>>         sudo fio --direct=1 --size=128G --bsrange=4k-4k --runtime=20 --numjobs=1 --ioengine=libaio --iodepth=10240 --group_reporting=1 --filename=$DISK --name=$DISK-test-$RW --rw=$RW --output-format=json
> > >>>> done
> > >>>>
> > >>>> 2) results
> > >>>>
> > >>>> ---------------------------------------------------------
> > >>>> 			|sched tag(iops/lat)	| use hw tag to sched(iops/lat)
> > >>>> ----------------------------------------------------------
> > >>>> randread	|188940/54107			| 193865/52734
> > >>>> ----------------------------------------------------------
> > >>>> read		|192646/53069			| 199738/51188
> > >>>> ----------------------------------------------------------
> > >>>> randwrite	|171048/59777			| 179038/57112
> > >>>> ----------------------------------------------------------
> > >>>> write		|171886/59492			| 181029/56491
> > >>>> ----------------------------------------------------------
> > >>>>
> > >>>> I guess it may be a bit more obvious when running the test on one slow
> > >>>> NVMe device, and will try to find one and run the test again.
> > >>>
> > >>> Thanks for running that. As I said in my original reply, I think this
> > >>> is a good optimization, and the implementation is clean. I'm fine with
> > >>> the current limitations of when to enable it, and it's not like we
> > >>> can't extend this later, if we want.
> > >>>
> > >>> I do agree with Bart that patch 1+4 should be combined. I'll do that.
> > >>
> > >> Actually, can you do that when reposting? Looks like you needed to
> > >> do that anyway.
> > > 
> > > Yeah, I will do that in V1.
> > 
> > V2? :-)
> > 
> > Sounds good. I just wanted to check the numbers here, with the series
> > applied on top of for-linus crashes when switching to kyber. A few hunks
> 
> Yeah, I saw that too, it has been fixed in my local tree, :-)
> 
> > threw fuzz, but it looked fine to me. But I bet I fat fingered
> > something.  So it'd be great if you could respin against my for-linus
> > branch.
> 
> Actually, that is exactly what I am doing, :-)

Looks v4.11 plus your for-linus often triggers the following hang during
boot, and it seems caused by the change in (blk-mq: unify hctx delayed_run_work
and run_work)


BUG: scheduling while atomic: kworker/0:1H/704/0x00000002
Modules linked in:
Preemption disabled at:
[<ffffffffaf5607bb>] virtio_queue_rq+0xdb/0x350
CPU: 0 PID: 704 Comm: kworker/0:1H Not tainted 4.11.0-04508-ga1f35f46164b #132
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.9.3-1.fc25 04/01/2014
Workqueue: kblockd blk_mq_run_work_fn
Call Trace:
 dump_stack+0x65/0x8f
 ? virtio_queue_rq+0xdb/0x350
 __schedule_bug+0x76/0xc0
 __schedule+0x610/0x820
 ? new_slab+0x2c9/0x590
 schedule+0x40/0x90
 schedule_timeout+0x273/0x320
 ? ___slab_alloc+0x3cb/0x4f0
 wait_for_completion+0x97/0x100
 ? wait_for_completion+0x97/0x100
 ? wake_up_q+0x80/0x80
 flush_work+0x104/0x1a0
 ? flush_workqueue_prep_pwqs+0x130/0x130
 __cancel_work_timer+0xeb/0x160
 ? vp_notify+0x16/0x20
 ? virtqueue_add_sgs+0x23c/0x4a0
 cancel_delayed_work_sync+0x13/0x20
 blk_mq_stop_hw_queue+0x16/0x20
 virtio_queue_rq+0x316/0x350
 blk_mq_dispatch_rq_list+0x194/0x350
 blk_mq_sched_dispatch_requests+0x118/0x170
 ? finish_task_switch+0x80/0x1e0
 __blk_mq_run_hw_queue+0xa3/0xc0
 blk_mq_run_work_fn+0x2c/0x30
 process_one_work+0x1e0/0x400
 worker_thread+0x48/0x3f0
 kthread+0x109/0x140
 ? process_one_work+0x400/0x400
 ? kthread_create_on_node+0x40/0x40
 ret_from_fork+0x2c/0x40


Thanks,
Ming

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 2/4] blk-mq: introduce blk_mq_get_queue_depth()
  2017-04-28 15:15 ` [PATCH 2/4] blk-mq: introduce blk_mq_get_queue_depth() Ming Lei
  2017-04-28 18:23   ` Jens Axboe
@ 2017-05-03 16:55   ` Omar Sandoval
  2017-05-04  2:10     ` Ming Lei
  1 sibling, 1 reply; 57+ messages in thread
From: Omar Sandoval @ 2017-05-03 16:55 UTC (permalink / raw)
  To: Ming Lei; +Cc: Jens Axboe, linux-block, Christoph Hellwig, Omar Sandoval

On Fri, Apr 28, 2017 at 11:15:37PM +0800, Ming Lei wrote:
> The hardware queue depth can be resized via blk_mq_update_nr_requests(),
> so introduce this helper for retrieving queue's depth easily.

One nit below. If the per-hardware queue tag space situation changes, we
can revisit this and the similar thing in Kyber.

Reviewed-by: Omar Sandoval <osandov@fb.com>

> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
>  block/blk-mq.c | 12 ++++++++++++
>  block/blk-mq.h |  1 +
>  2 files changed, 13 insertions(+)
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index e530bc54f0d9..04761fb76ab4 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -2637,6 +2637,18 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
>  }
>  EXPORT_SYMBOL(blk_mq_free_tag_set);
>  
> +/*
> + * Queue depth can be changed via blk_mq_update_nr_requests(),
> + * so use this helper to retrieve queue's depth.
> + */
> +int blk_mq_get_queue_depth(struct request_queue *q)
> +{
> +	/* All queues have same queue depth */
> +	struct blk_mq_tags	*tags = q->tag_set->tags[0];

Not sure what's going on with the spacing here. Tab instead of a space?

> +	return tags->bitmap_tags.sb.depth;
> +}
> +
>  int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
>  {
>  	struct blk_mq_tag_set *set = q->tag_set;
> diff --git a/block/blk-mq.h b/block/blk-mq.h
> index 2814a14e529c..8085d5989cf5 100644
> --- a/block/blk-mq.h
> +++ b/block/blk-mq.h
> @@ -166,6 +166,7 @@ void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
>  void blk_mq_finish_request(struct request *rq);
>  struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data,
>  					unsigned int op);
> +int blk_mq_get_queue_depth(struct request_queue *q);
>  
>  static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx)
>  {
> -- 
> 2.9.3
> 

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 3/4] blk-mq: use hw tag for scheduling if hw tag space is big enough
  2017-05-03 16:29   ` Omar Sandoval
@ 2017-05-03 16:55     ` Ming Lei
  2017-05-03 17:00       ` Omar Sandoval
  0 siblings, 1 reply; 57+ messages in thread
From: Ming Lei @ 2017-05-03 16:55 UTC (permalink / raw)
  To: Omar Sandoval; +Cc: Jens Axboe, linux-block, Christoph Hellwig, Omar Sandoval

On Wed, May 03, 2017 at 09:29:36AM -0700, Omar Sandoval wrote:
> On Fri, Apr 28, 2017 at 11:15:38PM +0800, Ming Lei wrote:
> > When tag space of one device is big enough, we use hw tag
> > directly for I/O scheduling.
> > 
> > Now the decision is made if hw queue depth is not less than
> > q->nr_requests and the tag set isn't shared.
> > 
> > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> > ---
> >  block/blk-mq-sched.c |  8 ++++++++
> >  block/blk-mq-sched.h | 15 +++++++++++++++
> >  block/blk-mq.c       | 18 +++++++++++++++++-
> >  3 files changed, 40 insertions(+), 1 deletion(-)
> > 
> > diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> > index 45a675f07b8b..4681e27c127e 100644
> > --- a/block/blk-mq-sched.c
> > +++ b/block/blk-mq-sched.c
> > @@ -507,6 +507,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
> >  	struct elevator_queue *eq;
> >  	unsigned int i;
> >  	int ret;
> > +	bool auto_hw_tag;
> >  
> >  	if (!e) {
> >  		q->elevator = NULL;
> > @@ -519,7 +520,14 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
> >  	 */
> >  	q->nr_requests = 2 * BLKDEV_MAX_RQ;
> >  
> > +	auto_hw_tag = blk_mq_sched_may_use_hw_tag(q);
> > +
> >  	queue_for_each_hw_ctx(q, hctx, i) {
> > +		if (auto_hw_tag)
> > +			hctx->flags |= BLK_MQ_F_SCHED_USE_HW_TAG;
> > +		else
> > +			hctx->flags &= ~BLK_MQ_F_SCHED_USE_HW_TAG;
> > +
> >  		ret = blk_mq_sched_alloc_tags(q, hctx, i);
> >  		if (ret)
> >  			goto err;
> 
> I think you should also clear the BLK_MQ_F_SCHED_USE_HW_TAG flag in
> blk_mq_exit_sched()?

Looks not necessary since the flag is always evaluated in
blk_mq_init_sched().

Thanks,
Ming

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 3/4] blk-mq: use hw tag for scheduling if hw tag space is big enough
  2017-05-03 16:55     ` Ming Lei
@ 2017-05-03 17:00       ` Omar Sandoval
  2017-05-03 17:33         ` Ming Lei
  0 siblings, 1 reply; 57+ messages in thread
From: Omar Sandoval @ 2017-05-03 17:00 UTC (permalink / raw)
  To: Ming Lei; +Cc: Jens Axboe, linux-block, Christoph Hellwig, Omar Sandoval

On Thu, May 04, 2017 at 12:55:30AM +0800, Ming Lei wrote:
> On Wed, May 03, 2017 at 09:29:36AM -0700, Omar Sandoval wrote:
> > On Fri, Apr 28, 2017 at 11:15:38PM +0800, Ming Lei wrote:
> > > When tag space of one device is big enough, we use hw tag
> > > directly for I/O scheduling.
> > > 
> > > Now the decision is made if hw queue depth is not less than
> > > q->nr_requests and the tag set isn't shared.
> > > 
> > > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> > > ---
> > >  block/blk-mq-sched.c |  8 ++++++++
> > >  block/blk-mq-sched.h | 15 +++++++++++++++
> > >  block/blk-mq.c       | 18 +++++++++++++++++-
> > >  3 files changed, 40 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> > > index 45a675f07b8b..4681e27c127e 100644
> > > --- a/block/blk-mq-sched.c
> > > +++ b/block/blk-mq-sched.c
> > > @@ -507,6 +507,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
> > >  	struct elevator_queue *eq;
> > >  	unsigned int i;
> > >  	int ret;
> > > +	bool auto_hw_tag;
> > >  
> > >  	if (!e) {
> > >  		q->elevator = NULL;
> > > @@ -519,7 +520,14 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
> > >  	 */
> > >  	q->nr_requests = 2 * BLKDEV_MAX_RQ;
> > >  
> > > +	auto_hw_tag = blk_mq_sched_may_use_hw_tag(q);
> > > +
> > >  	queue_for_each_hw_ctx(q, hctx, i) {
> > > +		if (auto_hw_tag)
> > > +			hctx->flags |= BLK_MQ_F_SCHED_USE_HW_TAG;
> > > +		else
> > > +			hctx->flags &= ~BLK_MQ_F_SCHED_USE_HW_TAG;
> > > +
> > >  		ret = blk_mq_sched_alloc_tags(q, hctx, i);
> > >  		if (ret)
> > >  			goto err;
> > 
> > I think you should also clear the BLK_MQ_F_SCHED_USE_HW_TAG flag in
> > blk_mq_exit_sched()?
> 
> Looks not necessary since the flag is always evaluated in
> blk_mq_init_sched().

What if we're setting the scheduler to "none"? Then blk_mq_init_sched()
will go in here:

if (!e) {
	q->elevator = NULL;
	return 0;
}

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 0/4] blk-mq: support to use hw tag for scheduling
  2017-05-03 16:52               ` Ming Lei
@ 2017-05-03 17:03                 ` Ming Lei
  2017-05-03 17:07                   ` Jens Axboe
  2017-05-03 17:08                 ` Bart Van Assche
  1 sibling, 1 reply; 57+ messages in thread
From: Ming Lei @ 2017-05-03 17:03 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block, Christoph Hellwig, Omar Sandoval, Bart Van Assche

On Thu, May 04, 2017 at 12:52:06AM +0800, Ming Lei wrote:
> On Wed, May 03, 2017 at 11:38:09PM +0800, Ming Lei wrote:
> > On Wed, May 03, 2017 at 09:08:34AM -0600, Jens Axboe wrote:
> > > On 05/03/2017 09:03 AM, Ming Lei wrote:
> > > > On Wed, May 03, 2017 at 08:10:58AM -0600, Jens Axboe wrote:
> > > >> On 05/03/2017 08:08 AM, Jens Axboe wrote:
> > > >>> On 05/02/2017 10:03 PM, Ming Lei wrote:
> > > >>>> On Fri, Apr 28, 2017 at 02:29:18PM -0600, Jens Axboe wrote:
> > > >>>>> On 04/28/2017 09:15 AM, Ming Lei wrote:
> > > >>>>>> Hi,
> > > >>>>>>
> > > >>>>>> This patchset introduces flag of BLK_MQ_F_SCHED_USE_HW_TAG and
> > > >>>>>> allows to use hardware tag directly for IO scheduling if the queue's
> > > >>>>>> depth is big enough. In this way, we can avoid to allocate extra tags
> > > >>>>>> and request pool for IO schedule, and the schedule tag allocation/release
> > > >>>>>> can be saved in I/O submit path.
> > > >>>>>
> > > >>>>> Ming, I like this approach, it's pretty clean. It'd be nice to have a
> > > >>>>> bit of performance data to back up that it's useful to add this code,
> > > >>>>> though.  Have you run anything on eg kyber on nvme that shows a
> > > >>>>> reduction in overhead when getting rid of separate scheduler tags?
> > > >>>>
> > > >>>> I can observe small improvement in the following tests:
> > > >>>>
> > > >>>> 1) fio script
> > > >>>> # io scheduler: kyber
> > > >>>>
> > > >>>> RWS="randread read randwrite write"
> > > >>>> for RW in $RWS; do
> > > >>>>         echo "Running test $RW"
> > > >>>>         sudo echo 3 > /proc/sys/vm/drop_caches
> > > >>>>         sudo fio --direct=1 --size=128G --bsrange=4k-4k --runtime=20 --numjobs=1 --ioengine=libaio --iodepth=10240 --group_reporting=1 --filename=$DISK --name=$DISK-test-$RW --rw=$RW --output-format=json
> > > >>>> done
> > > >>>>
> > > >>>> 2) results
> > > >>>>
> > > >>>> ---------------------------------------------------------
> > > >>>> 			|sched tag(iops/lat)	| use hw tag to sched(iops/lat)
> > > >>>> ----------------------------------------------------------
> > > >>>> randread	|188940/54107			| 193865/52734
> > > >>>> ----------------------------------------------------------
> > > >>>> read		|192646/53069			| 199738/51188
> > > >>>> ----------------------------------------------------------
> > > >>>> randwrite	|171048/59777			| 179038/57112
> > > >>>> ----------------------------------------------------------
> > > >>>> write		|171886/59492			| 181029/56491
> > > >>>> ----------------------------------------------------------
> > > >>>>
> > > >>>> I guess it may be a bit more obvious when running the test on one slow
> > > >>>> NVMe device, and will try to find one and run the test again.
> > > >>>
> > > >>> Thanks for running that. As I said in my original reply, I think this
> > > >>> is a good optimization, and the implementation is clean. I'm fine with
> > > >>> the current limitations of when to enable it, and it's not like we
> > > >>> can't extend this later, if we want.
> > > >>>
> > > >>> I do agree with Bart that patch 1+4 should be combined. I'll do that.
> > > >>
> > > >> Actually, can you do that when reposting? Looks like you needed to
> > > >> do that anyway.
> > > > 
> > > > Yeah, I will do that in V1.
> > > 
> > > V2? :-)
> > > 
> > > Sounds good. I just wanted to check the numbers here, with the series
> > > applied on top of for-linus crashes when switching to kyber. A few hunks
> > 
> > Yeah, I saw that too, it has been fixed in my local tree, :-)
> > 
> > > threw fuzz, but it looked fine to me. But I bet I fat fingered
> > > something.  So it'd be great if you could respin against my for-linus
> > > branch.
> > 
> > Actually, that is exactly what I am doing, :-)
> 
> Looks v4.11 plus your for-linus often triggers the following hang during
> boot, and it seems caused by the change in (blk-mq: unify hctx delayed_run_work
> and run_work)
> 
> 
> BUG: scheduling while atomic: kworker/0:1H/704/0x00000002
> Modules linked in:
> Preemption disabled at:
> [<ffffffffaf5607bb>] virtio_queue_rq+0xdb/0x350
> CPU: 0 PID: 704 Comm: kworker/0:1H Not tainted 4.11.0-04508-ga1f35f46164b #132
> Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.9.3-1.fc25 04/01/2014
> Workqueue: kblockd blk_mq_run_work_fn
> Call Trace:
>  dump_stack+0x65/0x8f
>  ? virtio_queue_rq+0xdb/0x350
>  __schedule_bug+0x76/0xc0
>  __schedule+0x610/0x820
>  ? new_slab+0x2c9/0x590
>  schedule+0x40/0x90
>  schedule_timeout+0x273/0x320
>  ? ___slab_alloc+0x3cb/0x4f0
>  wait_for_completion+0x97/0x100
>  ? wait_for_completion+0x97/0x100
>  ? wake_up_q+0x80/0x80
>  flush_work+0x104/0x1a0
>  ? flush_workqueue_prep_pwqs+0x130/0x130
>  __cancel_work_timer+0xeb/0x160
>  ? vp_notify+0x16/0x20
>  ? virtqueue_add_sgs+0x23c/0x4a0
>  cancel_delayed_work_sync+0x13/0x20
>  blk_mq_stop_hw_queue+0x16/0x20
>  virtio_queue_rq+0x316/0x350
>  blk_mq_dispatch_rq_list+0x194/0x350
>  blk_mq_sched_dispatch_requests+0x118/0x170
>  ? finish_task_switch+0x80/0x1e0
>  __blk_mq_run_hw_queue+0xa3/0xc0
>  blk_mq_run_work_fn+0x2c/0x30
>  process_one_work+0x1e0/0x400
>  worker_thread+0x48/0x3f0
>  kthread+0x109/0x140
>  ? process_one_work+0x400/0x400
>  ? kthread_create_on_node+0x40/0x40
>  ret_from_fork+0x2c/0x40

Looks like we can't call cancel_delayed_work_sync() here.

Last time, I asked why the _sync is required, but it looks like I never
got an answer, or I might have forgotten it.

Bart, could you explain it a bit why the _sync version of
cancel work is required?

Thanks,
Ming

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 0/4] blk-mq: support to use hw tag for scheduling
  2017-05-03 17:03                 ` Ming Lei
@ 2017-05-03 17:07                   ` Jens Axboe
  2017-05-03 17:15                     ` Bart Van Assche
  0 siblings, 1 reply; 57+ messages in thread
From: Jens Axboe @ 2017-05-03 17:07 UTC (permalink / raw)
  To: Ming Lei; +Cc: linux-block, Christoph Hellwig, Omar Sandoval, Bart Van Assche

On 05/03/2017 11:03 AM, Ming Lei wrote:
> On Thu, May 04, 2017 at 12:52:06AM +0800, Ming Lei wrote:
>> On Wed, May 03, 2017 at 11:38:09PM +0800, Ming Lei wrote:
>>> On Wed, May 03, 2017 at 09:08:34AM -0600, Jens Axboe wrote:
>>>> On 05/03/2017 09:03 AM, Ming Lei wrote:
>>>>> On Wed, May 03, 2017 at 08:10:58AM -0600, Jens Axboe wrote:
>>>>>> On 05/03/2017 08:08 AM, Jens Axboe wrote:
>>>>>>> On 05/02/2017 10:03 PM, Ming Lei wrote:
>>>>>>>> On Fri, Apr 28, 2017 at 02:29:18PM -0600, Jens Axboe wrote:
>>>>>>>>> On 04/28/2017 09:15 AM, Ming Lei wrote:
>>>>>>>>>> Hi,
>>>>>>>>>>
>>>>>>>>>> This patchset introduces flag of BLK_MQ_F_SCHED_USE_HW_TAG and
>>>>>>>>>> allows to use hardware tag directly for IO scheduling if the queue's
>>>>>>>>>> depth is big enough. In this way, we can avoid to allocate extra tags
>>>>>>>>>> and request pool for IO schedule, and the schedule tag allocation/release
>>>>>>>>>> can be saved in I/O submit path.
>>>>>>>>>
>>>>>>>>> Ming, I like this approach, it's pretty clean. It'd be nice to have a
>>>>>>>>> bit of performance data to back up that it's useful to add this code,
>>>>>>>>> though.  Have you run anything on eg kyber on nvme that shows a
>>>>>>>>> reduction in overhead when getting rid of separate scheduler tags?
>>>>>>>>
>>>>>>>> I can observe small improvement in the following tests:
>>>>>>>>
>>>>>>>> 1) fio script
>>>>>>>> # io scheduler: kyber
>>>>>>>>
>>>>>>>> RWS="randread read randwrite write"
>>>>>>>> for RW in $RWS; do
>>>>>>>>         echo "Running test $RW"
>>>>>>>>         sudo echo 3 > /proc/sys/vm/drop_caches
>>>>>>>>         sudo fio --direct=1 --size=128G --bsrange=4k-4k --runtime=20 --numjobs=1 --ioengine=libaio --iodepth=10240 --group_reporting=1 --filename=$DISK --name=$DISK-test-$RW --rw=$RW --output-format=json
>>>>>>>> done
>>>>>>>>
>>>>>>>> 2) results
>>>>>>>>
>>>>>>>> ---------------------------------------------------------
>>>>>>>> 			|sched tag(iops/lat)	| use hw tag to sched(iops/lat)
>>>>>>>> ----------------------------------------------------------
>>>>>>>> randread	|188940/54107			| 193865/52734
>>>>>>>> ----------------------------------------------------------
>>>>>>>> read		|192646/53069			| 199738/51188
>>>>>>>> ----------------------------------------------------------
>>>>>>>> randwrite	|171048/59777			| 179038/57112
>>>>>>>> ----------------------------------------------------------
>>>>>>>> write		|171886/59492			| 181029/56491
>>>>>>>> ----------------------------------------------------------
>>>>>>>>
>>>>>>>> I guess it may be a bit more obvious when running the test on one slow
>>>>>>>> NVMe device, and will try to find one and run the test again.
>>>>>>>
>>>>>>> Thanks for running that. As I said in my original reply, I think this
>>>>>>> is a good optimization, and the implementation is clean. I'm fine with
>>>>>>> the current limitations of when to enable it, and it's not like we
>>>>>>> can't extend this later, if we want.
>>>>>>>
>>>>>>> I do agree with Bart that patch 1+4 should be combined. I'll do that.
>>>>>>
>>>>>> Actually, can you do that when reposting? Looks like you needed to
>>>>>> do that anyway.
>>>>>
>>>>> Yeah, I will do that in V1.
>>>>
>>>> V2? :-)
>>>>
>>>> Sounds good. I just wanted to check the numbers here, with the series
>>>> applied on top of for-linus crashes when switching to kyber. A few hunks
>>>
>>> Yeah, I saw that too, it has been fixed in my local tree, :-)
>>>
>>>> threw fuzz, but it looked fine to me. But I bet I fat fingered
>>>> something.  So it'd be great if you could respin against my for-linus
>>>> branch.
>>>
>>> Actually, that is exactly what I am doing, :-)
>>
>> Looks v4.11 plus your for-linus often triggers the following hang during
>> boot, and it seems caused by the change in (blk-mq: unify hctx delayed_run_work
>> and run_work)
>>
>>
>> UG: scheduling while atomic: kworker/0:1H/704/0x00000002
>> Modules linked in:
>> Preemption disabled at:
>> [<ffffffffaf5607bb>] virtio_queue_rq+0xdb/0x350
>> CPU: 0 PID: 704 Comm: kworker/0:1H Not tainted 4.11.0-04508-ga1f35f46164b #132
>> Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.9.3-1.fc25 04/01/2014
>> Workqueue: kblockd blk_mq_run_work_fn
>> Call Trace:
>>  dump_stack+0x65/0x8f
>>  ? virtio_queue_rq+0xdb/0x350
>>  __schedule_bug+0x76/0xc0
>>  __schedule+0x610/0x820
>>  ? new_slab+0x2c9/0x590
>>  schedule+0x40/0x90
>>  schedule_timeout+0x273/0x320
>>  ? ___slab_alloc+0x3cb/0x4f0
>>  wait_for_completion+0x97/0x100
>>  ? wait_for_completion+0x97/0x100
>>  ? wake_up_q+0x80/0x80
>>  flush_work+0x104/0x1a0
>>  ? flush_workqueue_prep_pwqs+0x130/0x130
>>  __cancel_work_timer+0xeb/0x160
>>  ? vp_notify+0x16/0x20
>>  ? virtqueue_add_sgs+0x23c/0x4a0
>>  cancel_delayed_work_sync+0x13/0x20
>>  blk_mq_stop_hw_queue+0x16/0x20
>>  virtio_queue_rq+0x316/0x350
>>  blk_mq_dispatch_rq_list+0x194/0x350
>>  blk_mq_sched_dispatch_requests+0x118/0x170
>>  ? finish_task_switch+0x80/0x1e0
>>  __blk_mq_run_hw_queue+0xa3/0xc0
>>  blk_mq_run_work_fn+0x2c/0x30
>>  process_one_work+0x1e0/0x400
>>  worker_thread+0x48/0x3f0
>>  kthread+0x109/0x140
>>  ? process_one_work+0x400/0x400
>>  ? kthread_create_on_node+0x40/0x40
>>  ret_from_fork+0x2c/0x40
> 
> Looks like we can't call cancel_delayed_work_sync() here.
> 
> Last time, I asked why the _sync is required, but it looks like I never
> got an answer, or I might have forgotten it.
> 
> Bart, could you explain it a bit why the _sync version of
> cancel work is required?

Yeah, that patch was buggy, we definitely can't use the sync variants
from a drivers ->queue_rq(). How about something like the below? For
the important internal callers, we should be able to pass _sync just
fine.

diff --git a/block/blk-mq.c b/block/blk-mq.c
index bf90684a007a..1e230a864129 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -41,6 +41,7 @@ static LIST_HEAD(all_q_list);
 
 static void blk_mq_poll_stats_start(struct request_queue *q);
 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
+static void __blk_mq_stop_hw_queues(struct request_queue *q, bool sync);
 
 static int blk_mq_poll_stats_bkt(const struct request *rq)
 {
@@ -166,7 +167,7 @@ void blk_mq_quiesce_queue(struct request_queue *q)
 	unsigned int i;
 	bool rcu = false;
 
-	blk_mq_stop_hw_queues(q);
+	__blk_mq_stop_hw_queues(q, true);
 
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (hctx->flags & BLK_MQ_F_BLOCKING)
@@ -1218,20 +1219,34 @@ bool blk_mq_queue_stopped(struct request_queue *q)
 }
 EXPORT_SYMBOL(blk_mq_queue_stopped);
 
-void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
+static void __blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx, bool sync)
 {
-	cancel_delayed_work_sync(&hctx->run_work);
+	if (sync)
+		cancel_delayed_work_sync(&hctx->run_work);
+	else
+		cancel_delayed_work(&hctx->run_work);
+
 	set_bit(BLK_MQ_S_STOPPED, &hctx->state);
 }
+
+void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
+{
+	__blk_mq_stop_hw_queue(hctx, false);
+}
 EXPORT_SYMBOL(blk_mq_stop_hw_queue);
 
-void blk_mq_stop_hw_queues(struct request_queue *q)
+void __blk_mq_stop_hw_queues(struct request_queue *q, bool sync)
 {
 	struct blk_mq_hw_ctx *hctx;
 	int i;
 
 	queue_for_each_hw_ctx(q, hctx, i)
-		blk_mq_stop_hw_queue(hctx);
+		__blk_mq_stop_hw_queue(hctx, sync);
+}
+
+void blk_mq_stop_hw_queues(struct request_queue *q)
+{
+	__blk_mq_stop_hw_queues(q, false);
 }
 EXPORT_SYMBOL(blk_mq_stop_hw_queues);
 

-- 
Jens Axboe

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [PATCH 0/4] blk-mq: support to use hw tag for scheduling
  2017-05-03 16:52               ` Ming Lei
  2017-05-03 17:03                 ` Ming Lei
@ 2017-05-03 17:08                 ` Bart Van Assche
  2017-05-03 17:11                   ` Jens Axboe
  2017-05-03 17:19                   ` Ming Lei
  1 sibling, 2 replies; 57+ messages in thread
From: Bart Van Assche @ 2017-05-03 17:08 UTC (permalink / raw)
  To: ming.lei, axboe; +Cc: hch, linux-block, osandov

On Thu, 2017-05-04 at 00:52 +0800, Ming Lei wrote:
> Looks v4.11 plus your for-linus often triggers the following hang during
> boot, and it seems caused by the change in (blk-mq: unify hctx delayed_run_work
> and run_work)
> 
> 
> BUG: scheduling while atomic: kworker/0:1H/704/0x00000002
> Modules linked in:
> Preemption disabled at:
> [<ffffffffaf5607bb>] virtio_queue_rq+0xdb/0x350
> CPU: 0 PID: 704 Comm: kworker/0:1H Not tainted 4.11.0-04508-ga1f35f46164b #132
> Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.9.3-1.fc25 04/01/2014
> Workqueue: kblockd blk_mq_run_work_fn
> Call Trace:
>  dump_stack+0x65/0x8f
>  ? virtio_queue_rq+0xdb/0x350
>  __schedule_bug+0x76/0xc0
>  __schedule+0x610/0x820
>  ? new_slab+0x2c9/0x590
>  schedule+0x40/0x90
>  schedule_timeout+0x273/0x320
>  ? ___slab_alloc+0x3cb/0x4f0
>  wait_for_completion+0x97/0x100
>  ? wait_for_completion+0x97/0x100
>  ? wake_up_q+0x80/0x80
>  flush_work+0x104/0x1a0
>  ? flush_workqueue_prep_pwqs+0x130/0x130
>  __cancel_work_timer+0xeb/0x160
>  ? vp_notify+0x16/0x20
>  ? virtqueue_add_sgs+0x23c/0x4a0
>  cancel_delayed_work_sync+0x13/0x20
>  blk_mq_stop_hw_queue+0x16/0x20
>  virtio_queue_rq+0x316/0x350
>  blk_mq_dispatch_rq_list+0x194/0x350
>  blk_mq_sched_dispatch_requests+0x118/0x170
>  ? finish_task_switch+0x80/0x1e0
>  __blk_mq_run_hw_queue+0xa3/0xc0
>  blk_mq_run_work_fn+0x2c/0x30
>  process_one_work+0x1e0/0x400
>  worker_thread+0x48/0x3f0
>  kthread+0x109/0x140
>  ? process_one_work+0x400/0x400
>  ? kthread_create_on_node+0x40/0x40
>  ret_from_fork+0x2c/0x40

Callers of blk_mq_quiesce_queue() really need blk_mq_stop_hw_queue() to
cancel delayed work synchronously. The above call stack shows that we have
to do something about the blk_mq_stop_hw_queue() calls from inside .queue_rq()
functions for queues for which BLK_MQ_F_BLOCKING has not been set. I'm not
sure what the best approach would be: setting BLK_MQ_F_BLOCKING for queues
that call blk_mq_stop_hw_queue() from inside .queue_rq() or creating two
versions of blk_mq_stop_hw_queue().
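
(For illustration, the first option would only be a per-driver change at
tag set initialization time, something like the line below; which flags
each affected driver already sets would of course have to be checked:)

	set->flags |= BLK_MQ_F_BLOCKING;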

Bart.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 0/4] blk-mq: support to use hw tag for scheduling
  2017-05-03 17:08                 ` Bart Van Assche
@ 2017-05-03 17:11                   ` Jens Axboe
  2017-05-03 17:19                   ` Ming Lei
  1 sibling, 0 replies; 57+ messages in thread
From: Jens Axboe @ 2017-05-03 17:11 UTC (permalink / raw)
  To: Bart Van Assche, ming.lei; +Cc: hch, linux-block, osandov

On 05/03/2017 11:08 AM, Bart Van Assche wrote:
> On Thu, 2017-05-04 at 00:52 +0800, Ming Lei wrote:
>> Looks v4.11 plus your for-linus often triggers the following hang during
>> boot, and it seems caused by the change in (blk-mq: unify hctx delayed_run_work
>> and run_work)
>>
>>
>> BUG: scheduling while atomic: kworker/0:1H/704/0x00000002
>> Modules linked in:
>> Preemption disabled at:
>> [<ffffffffaf5607bb>] virtio_queue_rq+0xdb/0x350
>> CPU: 0 PID: 704 Comm: kworker/0:1H Not tainted 4.11.0-04508-ga1f35f46164b #132
>> Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.9.3-1.fc25 04/01/2014
>> Workqueue: kblockd blk_mq_run_work_fn
>> Call Trace:
>>  dump_stack+0x65/0x8f
>>  ? virtio_queue_rq+0xdb/0x350
>>  __schedule_bug+0x76/0xc0
>>  __schedule+0x610/0x820
>>  ? new_slab+0x2c9/0x590
>>  schedule+0x40/0x90
>>  schedule_timeout+0x273/0x320
>>  ? ___slab_alloc+0x3cb/0x4f0
>>  wait_for_completion+0x97/0x100
>>  ? wait_for_completion+0x97/0x100
>>  ? wake_up_q+0x80/0x80
>>  flush_work+0x104/0x1a0
>>  ? flush_workqueue_prep_pwqs+0x130/0x130
>>  __cancel_work_timer+0xeb/0x160
>>  ? vp_notify+0x16/0x20
>>  ? virtqueue_add_sgs+0x23c/0x4a0
>>  cancel_delayed_work_sync+0x13/0x20
>>  blk_mq_stop_hw_queue+0x16/0x20
>>  virtio_queue_rq+0x316/0x350
>>  blk_mq_dispatch_rq_list+0x194/0x350
>>  blk_mq_sched_dispatch_requests+0x118/0x170
>>  ? finish_task_switch+0x80/0x1e0
>>  __blk_mq_run_hw_queue+0xa3/0xc0
>>  blk_mq_run_work_fn+0x2c/0x30
>>  process_one_work+0x1e0/0x400
>>  worker_thread+0x48/0x3f0
>>  kthread+0x109/0x140
>>  ? process_one_work+0x400/0x400
>>  ? kthread_create_on_node+0x40/0x40
>>  ret_from_fork+0x2c/0x40
> 
> Callers of blk_mq_quiesce_queue() really need blk_mq_stop_hw_queue() to
> cancel delayed work synchronously.

Right.

> The above call stack shows that we have to do something about the
> blk_mq_stop_hw_queue() calls from inside .queue_rq() functions for
> queues for which BLK_MQ_F_BLOCKING has not been set. I'm not sure what
> the best approach would be: setting BLK_MQ_F_BLOCKING for queues that
> call blk_mq_stop_hw_queue() from inside .queue_rq() or creating two
> versions of blk_mq_stop_hw_queue().

Regardless of whether BLOCKING is set or not, we don't have to hard
guarantee the flush from the drivers. If they do happen to get a 2nd
invocation before being stopped, that doesn't matter. So I think we're
fine with the patch I sent out 5 minutes ago, would be great if Ming
could test it though.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 0/4] blk-mq: support to use hw tag for scheduling
  2017-05-03 17:07                   ` Jens Axboe
@ 2017-05-03 17:15                     ` Bart Van Assche
  2017-05-03 17:24                       ` Jens Axboe
  0 siblings, 1 reply; 57+ messages in thread
From: Bart Van Assche @ 2017-05-03 17:15 UTC (permalink / raw)
  To: axboe, ming.lei; +Cc: hch, linux-block, osandov

On Wed, 2017-05-03 at 11:07 -0600, Jens Axboe wrote:
> +void blk_mq_stop_hw_queues(struct request_queue *q)
> +{
> +	__blk_mq_stop_hw_queues(q, false);
>  }
>  EXPORT_SYMBOL(blk_mq_stop_hw_queues);

Hello Jens,

So the approach of this patch is to make all blk_mq_stop_hw_queue()
and blk_mq_stop_hw_queues() callers cancel run_work without waiting?
That should make the BUG reported by Ming disappear. However, I think
we may want to review all calls from block drivers to
blk_mq_stop_hw_queues(). There are drivers that call this function to
quiesce I/O so I think these need the synchronous work cancellation ...

Bart.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 0/4] blk-mq: support to use hw tag for scheduling
  2017-05-03 17:08                 ` Bart Van Assche
  2017-05-03 17:11                   ` Jens Axboe
@ 2017-05-03 17:19                   ` Ming Lei
  2017-05-03 17:41                     ` Bart Van Assche
  1 sibling, 1 reply; 57+ messages in thread
From: Ming Lei @ 2017-05-03 17:19 UTC (permalink / raw)
  To: Bart Van Assche; +Cc: axboe, hch, linux-block, osandov

On Wed, May 03, 2017 at 05:08:30PM +0000, Bart Van Assche wrote:
> On Thu, 2017-05-04 at 00:52 +0800, Ming Lei wrote:
> > Looks v4.11 plus your for-linus often triggers the following hang during
> > boot, and it seems caused by the change in (blk-mq: unify hctx delayed_run_work
> > and run_work)
> > 
> > 
> > BUG: scheduling while atomic: kworker/0:1H/704/0x00000002
> > Modules linked in:
> > Preemption disabled at:
> > [<ffffffffaf5607bb>] virtio_queue_rq+0xdb/0x350
> > CPU: 0 PID: 704 Comm: kworker/0:1H Not tainted 4.11.0-04508-ga1f35f46164b #132
> > Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.9.3-1.fc25 04/01/2014
> > Workqueue: kblockd blk_mq_run_work_fn
> > Call Trace:
> >  dump_stack+0x65/0x8f
> >  ? virtio_queue_rq+0xdb/0x350
> >  __schedule_bug+0x76/0xc0
> >  __schedule+0x610/0x820
> >  ? new_slab+0x2c9/0x590
> >  schedule+0x40/0x90
> >  schedule_timeout+0x273/0x320
> >  ? ___slab_alloc+0x3cb/0x4f0
> >  wait_for_completion+0x97/0x100
> >  ? wait_for_completion+0x97/0x100
> >  ? wake_up_q+0x80/0x80
> >  flush_work+0x104/0x1a0
> >  ? flush_workqueue_prep_pwqs+0x130/0x130
> >  __cancel_work_timer+0xeb/0x160
> >  ? vp_notify+0x16/0x20
> >  ? virtqueue_add_sgs+0x23c/0x4a0
> >  cancel_delayed_work_sync+0x13/0x20
> >  blk_mq_stop_hw_queue+0x16/0x20
> >  virtio_queue_rq+0x316/0x350
> >  blk_mq_dispatch_rq_list+0x194/0x350
> >  blk_mq_sched_dispatch_requests+0x118/0x170
> >  ? finish_task_switch+0x80/0x1e0
> >  __blk_mq_run_hw_queue+0xa3/0xc0
> >  blk_mq_run_work_fn+0x2c/0x30
> >  process_one_work+0x1e0/0x400
> >  worker_thread+0x48/0x3f0
> >  kthread+0x109/0x140
> >  ? process_one_work+0x400/0x400
> >  ? kthread_create_on_node+0x40/0x40
> >  ret_from_fork+0x2c/0x40
> 
> Callers of blk_mq_quiesce_queue() really need blk_mq_stop_hw_queue() to
> cancel delayed work synchronously. The above call stack shows that we have
> to do something about the blk_mq_stop_hw_queue() calls from inside .queue_rq()
> functions for queues for which BLK_MQ_F_BLOCKING has not been set. I'm not
> sure what the best approach would be: setting BLK_MQ_F_BLOCKING for queues
> that call blk_mq_stop_hw_queue() from inside .queue_rq() or creating two
> versions of blk_mq_stop_hw_queue().

I think either synchronize_srcu() or synchronize_rcu() can handle 
this, since they wait for completion of .queue_rq(). So it looks like
cancel_delayed_work_sync() shouldn't be needed?
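
In other words, blk_mq_quiesce_queue() already does roughly the
following after stopping the queues (paraphrased; the srcu field name
may not match the tree exactly):

	queue_for_each_hw_ctx(q, hctx, i) {
		if (hctx->flags & BLK_MQ_F_BLOCKING)
			synchronize_srcu(&hctx->queue_rq_srcu);
		else
			rcu = true;
	}
	if (rcu)
		synchronize_rcu();

and that wait already covers any .queue_rq() still running.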


Thanks,
Ming

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 0/4] blk-mq: support to use hw tag for scheduling
  2017-05-03 17:15                     ` Bart Van Assche
@ 2017-05-03 17:24                       ` Jens Axboe
  2017-05-03 17:35                         ` Bart Van Assche
  0 siblings, 1 reply; 57+ messages in thread
From: Jens Axboe @ 2017-05-03 17:24 UTC (permalink / raw)
  To: Bart Van Assche, ming.lei; +Cc: hch, linux-block, osandov

On 05/03/2017 11:15 AM, Bart Van Assche wrote:
> On Wed, 2017-05-03 at 11:07 -0600, Jens Axboe wrote:
>> +void blk_mq_stop_hw_queues(struct request_queue *q)
>> +{
>> +	__blk_mq_stop_hw_queues(q, false);
>>  }
>>  EXPORT_SYMBOL(blk_mq_stop_hw_queues);
> 
> Hello Jens,
> 
> So the approach of this patch is to make all blk_mq_stop_hw_queue()
> and blk_mq_stop_hw_queues() callers cancel run_work without waiting?
> That should make the BUG reported by Ming disappear. However, I think
> we may want to review all calls from block drivers to
> blk_mq_stop_hw_queues(). There are drivers that call this function to
> quiesce I/O so I think these need the synchronous work cancellation
> ...

I took a look at the drivers, and it's trivial enough to do. Most of
them will work fine with a sync block for stopping _all_ queues. See
below.


diff --git a/block/blk-mq.c b/block/blk-mq.c
index e339247a2570..9dcb9592dbf9 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -166,7 +166,7 @@ void blk_mq_quiesce_queue(struct request_queue *q)
 	unsigned int i;
 	bool rcu = false;
 
-	blk_mq_stop_hw_queues(q);
+	blk_mq_stop_hw_queues(q, true);
 
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (hctx->flags & BLK_MQ_F_BLOCKING)
@@ -1218,20 +1218,29 @@ bool blk_mq_queue_stopped(struct request_queue *q)
 }
 EXPORT_SYMBOL(blk_mq_queue_stopped);
 
-void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
+static void __blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx, bool sync)
 {
-	cancel_delayed_work_sync(&hctx->run_work);
+	if (sync)
+		cancel_delayed_work_sync(&hctx->run_work);
+	else
+		cancel_delayed_work(&hctx->run_work);
+
 	set_bit(BLK_MQ_S_STOPPED, &hctx->state);
 }
+
+void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
+{
+	__blk_mq_stop_hw_queue(hctx, false);
+}
 EXPORT_SYMBOL(blk_mq_stop_hw_queue);
 
-void blk_mq_stop_hw_queues(struct request_queue *q)
+void blk_mq_stop_hw_queues(struct request_queue *q, bool sync)
 {
 	struct blk_mq_hw_ctx *hctx;
 	int i;
 
 	queue_for_each_hw_ctx(q, hctx, i)
-		blk_mq_stop_hw_queue(hctx);
+		__blk_mq_stop_hw_queue(hctx, sync);
 }
 EXPORT_SYMBOL(blk_mq_stop_hw_queues);
 
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 3a779a4f5653..33b5d1382c45 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -950,7 +950,7 @@ static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout)
 	unsigned long to;
 	bool active = true;
 
-	blk_mq_stop_hw_queues(port->dd->queue);
+	blk_mq_stop_hw_queues(port->dd->queue, true);
 
 	to = jiffies + msecs_to_jiffies(timeout);
 	do {
@@ -4009,7 +4009,7 @@ static int mtip_block_remove(struct driver_data *dd)
 						dd->disk->disk_name);
 
 	blk_freeze_queue_start(dd->queue);
-	blk_mq_stop_hw_queues(dd->queue);
+	blk_mq_stop_hw_queues(dd->queue, true);
 	blk_mq_tagset_busy_iter(&dd->tags, mtip_no_dev_cleanup, dd);
 
 	/*
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 6b98ec2a3824..ce5490938232 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -656,7 +656,7 @@ static void nbd_clear_req(struct request *req, void *data, bool reserved)
 
 static void nbd_clear_que(struct nbd_device *nbd)
 {
-	blk_mq_stop_hw_queues(nbd->disk->queue);
+	blk_mq_stop_hw_queues(nbd->disk->queue, true);
 	blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL);
 	blk_mq_start_hw_queues(nbd->disk->queue);
 	dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n");
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 94173de1efaa..a73d2823cdb2 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -844,7 +844,7 @@ static int virtblk_freeze(struct virtio_device *vdev)
 	/* Make sure no work handler is accessing the device. */
 	flush_work(&vblk->config_work);
 
-	blk_mq_stop_hw_queues(vblk->disk->queue);
+	blk_mq_stop_hw_queues(vblk->disk->queue, true);
 
 	vdev->config->del_vqs(vdev);
 	return 0;
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 70e689bf1cad..b6891e4af837 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -1969,7 +1969,7 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
 			return BLK_MQ_RQ_QUEUE_ERROR;
 
 		if (op->rq) {
-			blk_mq_stop_hw_queues(op->rq->q);
+			blk_mq_stop_hw_queues(op->rq->q, false);
 			blk_mq_delay_queue(queue->hctx, NVMEFC_QUEUE_DELAY);
 		}
 		return BLK_MQ_RQ_QUEUE_BUSY;
@@ -2478,7 +2478,7 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
 	 * use blk_mq_tagset_busy_itr() and the transport routine to
 	 * terminate the exchanges.
 	 */
-	blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
+	blk_mq_stop_hw_queues(ctrl->ctrl.admin_q, true);
 	blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
 				nvme_fc_terminate_exchange, &ctrl->ctrl);
 
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 56a315bd4d96..6e87854eddd5 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1084,7 +1084,7 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq)
 	spin_unlock_irq(&nvmeq->q_lock);
 
 	if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
-		blk_mq_stop_hw_queues(nvmeq->dev->ctrl.admin_q);
+		blk_mq_stop_hw_queues(nvmeq->dev->ctrl.admin_q, true);
 
 	free_irq(vector, nvmeq);
 
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index dd1c6deef82f..b478839c5d79 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -792,7 +792,7 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
 	return;
 
 stop_admin_q:
-	blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
+	blk_mq_stop_hw_queues(ctrl->ctrl.admin_q, true);
 requeue:
 	dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n",
 			ctrl->ctrl.opts->nr_reconnects);
@@ -814,7 +814,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
 
 	if (ctrl->queue_count > 1)
 		nvme_stop_queues(&ctrl->ctrl);
-	blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
+	blk_mq_stop_hw_queues(ctrl->ctrl.admin_q, true);
 
 	/* We must take care of fastfail/requeue all our inflight requests */
 	if (ctrl->queue_count > 1)
@@ -1651,7 +1651,7 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl)
 	if (test_bit(NVME_RDMA_Q_CONNECTED, &ctrl->queues[0].flags))
 		nvme_shutdown_ctrl(&ctrl->ctrl);
 
-	blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
+	blk_mq_stop_hw_queues(ctrl->ctrl.admin_q, true);
 	blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
 				nvme_cancel_request, &ctrl->ctrl);
 	nvme_rdma_destroy_admin_queue(ctrl);
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index feb497134aee..ea0911e36f2d 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -446,7 +446,7 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl)
 	if (ctrl->ctrl.state == NVME_CTRL_LIVE)
 		nvme_shutdown_ctrl(&ctrl->ctrl);
 
-	blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
+	blk_mq_stop_hw_queues(ctrl->ctrl.admin_q, true);
 	blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
 				nvme_cancel_request, &ctrl->ctrl);
 	nvme_loop_destroy_admin_queue(ctrl);
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 327b10206d63..64100c6b5811 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -2976,7 +2976,7 @@ scsi_internal_device_block(struct scsi_device *sdev, bool wait)
 		if (wait)
 			blk_mq_quiesce_queue(q);
 		else
-			blk_mq_stop_hw_queues(q);
+			blk_mq_stop_hw_queues(q, true);
 	} else {
 		spin_lock_irqsave(q->queue_lock, flags);
 		blk_stop_queue(q);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index a104832e7ae5..1f6684aa465f 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -239,7 +239,7 @@ void blk_mq_complete_request(struct request *rq);
 bool blk_mq_queue_stopped(struct request_queue *q);
 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx);
 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx);
-void blk_mq_stop_hw_queues(struct request_queue *q);
+void blk_mq_stop_hw_queues(struct request_queue *q, bool sync);
 void blk_mq_start_hw_queues(struct request_queue *q);
 void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);

-- 
Jens Axboe

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [PATCH 3/4] blk-mq: use hw tag for scheduling if hw tag space is big enough
  2017-05-03 17:00       ` Omar Sandoval
@ 2017-05-03 17:33         ` Ming Lei
  0 siblings, 0 replies; 57+ messages in thread
From: Ming Lei @ 2017-05-03 17:33 UTC (permalink / raw)
  To: Omar Sandoval; +Cc: Jens Axboe, linux-block, Christoph Hellwig, Omar Sandoval

On Wed, May 03, 2017 at 10:00:04AM -0700, Omar Sandoval wrote:
> On Thu, May 04, 2017 at 12:55:30AM +0800, Ming Lei wrote:
> > On Wed, May 03, 2017 at 09:29:36AM -0700, Omar Sandoval wrote:
> > > On Fri, Apr 28, 2017 at 11:15:38PM +0800, Ming Lei wrote:
> > > > When tag space of one device is big enough, we use hw tag
> > > > directly for I/O scheduling.
> > > > 
> > > > Now the decision is made if hw queue depth is not less than
> > > > q->nr_requests and the tag set isn't shared.
> > > > 
> > > > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> > > > ---
> > > >  block/blk-mq-sched.c |  8 ++++++++
> > > >  block/blk-mq-sched.h | 15 +++++++++++++++
> > > >  block/blk-mq.c       | 18 +++++++++++++++++-
> > > >  3 files changed, 40 insertions(+), 1 deletion(-)
> > > > 
> > > > diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> > > > index 45a675f07b8b..4681e27c127e 100644
> > > > --- a/block/blk-mq-sched.c
> > > > +++ b/block/blk-mq-sched.c
> > > > @@ -507,6 +507,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
> > > >  	struct elevator_queue *eq;
> > > >  	unsigned int i;
> > > >  	int ret;
> > > > +	bool auto_hw_tag;
> > > >  
> > > >  	if (!e) {
> > > >  		q->elevator = NULL;
> > > > @@ -519,7 +520,14 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
> > > >  	 */
> > > >  	q->nr_requests = 2 * BLKDEV_MAX_RQ;
> > > >  
> > > > +	auto_hw_tag = blk_mq_sched_may_use_hw_tag(q);
> > > > +
> > > >  	queue_for_each_hw_ctx(q, hctx, i) {
> > > > +		if (auto_hw_tag)
> > > > +			hctx->flags |= BLK_MQ_F_SCHED_USE_HW_TAG;
> > > > +		else
> > > > +			hctx->flags &= ~BLK_MQ_F_SCHED_USE_HW_TAG;
> > > > +
> > > >  		ret = blk_mq_sched_alloc_tags(q, hctx, i);
> > > >  		if (ret)
> > > >  			goto err;
> > > 
> > > I think you should also clear the BLK_MQ_F_SCHED_USE_HW_TAG flag in
> > > blk_mq_exit_sched()?
> > 
> > Looks not necessary since the flag is always evaluated in
> > blk_mq_init_sched().
> 
> What if we're setting the scheduler to "none"? Then blk_mq_init_sched()
> will go in here:
> 
> if (!e) {
> 	q->elevator = NULL;
> 	return 0;
> }

Good catch, will clear the flag in blk_mq_exit_sched() in V2.
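
Roughly like this in blk_mq_exit_sched(), as a sketch of what V2 will
do (not the final patch):

	queue_for_each_hw_ctx(q, hctx, i)
		hctx->flags &= ~BLK_MQ_F_SCHED_USE_HW_TAG;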

Thanks,
Ming

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 0/4] blk-mq: support to use hw tag for scheduling
  2017-05-03 17:24                       ` Jens Axboe
@ 2017-05-03 17:35                         ` Bart Van Assche
  2017-05-03 17:40                           ` Jens Axboe
  0 siblings, 1 reply; 57+ messages in thread
From: Bart Van Assche @ 2017-05-03 17:35 UTC (permalink / raw)
  To: axboe, ming.lei; +Cc: hch, linux-block, osandov

On Wed, 2017-05-03 at 11:24 -0600, Jens Axboe wrote:
> diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
> index 3a779a4f5653..33b5d1382c45 100644
> --- a/drivers/block/mtip32xx/mtip32xx.c
> +++ b/drivers/block/mtip32xx/mtip32xx.c
> [ ... ]
> @@ -4009,7 +4009,7 @@ static int mtip_block_remove(struct driver_data *dd)
>  						dd->disk->disk_name);
> 
>  	blk_freeze_queue_start(dd->queue);
> -	blk_mq_stop_hw_queues(dd->queue);
> +	blk_mq_stop_hw_queues(dd->queue, true);
>  	blk_mq_tagset_busy_iter(&dd->tags, mtip_no_dev_cleanup, dd);
> =20
>  	/*

Hello Jens,

How about replacing the blk_freeze_queue_start() and blk_mq_stop_hw_queues()
calls by a single call to blk_set_queue_dying()? I think that would be more
appropriate in the context of mtip_block_remove().
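
I.e. something like this (untested sketch):

	/* mark the queue as dying instead of freezing + stopping it */
	blk_set_queue_dying(dd->queue);
	blk_mq_tagset_busy_iter(&dd->tags, mtip_no_dev_cleanup, dd);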

> diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
> index 6b98ec2a3824..ce5490938232 100644
> --- a/drivers/block/nbd.c
> +++ b/drivers/block/nbd.c
> @@ -656,7 +656,7 @@ static void nbd_clear_req(struct request *req, void *data, bool reserved)
> 
>  static void nbd_clear_que(struct nbd_device *nbd)
>  {
> -	blk_mq_stop_hw_queues(nbd->disk->queue);
> +	blk_mq_stop_hw_queues(nbd->disk->queue, true);
>  	blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL);
>  	blk_mq_start_hw_queues(nbd->disk->queue);
>  	dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n");

A similar comment here: since nbd_clear_que() is called just before shutting
down the block layer queue in the NBD driver, how about replacing the
blk_mq_stop_hw_queues() / blk_mq_start_hw_queues() pair by a single
blk_set_queue_dying() call?
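
Roughly (again an untested sketch; whether "dying" is really appropriate
for every nbd_clear_que() caller would need to be checked):

	static void nbd_clear_que(struct nbd_device *nbd)
	{
		/* no new I/O can enter once the queue is marked dying */
		blk_set_queue_dying(nbd->disk->queue);
		/* error out everything that is still queued */
		blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL);
		dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n");
	}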

> diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
> index 94173de1efaa..a73d2823cdb2 100644
> --- a/drivers/block/virtio_blk.c
> +++ b/drivers/block/virtio_blk.c
> @@ -844,7 +844,7 @@ static int virtblk_freeze(struct virtio_device *vdev)
>  	/* Make sure no work handler is accessing the device. */
>  	flush_work(&vblk->config_work);
> 
> -	blk_mq_stop_hw_queues(vblk->disk->queue);
> +	blk_mq_stop_hw_queues(vblk->disk->queue, true);
> 
>  	vdev->config->del_vqs(vdev);
>  	return 0;

Since the above blk_mq_stop_hw_queues() call is followed by a .del_vqs() call,
shouldn't the blk_mq_stop_hw_queues() / blk_mq_start_hw_queues() pair in the
virtio_blk driver be changed into a blk_mq_freeze_queue() /
blk_mq_unfreeze_queue() pair?
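
I.e. something like this (untested sketch) on the freeze side:

	/* Make sure no work handler is accessing the device. */
	flush_work(&vblk->config_work);

	/* wait for all in-flight requests to finish before the vqs go away */
	blk_mq_freeze_queue(vblk->disk->queue);

	vdev->config->del_vqs(vdev);
	return 0;

with a matching blk_mq_unfreeze_queue() call on the restore side.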

Thanks,

Bart.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 0/4] blk-mq: support to use hw tag for scheduling
  2017-05-03 17:35                         ` Bart Van Assche
@ 2017-05-03 17:40                           ` Jens Axboe
  2017-05-03 17:43                             ` Bart Van Assche
  0 siblings, 1 reply; 57+ messages in thread
From: Jens Axboe @ 2017-05-03 17:40 UTC (permalink / raw)
  To: Bart Van Assche, ming.lei; +Cc: hch, linux-block, osandov

On 05/03/2017 11:35 AM, Bart Van Assche wrote:
> On Wed, 2017-05-03 at 11:24 -0600, Jens Axboe wrote:
>> diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
>> index 3a779a4f5653..33b5d1382c45 100644
>> --- a/drivers/block/mtip32xx/mtip32xx.c
>> +++ b/drivers/block/mtip32xx/mtip32xx.c
>> [ ... ]
>> @@ -4009,7 +4009,7 @@ static int mtip_block_remove(struct driver_data *dd)
>>  						dd->disk->disk_name);
>>  
>>  	blk_freeze_queue_start(dd->queue);
>> -	blk_mq_stop_hw_queues(dd->queue);
>> +	blk_mq_stop_hw_queues(dd->queue, true);
>>  	blk_mq_tagset_busy_iter(&dd->tags, mtip_no_dev_cleanup, dd);
>>  
>>  	/*
> 
> Hello Jens,
> 
> How about replacing the blk_freeze_queue_start() and blk_mq_stop_hw_queues()
> calls by a single call to blk_set_queue_dying()? I think that would be more
> appropriate in the context of mtip_block_remove().

Looking at all of the drivers, it's somewhat of a mess and a lot of it looks
like it's not even needed. I'm going to propose that we do the below first,
since it fixes the immediate regression and makes us no worse than 4.11
already was. Then we can do a round two of improving the situation, but
I'd prefer not to turn the immediate fix into a huge round of fixes and
driver touching.


From 66e22c6402af135cdc9913d2e298107629b02cdb Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Wed, 3 May 2017 11:08:14 -0600
Subject: [PATCH] blk-mq: don't use sync workqueue flushing from drivers

A previous commit introduced the sync flush, which we need from
internal callers like blk_mq_quiesce_queue(). However, we also
call the stop helpers from drivers, particularly from ->queue_rq()
when we have to stop processing for a bit. We can't block from
those locations, and we don't have to guarantee that we're
fully flushed.

Fixes: 9f993737906b ("blk-mq: unify hctx delayed_run_work and run_work")
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 block/blk-mq.c | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index e339247a2570..dec70ca0aafd 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -41,6 +41,7 @@ static LIST_HEAD(all_q_list);
 
 static void blk_mq_poll_stats_start(struct request_queue *q);
 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
+static void __blk_mq_stop_hw_queues(struct request_queue *q, bool sync);
 
 static int blk_mq_poll_stats_bkt(const struct request *rq)
 {
@@ -166,7 +167,7 @@ void blk_mq_quiesce_queue(struct request_queue *q)
 	unsigned int i;
 	bool rcu = false;
 
-	blk_mq_stop_hw_queues(q);
+	__blk_mq_stop_hw_queues(q, true);
 
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (hctx->flags & BLK_MQ_F_BLOCKING)
@@ -1218,20 +1219,34 @@ bool blk_mq_queue_stopped(struct request_queue *q)
 }
 EXPORT_SYMBOL(blk_mq_queue_stopped);
 
-void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
+static void __blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx, bool sync)
 {
-	cancel_delayed_work_sync(&hctx->run_work);
+	if (sync)
+		cancel_delayed_work_sync(&hctx->run_work);
+	else
+		cancel_delayed_work(&hctx->run_work);
+
 	set_bit(BLK_MQ_S_STOPPED, &hctx->state);
 }
+
+void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
+{
+	__blk_mq_stop_hw_queue(hctx, false);
+}
 EXPORT_SYMBOL(blk_mq_stop_hw_queue);
 
-void blk_mq_stop_hw_queues(struct request_queue *q)
+void __blk_mq_stop_hw_queues(struct request_queue *q, bool sync)
 {
 	struct blk_mq_hw_ctx *hctx;
 	int i;
 
 	queue_for_each_hw_ctx(q, hctx, i)
-		blk_mq_stop_hw_queue(hctx);
+		__blk_mq_stop_hw_queue(hctx, sync);
+}
+
+void blk_mq_stop_hw_queues(struct request_queue *q)
+{
+	__blk_mq_stop_hw_queues(q, false);
 }
 EXPORT_SYMBOL(blk_mq_stop_hw_queues);
 
-- 
2.7.4

-- 
Jens Axboe

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [PATCH 0/4] blk-mq: support to use hw tag for scheduling
  2017-05-03 17:19                   ` Ming Lei
@ 2017-05-03 17:41                     ` Bart Van Assche
  0 siblings, 0 replies; 57+ messages in thread
From: Bart Van Assche @ 2017-05-03 17:41 UTC (permalink / raw)
  To: ming.lei; +Cc: hch, linux-block, osandov, axboe

On Thu, 2017-05-04 at 01:19 +0800, Ming Lei wrote:
> On Wed, May 03, 2017 at 05:08:30PM +0000, Bart Van Assche wrote:
> > Callers of blk_mq_quiesce_queue() really need blk_mq_stop_hw_queue() to
> > cancel delayed work synchronously. The above call stack shows that we have
> > to do something about the blk_mq_stop_hw_queue() calls from inside .queue_rq()
> > functions for queues for which BLK_MQ_F_BLOCKING has not been set. I'm not
> > sure what the best approach would be: setting BLK_MQ_F_BLOCKING for queues
> > that call blk_mq_stop_hw_queue() from inside .queue_rq() or creating two
> > versions of blk_mq_stop_hw_queue().
> 
> I think either synchronize_srcu() or synchronize_rcu() can handle
> this, since they are waiting for completion of .queue_rq(). So looks
> cancel_delayed_work_sync() shouldn't be needed?

Hello Ming,

What you wrote is not correct. synchronize_*rcu() only waits for ongoing
.queue_rq() calls. cancel_delayed_work() will return immediately if it is
called after hctx->run_work has already been started. And the hctx->run_work
handler can trigger new .queue_rq() calls after synchronize_*rcu() has
already returned.

Bart.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 0/4] blk-mq: support to use hw tag for scheduling
  2017-05-03 17:40                           ` Jens Axboe
@ 2017-05-03 17:43                             ` Bart Van Assche
  0 siblings, 0 replies; 57+ messages in thread
From: Bart Van Assche @ 2017-05-03 17:43 UTC (permalink / raw)
  To: axboe, ming.lei; +Cc: hch, linux-block, osandov

On Wed, 2017-05-03 at 11:40 -0600, Jens Axboe wrote:
> Subject: [PATCH] blk-mq: don't use sync workqueue flushing from drivers
>=20
> A previous commit introduced the sync flush, which we need from
> internal callers like blk_mq_quiesce_queue(). However, we also
> call the stop helpers from drivers, particularly from ->queue_rq()
> when we have to stop processing for a bit. We can't block from
> those locations, and we don't have to guarantee that we're
> fully flushed.

Reviewed-by: Bart Van Assche <Bart.VanAssche@sandisk.com>

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 1/4] blk-mq: introduce BLK_MQ_F_SCHED_USE_HW_TAG
  2017-05-03 16:46   ` Omar Sandoval
@ 2017-05-03 20:13     ` Ming Lei
  2017-05-03 21:40       ` Omar Sandoval
  0 siblings, 1 reply; 57+ messages in thread
From: Ming Lei @ 2017-05-03 20:13 UTC (permalink / raw)
  To: Omar Sandoval
  Cc: Ming Lei, Jens Axboe, linux-block, Christoph Hellwig, Omar Sandoval

On Thu, May 4, 2017 at 12:46 AM, Omar Sandoval <osandov@osandov.com> wrote:
> On Fri, Apr 28, 2017 at 11:15:36PM +0800, Ming Lei wrote:
>> When blk-mq I/O scheduler is used, we need two tags for
>> submitting one request. One is called scheduler tag for
>> allocating request and scheduling I/O, another one is called
>> driver tag, which is used for dispatching IO to hardware/driver.
>> This way introduces one extra per-queue allocation for both tags
>> and request pool, and may not be as efficient as case of none
>> scheduler.
>>
>> Also currently we put a default per-hctx limit on schedulable
>> requests, and this limit may be a bottleneck for some devices,
>> especialy when these devices have a quite big tag space.
>>
>> This patch introduces BLK_MQ_F_SCHED_USE_HW_TAG so that we can
>> allow to use hardware/driver tags directly for IO scheduling if
>> devices's hardware tag space is big enough. Then we can avoid
>> the extra resource allocation and make IO submission more
>> efficient.
>>
>> Signed-off-by: Ming Lei <ming.lei@redhat.com>
>> ---
>>  block/blk-mq-sched.c   | 10 +++++++++-
>>  block/blk-mq.c         | 35 +++++++++++++++++++++++++++++------
>>  include/linux/blk-mq.h |  1 +
>>  3 files changed, 39 insertions(+), 7 deletions(-)
>
> One more note on this: if we're using the hardware tags directly, then
> we are no longer limited to q->nr_requests requests in-flight. Instead,
> we're limited to the hw queue depth. We probably want to maintain the
> original behavior,

That needs further investigation, and generally a scheduler should be happy
with more requests which can be scheduled.

We can make it a follow-up.

> so I think we need to resize the hw tags in blk_mq_init_sched() if we're using hardware tags.

That might not be good since hw tags are used by both scheduler and dispatching.


Thanks,
Ming Lei

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 1/4] blk-mq: introduce BLK_MQ_F_SCHED_USE_HW_TAG
  2017-05-03 20:13     ` Ming Lei
@ 2017-05-03 21:40       ` Omar Sandoval
  2017-05-04  2:01         ` Ming Lei
  0 siblings, 1 reply; 57+ messages in thread
From: Omar Sandoval @ 2017-05-03 21:40 UTC (permalink / raw)
  To: Ming Lei
  Cc: Ming Lei, Jens Axboe, linux-block, Christoph Hellwig, Omar Sandoval

On Thu, May 04, 2017 at 04:13:51AM +0800, Ming Lei wrote:
> On Thu, May 4, 2017 at 12:46 AM, Omar Sandoval <osandov@osandov.com> wrote:
> > On Fri, Apr 28, 2017 at 11:15:36PM +0800, Ming Lei wrote:
> >> When blk-mq I/O scheduler is used, we need two tags for
> >> submitting one request. One is called scheduler tag for
> >> allocating request and scheduling I/O, another one is called
> >> driver tag, which is used for dispatching IO to hardware/driver.
> >> This way introduces one extra per-queue allocation for both tags
> >> and request pool, and may not be as efficient as case of none
> >> scheduler.
> >>
> >> Also currently we put a default per-hctx limit on schedulable
> >> requests, and this limit may be a bottleneck for some devices,
> >> especialy when these devices have a quite big tag space.
> >>
> >> This patch introduces BLK_MQ_F_SCHED_USE_HW_TAG so that we can
> >> allow to use hardware/driver tags directly for IO scheduling if
> >> devices's hardware tag space is big enough. Then we can avoid
> >> the extra resource allocation and make IO submission more
> >> efficient.
> >>
> >> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> >> ---
> >>  block/blk-mq-sched.c   | 10 +++++++++-
> >>  block/blk-mq.c         | 35 +++++++++++++++++++++++++++++------
> >>  include/linux/blk-mq.h |  1 +
> >>  3 files changed, 39 insertions(+), 7 deletions(-)
> >
> > One more note on this: if we're using the hardware tags directly, then
> > we are no longer limited to q->nr_requests requests in-flight. Instead,
> > we're limited to the hw queue depth. We probably want to maintain the
> > original behavior,
> 
> That need further investigation, and generally scheduler should be happy with
> more requests which can be scheduled.
> 
> We can make it as one follow-up.

If we say nr_requests is 256, then we should honor that. So either
update nr_requests to reflect the actual depth we're using or resize the
hardware tags.
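
E.g. (rough sketch on top of patches 2-4 of this series, reusing the
auto_hw_tag bool and blk_mq_get_queue_depth()), the first option could be
as simple as this in blk_mq_init_sched():

	/* with hw tags we can have up to the hw queue depth in flight */
	if (auto_hw_tag)
		q->nr_requests = blk_mq_get_queue_depth(q);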

> > so I think we need to resize the hw tags in blk_mq_init_sched() if we're using hardware tags.
> 
> That might not be good since hw tags are used by both scheduler and dispatching.

What do you mean? If we have BLK_MQ_F_SCHED_USE_HW_TAG set, then they
are not used for dispatching, and of course we shouldn't resize the
hardware tags if we are using scheduler tags.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 1/4] blk-mq: introduce BLK_MQ_F_SCHED_USE_HW_TAG
  2017-05-03 21:40       ` Omar Sandoval
@ 2017-05-04  2:01         ` Ming Lei
  2017-05-04  2:13           ` Jens Axboe
  0 siblings, 1 reply; 57+ messages in thread
From: Ming Lei @ 2017-05-04  2:01 UTC (permalink / raw)
  To: Omar Sandoval
  Cc: Ming Lei, Jens Axboe, linux-block, Christoph Hellwig, Omar Sandoval

On Thu, May 4, 2017 at 5:40 AM, Omar Sandoval <osandov@osandov.com> wrote:
> On Thu, May 04, 2017 at 04:13:51AM +0800, Ming Lei wrote:
>> On Thu, May 4, 2017 at 12:46 AM, Omar Sandoval <osandov@osandov.com> wrote:
>> > On Fri, Apr 28, 2017 at 11:15:36PM +0800, Ming Lei wrote:
>> >> When blk-mq I/O scheduler is used, we need two tags for
>> >> submitting one request. One is called scheduler tag for
>> >> allocating request and scheduling I/O, another one is called
>> >> driver tag, which is used for dispatching IO to hardware/driver.
>> >> This way introduces one extra per-queue allocation for both tags
>> >> and request pool, and may not be as efficient as case of none
>> >> scheduler.
>> >>
>> >> Also currently we put a default per-hctx limit on schedulable
>> >> requests, and this limit may be a bottleneck for some devices,
>> >> especialy when these devices have a quite big tag space.
>> >>
>> >> This patch introduces BLK_MQ_F_SCHED_USE_HW_TAG so that we can
>> >> allow to use hardware/driver tags directly for IO scheduling if
>> >> devices's hardware tag space is big enough. Then we can avoid
>> >> the extra resource allocation and make IO submission more
>> >> efficient.
>> >>
>> >> Signed-off-by: Ming Lei <ming.lei@redhat.com>
>> >> ---
>> >>  block/blk-mq-sched.c   | 10 +++++++++-
>> >>  block/blk-mq.c         | 35 +++++++++++++++++++++++++++++------
>> >>  include/linux/blk-mq.h |  1 +
>> >>  3 files changed, 39 insertions(+), 7 deletions(-)
>> >
>> > One more note on this: if we're using the hardware tags directly, then
>> > we are no longer limited to q->nr_requests requests in-flight. Instead,
>> > we're limited to the hw queue depth. We probably want to maintain the
>> > original behavior,
>>
>> That need further investigation, and generally scheduler should be happy with
>> more requests which can be scheduled.
>>
>> We can make it as one follow-up.
>
> If we say nr_requests is 256, then we should honor that. So either
> update nr_requests to reflect the actual depth we're using or resize the
> hardware tags.

Firstly nr_requests is set as 256 from blk-mq inside instead of user
space, it won't be a big deal to violate that.

Secondly, when there is enough tags available, it might hurt
performance if we don't use them all.

>
>> > so I think we need to resize the hw tags in blk_mq_init_sched() if we're using hardware tags.
>>
>> That might not be good since hw tags are used by both scheduler and dispatching.
>
> What do you mean? If we have BLK_MQ_F_SCHED_USE_HW_TAG set, then they
> are not used for dispatching, and of course we shouldn't resize the
> hardware tags if we are using scheduler tags.

The tag is actually used for both scheduling and dispatching, so if you
resize the hw tags, the tag space for dispatching is resized (decreased) too.


Thanks,
Ming Lei

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 2/4] blk-mq: introduce blk_mq_get_queue_depth()
  2017-05-03 16:55   ` Omar Sandoval
@ 2017-05-04  2:10     ` Ming Lei
  0 siblings, 0 replies; 57+ messages in thread
From: Ming Lei @ 2017-05-04  2:10 UTC (permalink / raw)
  To: Omar Sandoval; +Cc: Jens Axboe, linux-block, Christoph Hellwig, Omar Sandoval

On Wed, May 03, 2017 at 09:55:30AM -0700, Omar Sandoval wrote:
> On Fri, Apr 28, 2017 at 11:15:37PM +0800, Ming Lei wrote:
> > The hardware queue depth can be resized via blk_mq_update_nr_requests(),
> > so introduce this helper for retrieving queue's depth easily.
> 
> One nit below. If the per-hardware queue tag space situation changes, we
> can revisit this and the similar thing in Kyber.

OK, will add comment about this situation in V3. 

> 
> Reviewed-by: Omar Sandoval <osandov@fb.com>
> 
> > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> > ---
> >  block/blk-mq.c | 12 ++++++++++++
> >  block/blk-mq.h |  1 +
> >  2 files changed, 13 insertions(+)
> > 
> > diff --git a/block/blk-mq.c b/block/blk-mq.c
> > index e530bc54f0d9..04761fb76ab4 100644
> > --- a/block/blk-mq.c
> > +++ b/block/blk-mq.c
> > @@ -2637,6 +2637,18 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
> >  }
> >  EXPORT_SYMBOL(blk_mq_free_tag_set);
> >  
> > +/*
> > + * Queue depth can be changed via blk_mq_update_nr_requests(),
> > + * so use this helper to retrieve queue's depth.
> > + */
> > +int blk_mq_get_queue_depth(struct request_queue *q)
> > +{
> > +	/* All queues have same queue depth */
> > +	struct blk_mq_tags	*tags = q->tag_set->tags[0];
> 
> Not sure what's going on with the spacing here. Tab instead of a space?

Either one should be fine, and I am sure this patch is warning/error
free when checking with ./scripts/checkpatch.pl, :-)

Thanks,
Ming

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 1/4] blk-mq: introduce BLK_MQ_F_SCHED_USE_HW_TAG
  2017-05-04  2:01         ` Ming Lei
@ 2017-05-04  2:13           ` Jens Axboe
  2017-05-04  2:51             ` Ming Lei
  0 siblings, 1 reply; 57+ messages in thread
From: Jens Axboe @ 2017-05-04  2:13 UTC (permalink / raw)
  To: Ming Lei, Omar Sandoval
  Cc: Ming Lei, linux-block, Christoph Hellwig, Omar Sandoval

On 05/03/2017 08:01 PM, Ming Lei wrote:
> On Thu, May 4, 2017 at 5:40 AM, Omar Sandoval <osandov@osandov.com> wrote:
>> On Thu, May 04, 2017 at 04:13:51AM +0800, Ming Lei wrote:
>>> On Thu, May 4, 2017 at 12:46 AM, Omar Sandoval <osandov@osandov.com> wrote:
>>>> On Fri, Apr 28, 2017 at 11:15:36PM +0800, Ming Lei wrote:
>>>>> When blk-mq I/O scheduler is used, we need two tags for
>>>>> submitting one request. One is called scheduler tag for
>>>>> allocating request and scheduling I/O, another one is called
>>>>> driver tag, which is used for dispatching IO to hardware/driver.
>>>>> This way introduces one extra per-queue allocation for both tags
>>>>> and request pool, and may not be as efficient as case of none
>>>>> scheduler.
>>>>>
>>>>> Also currently we put a default per-hctx limit on schedulable
>>>>> requests, and this limit may be a bottleneck for some devices,
>>>>> especialy when these devices have a quite big tag space.
>>>>>
>>>>> This patch introduces BLK_MQ_F_SCHED_USE_HW_TAG so that we can
>>>>> allow to use hardware/driver tags directly for IO scheduling if
>>>>> devices's hardware tag space is big enough. Then we can avoid
>>>>> the extra resource allocation and make IO submission more
>>>>> efficient.
>>>>>
>>>>> Signed-off-by: Ming Lei <ming.lei@redhat.com>
>>>>> ---
>>>>>  block/blk-mq-sched.c   | 10 +++++++++-
>>>>>  block/blk-mq.c         | 35 +++++++++++++++++++++++++++++------
>>>>>  include/linux/blk-mq.h |  1 +
>>>>>  3 files changed, 39 insertions(+), 7 deletions(-)
>>>>
>>>> One more note on this: if we're using the hardware tags directly, then
>>>> we are no longer limited to q->nr_requests requests in-flight. Instead,
>>>> we're limited to the hw queue depth. We probably want to maintain the
>>>> original behavior,
>>>
>>> That need further investigation, and generally scheduler should be happy with
>>> more requests which can be scheduled.
>>>
>>> We can make it as one follow-up.
>>
>> If we say nr_requests is 256, then we should honor that. So either
>> update nr_requests to reflect the actual depth we're using or resize the
>> hardware tags.
> 
> Firstly nr_requests is set as 256 from blk-mq inside instead of user
> space, it won't be a big deal to violate that.

The legacy scheduling layer used 2*128 by default, that's why I used the
"magic" 256 internally. FWIW, I agree with Omar here. If it's set to
256, we must honor that. Users will tweak this value down to trade peak
performance for latency, it's important that it does what it advertises.

> Secondly, when there is enough tags available, it might hurt
> performance if we don't use them all.

That's mostly bogus. Crazy large tag depths have only one use case -
synthetic peak performance benchmarks from manufacturers. We don't want
to allow really deep queues. Nothing good comes from that, just a lot of
pain and latency issues.

The most important part is actually that the scheduler has a higher
depth than the device, as mentioned in an email from a few days ago. We
need to be able to actually schedule IO to the device, we can't do that
if we always deplete the scheduler queue by letting the device drain it.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 1/4] blk-mq: introduce BLK_MQ_F_SCHED_USE_HW_TAG
  2017-05-04  2:13           ` Jens Axboe
@ 2017-05-04  2:51             ` Ming Lei
  2017-05-04 14:06               ` Jens Axboe
  0 siblings, 1 reply; 57+ messages in thread
From: Ming Lei @ 2017-05-04  2:51 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Ming Lei, Omar Sandoval, linux-block, Christoph Hellwig, Omar Sandoval

On Wed, May 03, 2017 at 08:13:03PM -0600, Jens Axboe wrote:
> On 05/03/2017 08:01 PM, Ming Lei wrote:
> > On Thu, May 4, 2017 at 5:40 AM, Omar Sandoval <osandov@osandov.com> wrote:
> >> On Thu, May 04, 2017 at 04:13:51AM +0800, Ming Lei wrote:
> >>> On Thu, May 4, 2017 at 12:46 AM, Omar Sandoval <osandov@osandov.com> wrote:
> >>>> On Fri, Apr 28, 2017 at 11:15:36PM +0800, Ming Lei wrote:
> >>>>> When blk-mq I/O scheduler is used, we need two tags for
> >>>>> submitting one request. One is called scheduler tag for
> >>>>> allocating request and scheduling I/O, another one is called
> >>>>> driver tag, which is used for dispatching IO to hardware/driver.
> >>>>> This way introduces one extra per-queue allocation for both tags
> >>>>> and request pool, and may not be as efficient as case of none
> >>>>> scheduler.
> >>>>>
> >>>>> Also currently we put a default per-hctx limit on schedulable
> >>>>> requests, and this limit may be a bottleneck for some devices,
> >>>>> especialy when these devices have a quite big tag space.
> >>>>>
> >>>>> This patch introduces BLK_MQ_F_SCHED_USE_HW_TAG so that we can
> >>>>> allow to use hardware/driver tags directly for IO scheduling if
> >>>>> devices's hardware tag space is big enough. Then we can avoid
> >>>>> the extra resource allocation and make IO submission more
> >>>>> efficient.
> >>>>>
> >>>>> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> >>>>> ---
> >>>>>  block/blk-mq-sched.c   | 10 +++++++++-
> >>>>>  block/blk-mq.c         | 35 +++++++++++++++++++++++++++++------
> >>>>>  include/linux/blk-mq.h |  1 +
> >>>>>  3 files changed, 39 insertions(+), 7 deletions(-)
> >>>>
> >>>> One more note on this: if we're using the hardware tags directly, then
> >>>> we are no longer limited to q->nr_requests requests in-flight. Instead,
> >>>> we're limited to the hw queue depth. We probably want to maintain the
> >>>> original behavior,
> >>>
> >>> That need further investigation, and generally scheduler should be happy with
> >>> more requests which can be scheduled.
> >>>
> >>> We can make it as one follow-up.
> >>
> >> If we say nr_requests is 256, then we should honor that. So either
> >> update nr_requests to reflect the actual depth we're using or resize the
> >> hardware tags.
> > 
> > Firstly nr_requests is set as 256 from blk-mq inside instead of user
> > space, it won't be a big deal to violate that.
> 
> The legacy scheduling layer used 2*128 by default, that's why I used the
> "magic" 256 internally. FWIW, I agree with Omar here. If it's set to
> 256, we must honor that. Users will tweak this value down to trade peak
> performance for latency, it's important that it does what it advertises.

In case of scheduling with hw tags, we share tags between the scheduler and
dispatching, so if we resize (only decrease, actually) the tags, the
dispatching space (hw tags) is decreased too. That means the actually
usable device tag space needs to shrink a lot.

> 
> > Secondly, when there is enough tags available, it might hurt
> > performance if we don't use them all.
> 
> That's mostly bogus. Crazy large tag depths have only one use case -
> synthetic peak performance benchmarks from manufacturers. We don't want
> to allow really deep queues. Nothing good comes from that, just a lot of
> pain and latency issues.

Given device provides so high queue depth, it might be reasonable to just
allow to use them up. For example of NVMe, once mq scheduler is enabled,
the actual size of device tag space is just 256 at default, even though
the hardware provides very big tag space(>= 10K).

The problem is that lifetime of sched tag is same with request's
lifetime(from submission to completion), and it covers lifetime of
device tag.  In theory sched tag should have been freed just after
the rq is dispatched to driver. Unfortunately we can't do that because
request is allocated from sched tag set.

> 
> The most important part is actually that the scheduler has a higher
> depth than the device, as mentioned in an email from a few days ago. We

I agree with this point, but:

Unfortunately, in case of NVMe or other high-depth devices, the default
scheduler queue depth (256) is much less than the device depth, so do we need
to adjust the default value for these devices? In theory, the default 256
scheduler depth may hurt performance on these devices since the device
tag space is much under-utilized.


Thanks,
Ming

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 1/4] blk-mq: introduce BLK_MQ_F_SCHED_USE_HW_TAG
  2017-05-04  2:51             ` Ming Lei
@ 2017-05-04 14:06               ` Jens Axboe
  2017-05-05 22:54                   ` Ming Lei
  2017-05-10  7:25                 ` Ming Lei
  0 siblings, 2 replies; 57+ messages in thread
From: Jens Axboe @ 2017-05-04 14:06 UTC (permalink / raw)
  To: Ming Lei
  Cc: Ming Lei, Omar Sandoval, linux-block, Christoph Hellwig, Omar Sandoval

On 05/03/2017 08:51 PM, Ming Lei wrote:
> On Wed, May 03, 2017 at 08:13:03PM -0600, Jens Axboe wrote:
>> On 05/03/2017 08:01 PM, Ming Lei wrote:
>>> On Thu, May 4, 2017 at 5:40 AM, Omar Sandoval <osandov@osandov.com> wrote:
>>>> On Thu, May 04, 2017 at 04:13:51AM +0800, Ming Lei wrote:
>>>>> On Thu, May 4, 2017 at 12:46 AM, Omar Sandoval <osandov@osandov.com> wrote:
>>>>>> On Fri, Apr 28, 2017 at 11:15:36PM +0800, Ming Lei wrote:
>>>>>>> When blk-mq I/O scheduler is used, we need two tags for
>>>>>>> submitting one request. One is called scheduler tag for
>>>>>>> allocating request and scheduling I/O, another one is called
>>>>>>> driver tag, which is used for dispatching IO to hardware/driver.
>>>>>>> This way introduces one extra per-queue allocation for both tags
>>>>>>> and request pool, and may not be as efficient as case of none
>>>>>>> scheduler.
>>>>>>>
>>>>>>> Also currently we put a default per-hctx limit on schedulable
>>>>>>> requests, and this limit may be a bottleneck for some devices,
>>>>>>> especialy when these devices have a quite big tag space.
>>>>>>>
>>>>>>> This patch introduces BLK_MQ_F_SCHED_USE_HW_TAG so that we can
>>>>>>> allow to use hardware/driver tags directly for IO scheduling if
>>>>>>> devices's hardware tag space is big enough. Then we can avoid
>>>>>>> the extra resource allocation and make IO submission more
>>>>>>> efficient.
>>>>>>>
>>>>>>> Signed-off-by: Ming Lei <ming.lei@redhat.com>
>>>>>>> ---
>>>>>>>  block/blk-mq-sched.c   | 10 +++++++++-
>>>>>>>  block/blk-mq.c         | 35 +++++++++++++++++++++++++++++------
>>>>>>>  include/linux/blk-mq.h |  1 +
>>>>>>>  3 files changed, 39 insertions(+), 7 deletions(-)
>>>>>>
>>>>>> One more note on this: if we're using the hardware tags directly, then
>>>>>> we are no longer limited to q->nr_requests requests in-flight. Instead,
>>>>>> we're limited to the hw queue depth. We probably want to maintain the
>>>>>> original behavior,
>>>>>
>>>>> That need further investigation, and generally scheduler should be happy with
>>>>> more requests which can be scheduled.
>>>>>
>>>>> We can make it as one follow-up.
>>>>
>>>> If we say nr_requests is 256, then we should honor that. So either
>>>> update nr_requests to reflect the actual depth we're using or resize the
>>>> hardware tags.
>>>
>>> Firstly nr_requests is set as 256 from blk-mq inside instead of user
>>> space, it won't be a big deal to violate that.
>>
>> The legacy scheduling layer used 2*128 by default, that's why I used the
>> "magic" 256 internally. FWIW, I agree with Omar here. If it's set to
>> 256, we must honor that. Users will tweak this value down to trade peak
>> performance for latency, it's important that it does what it advertises.
> 
> In case of scheduling with hw tags, we share tags between scheduler and
> dispatching, if we resize(only decrease actually) the tags, dispatching
> space(hw tags) is decreased too. That means the actual usable device tag
> space need to be decreased much.

I think the solution here is to handle it differently. Previous, we had
requests and tags independent. That meant that we could have an
independent set of requests for scheduling, then assign tags as we need
to dispatch them to hardware. This is how the old schedulers worked, and
with the scheduler tags, this is how the new blk-mq scheduling works as
well.

Once you start treating them as one space again, we run into this issue.
I can think of two solutions:

1) Keep our current split, so we can schedule independently of hardware
   tags.

2) Throttle the queue depth independently. If the user asks for a depth
   of, eg, 32, retain a larger set of requests but limit the queue depth
   on the device side to 32.

This is much easier to support with split hardware and scheduler tags...
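
For 2), very roughly (pure sketch, the blk_mq_cap_hw_depth() helper below
does not exist, it's only there to illustrate the idea):

	/*
	 * Keep the larger pool of schedulable requests, but never allow
	 * more than q->nr_requests of them on the device at any time.
	 */
	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_cap_hw_depth(hctx, q->nr_requests);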

>>> Secondly, when there is enough tags available, it might hurt
>>> performance if we don't use them all.
>>
>> That's mostly bogus. Crazy large tag depths have only one use case -
>> synthetic peak performance benchmarks from manufacturers. We don't want
>> to allow really deep queues. Nothing good comes from that, just a lot of
>> pain and latency issues.
> 
> Given device provides so high queue depth, it might be reasonable to just
> allow to use them up. For example of NVMe, once mq scheduler is enabled,
> the actual size of device tag space is just 256 at default, even though
> the hardware provides very big tag space(>= 10K).

Correct.

> The problem is that lifetime of sched tag is same with request's
> lifetime(from submission to completion), and it covers lifetime of
> device tag.  In theory sched tag should have been freed just after
> the rq is dispatched to driver. Unfortunately we can't do that because
> request is allocated from sched tag set.

Yep

>> The most important part is actually that the scheduler has a higher
>> depth than the device, as mentioned in an email from a few days ago. We
> 
> I agree this point, but:
> 
> Unfortunately in case of NVMe or other high depth devices, the default
> scheduler queue depth(256) is much less than device depth, do we need to
> adjust the default value for this devices? In theory, the default 256
> scheduler depth may hurt performance on this devices since the device
> tag space is much under-utilized.

No we do not. 256 is a LOT. I realize most of the devices expose 64K *
num_hw_queues of depth. Expecting to utilize all that is insane.
Internally, these devices have nowhere near that amount of parallelism.
Hence we'd go well beyond the latency knee in the curve if we just allow
tons of writeback to queue up, for example. Reaching peak performance on
these devices do not require more than 256 requests, in fact it can be
done much sooner. For a default setting, I'd actually argue that 256 is
too much, and that we should set it lower.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 1/4] blk-mq: introduce BLK_MQ_F_SCHED_USE_HW_TAG
  2017-05-04 14:06               ` Jens Axboe
@ 2017-05-05 22:54                   ` Ming Lei
  2017-05-10  7:25                 ` Ming Lei
  1 sibling, 0 replies; 57+ messages in thread
From: Ming Lei @ 2017-05-05 22:54 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Ming Lei, Omar Sandoval, linux-block, Christoph Hellwig,
	Omar Sandoval, linux-nvme, linux-scsi, James E.J. Bottomley,
	Martin K. Petersen, Keith Busch, Sagi Grimberg

On Thu, May 04, 2017 at 08:06:15AM -0600, Jens Axboe wrote:
> On 05/03/2017 08:51 PM, Ming Lei wrote:
> > On Wed, May 03, 2017 at 08:13:03PM -0600, Jens Axboe wrote:
> >> On 05/03/2017 08:01 PM, Ming Lei wrote:
> >>> On Thu, May 4, 2017 at 5:40 AM, Omar Sandoval <osandov@osandov.com> wrote:
> >>>> On Thu, May 04, 2017 at 04:13:51AM +0800, Ming Lei wrote:
> >>>>> On Thu, May 4, 2017 at 12:46 AM, Omar Sandoval <osandov@osandov.com> wrote:
> >>>>>> On Fri, Apr 28, 2017 at 11:15:36PM +0800, Ming Lei wrote:
> >>>>>>> When blk-mq I/O scheduler is used, we need two tags for
> >>>>>>> submitting one request. One is called scheduler tag for
> >>>>>>> allocating request and scheduling I/O, another one is called
> >>>>>>> driver tag, which is used for dispatching IO to hardware/driver.
> >>>>>>> This way introduces one extra per-queue allocation for both tags
> >>>>>>> and request pool, and may not be as efficient as case of none
> >>>>>>> scheduler.
> >>>>>>>
> >>>>>>> Also currently we put a default per-hctx limit on schedulable
> >>>>>>> requests, and this limit may be a bottleneck for some devices,
> >>>>>>> especialy when these devices have a quite big tag space.
> >>>>>>>
> >>>>>>> This patch introduces BLK_MQ_F_SCHED_USE_HW_TAG so that we can
> >>>>>>> allow to use hardware/driver tags directly for IO scheduling if
> >>>>>>> devices's hardware tag space is big enough. Then we can avoid
> >>>>>>> the extra resource allocation and make IO submission more
> >>>>>>> efficient.
> >>>>>>>
> >>>>>>> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> >>>>>>> ---
> >>>>>>>  block/blk-mq-sched.c   | 10 +++++++++-
> >>>>>>>  block/blk-mq.c         | 35 +++++++++++++++++++++++++++++------
> >>>>>>>  include/linux/blk-mq.h |  1 +
> >>>>>>>  3 files changed, 39 insertions(+), 7 deletions(-)
> >>>>>>
> >>>>>> One more note on this: if we're using the hardware tags directly, then
> >>>>>> we are no longer limited to q->nr_requests requests in-flight. Instead,
> >>>>>> we're limited to the hw queue depth. We probably want to maintain the
> >>>>>> original behavior,
> >>>>>
> >>>>> That need further investigation, and generally scheduler should be happy with
> >>>>> more requests which can be scheduled.
> >>>>>
> >>>>> We can make it as one follow-up.
> >>>>
> >>>> If we say nr_requests is 256, then we should honor that. So either
> >>>> update nr_requests to reflect the actual depth we're using or resize the
> >>>> hardware tags.
> >>>
> >>> Firstly nr_requests is set as 256 from blk-mq inside instead of user
> >>> space, it won't be a big deal to violate that.
> >>
> >> The legacy scheduling layer used 2*128 by default, that's why I used the
> >> "magic" 256 internally. FWIW, I agree with Omar here. If it's set to
> >> 256, we must honor that. Users will tweak this value down to trade peak
> >> performance for latency, it's important that it does what it advertises.
> > 
> > In case of scheduling with hw tags, we share tags between scheduler and
> > dispatching, if we resize(only decrease actually) the tags, dispatching
> > space(hw tags) is decreased too. That means the actual usable device tag
> > space need to be decreased much.
> 
> I think the solution here is to handle it differently. Previous, we had
> requests and tags independent. That meant that we could have an
> independent set of requests for scheduling, then assign tags as we need
> to dispatch them to hardware. This is how the old schedulers worked, and
> with the scheduler tags, this is how the new blk-mq scheduling works as
> well.
> 
> Once you start treating them as one space again, we run into this issue.
> I can think of two solutions:
> 
> 1) Keep our current split, so we can schedule independently of hardware
>    tags.

Actually the hw tag depends on the scheduler tag, as I explained, so the two
aren't independent, even though they look split.

Also I am not sure how we would do that if we need to support scheduling with
hw tags; could you explain it a bit?

> 
> 2) Throttle the queue depth independently. If the user asks for a depth
>    of, eg, 32, retain a larger set of requests but limit the queue depth
>    on the device side to 32.

If I understand correctly, we can support scheduling with hw tags via this
patchset plus setting q->nr_requests to the size of the hw tag space.
Otherwise it isn't easy to throttle the queue depth independently, because
the hw tag actually depends on the scheduler tag.

The 3rd option is to follow Omar's suggestion and simply resize the queue
depth to q->nr_requests.
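
For the 3rd one I mean something like this (sketch, assuming
blk_mq_tag_update_depth() keeps its current hctx/tags/depth/can_grow form):

	/* shrink the shared hw tag space down to what the user asked for */
	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_tag_update_depth(hctx, &hctx->tags, q->nr_requests, false);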

> 
> This is much easier to support with split hardware and scheduler tags...
> 
> >>> Secondly, when there is enough tags available, it might hurt
> >>> performance if we don't use them all.
> >>
> >> That's mostly bogus. Crazy large tag depths have only one use case -
> >> synthetic peak performance benchmarks from manufacturers. We don't want
> >> to allow really deep queues. Nothing good comes from that, just a lot of
> >> pain and latency issues.
> > 
> > Given device provides so high queue depth, it might be reasonable to just
> > allow to use them up. For example of NVMe, once mq scheduler is enabled,
> > the actual size of device tag space is just 256 at default, even though
> > the hardware provides very big tag space(>= 10K).
> 
> Correct.
> 
> > The problem is that lifetime of sched tag is same with request's
> > lifetime(from submission to completion), and it covers lifetime of
> > device tag.  In theory sched tag should have been freed just after
> > the rq is dispatched to driver. Unfortunately we can't do that because
> > request is allocated from sched tag set.
> 
> Yep

Request copying, like what the first post of your mq scheduler patches did,
may fix this issue, and in that way we can make the two tag spaces
independent, but with extra cost. What do you think about this approach?

> 
> >> The most important part is actually that the scheduler has a higher
> >> depth than the device, as mentioned in an email from a few days ago. We
> > 
> > I agree this point, but:
> > 
> > Unfortunately in case of NVMe or other high depth devices, the default
> > scheduler queue depth(256) is much less than device depth, do we need to
> > adjust the default value for this devices? In theory, the default 256
> > scheduler depth may hurt performance on this devices since the device
> > tag space is much under-utilized.
> 
> No we do not. 256 is a LOT. I realize most of the devices expose 64K *
> num_hw_queues of depth. Expecting to utilize all that is insane.
> Internally, these devices have nowhere near that amount of parallelism.
> Hence we'd go well beyond the latency knee in the curve if we just allow
> tons of writeback to queue up, for example. Reaching peak performance on
> these devices do not require more than 256 requests, in fact it can be
> done much sooner. For a default setting, I'd actually argue that 256 is
> too much, and that we should set it lower.

Then the question is: why does the storage hardware/industry introduce
such a big tag space in modern storage hardware? Definitely tag space isn't
free from the hardware's point of view.

IMO, a big tag space may improve latency a bit under heavy I/O load, and my
simple test has shown the improvement.

Is there anyone who knows the design behind the big tag space/hw queue
depth? Cc'ing the NVMe and SCSI lists, since there may be more storage
hw experts there.

IMO, if this design is just for meeting some very special performance
requirements, the block layer should have allowed users to make full use
of the big tag space.


Thanks,
Ming

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [PATCH 1/4] blk-mq: introduce BLK_MQ_F_SCHED_USE_HW_TAG
  2017-05-05 22:54                   ` Ming Lei
@ 2017-05-05 23:33                     ` Ming Lei
  -1 siblings, 0 replies; 57+ messages in thread
From: Ming Lei @ 2017-05-05 23:33 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Ming Lei, Omar Sandoval, linux-block, Christoph Hellwig,
	Omar Sandoval, linux-nvme, linux-scsi, James E.J. Bottomley,
	Martin K. Petersen, Keith Busch, Sagi Grimberg

On Sat, May 06, 2017 at 06:54:09AM +0800, Ming Lei wrote:
> On Thu, May 04, 2017 at 08:06:15AM -0600, Jens Axboe wrote:
> > On 05/03/2017 08:51 PM, Ming Lei wrote:
> > > On Wed, May 03, 2017 at 08:13:03PM -0600, Jens Axboe wrote:
> > >> On 05/03/2017 08:01 PM, Ming Lei wrote:
> > >>> On Thu, May 4, 2017 at 5:40 AM, Omar Sandoval <osandov@osandov.com> wrote:
> > >>>> On Thu, May 04, 2017 at 04:13:51AM +0800, Ming Lei wrote:
> > >>>>> On Thu, May 4, 2017 at 12:46 AM, Omar Sandoval <osandov@osandov.com> wrote:
> > >>>>>> On Fri, Apr 28, 2017 at 11:15:36PM +0800, Ming Lei wrote:
> > >>>>>>> When blk-mq I/O scheduler is used, we need two tags for
> > >>>>>>> submitting one request. One is called scheduler tag for
> > >>>>>>> allocating request and scheduling I/O, another one is called
> > >>>>>>> driver tag, which is used for dispatching IO to hardware/driver.
> > >>>>>>> This way introduces one extra per-queue allocation for both tags
> > >>>>>>> and request pool, and may not be as efficient as case of none
> > >>>>>>> scheduler.
> > >>>>>>>
> > >>>>>>> Also currently we put a default per-hctx limit on schedulable
> > >>>>>>> requests, and this limit may be a bottleneck for some devices,
> > >>>>>>> especialy when these devices have a quite big tag space.
> > >>>>>>>
> > >>>>>>> This patch introduces BLK_MQ_F_SCHED_USE_HW_TAG so that we can
> > >>>>>>> allow to use hardware/driver tags directly for IO scheduling if
> > >>>>>>> devices's hardware tag space is big enough. Then we can avoid
> > >>>>>>> the extra resource allocation and make IO submission more
> > >>>>>>> efficient.
> > >>>>>>>
> > >>>>>>> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> > >>>>>>> ---
> > >>>>>>>  block/blk-mq-sched.c   | 10 +++++++++-
> > >>>>>>>  block/blk-mq.c         | 35 +++++++++++++++++++++++++++++------
> > >>>>>>>  include/linux/blk-mq.h |  1 +
> > >>>>>>>  3 files changed, 39 insertions(+), 7 deletions(-)
> > >>>>>>
> > >>>>>> One more note on this: if we're using the hardware tags directly, then
> > >>>>>> we are no longer limited to q->nr_requests requests in-flight. Instead,
> > >>>>>> we're limited to the hw queue depth. We probably want to maintain the
> > >>>>>> original behavior,
> > >>>>>
> > >>>>> That need further investigation, and generally scheduler should be happy with
> > >>>>> more requests which can be scheduled.
> > >>>>>
> > >>>>> We can make it as one follow-up.
> > >>>>
> > >>>> If we say nr_requests is 256, then we should honor that. So either
> > >>>> update nr_requests to reflect the actual depth we're using or resize the
> > >>>> hardware tags.
> > >>>
> > >>> Firstly nr_requests is set as 256 from blk-mq inside instead of user
> > >>> space, it won't be a big deal to violate that.
> > >>
> > >> The legacy scheduling layer used 2*128 by default, that's why I used the
> > >> "magic" 256 internally. FWIW, I agree with Omar here. If it's set to
> > >> 256, we must honor that. Users will tweak this value down to trade peak
> > >> performance for latency, it's important that it does what it advertises.
> > > 
> > > In case of scheduling with hw tags, we share tags between scheduler and
> > > dispatching, if we resize(only decrease actually) the tags, dispatching
> > > space(hw tags) is decreased too. That means the actual usable device tag
> > > space need to be decreased much.
> > 
> > I think the solution here is to handle it differently. Previous, we had
> > requests and tags independent. That meant that we could have an
> > independent set of requests for scheduling, then assign tags as we need
> > to dispatch them to hardware. This is how the old schedulers worked, and
> > with the scheduler tags, this is how the new blk-mq scheduling works as
> > well.
> > 
> > Once you start treating them as one space again, we run into this issue.
> > I can think of two solutions:
> > 
> > 1) Keep our current split, so we can schedule independently of hardware
> >    tags.
> 
> Actually hw tag depends on scheduler tag as I explained, so both aren't
> independently, and even they looks split.
> 
> Also I am not sure how we do that if we need to support scheduling with
> hw tag, could you explain it a bit?
> 
> > 
> > 2) Throttle the queue depth independently. If the user asks for a depth
> >    of, eg, 32, retain a larger set of requests but limit the queue depth
> >    on the device side to 32.
> 
> If I understand correctly, we can support scheduling with hw tag by this
> patchset plus setting q->nr_requests as size of hw tag space. Otherwise
> it isn't easy to throttle the queue depth independently because hw tag
> actually depends on scheduler tag.
> 
> The 3rd one is to follow Omar's suggestion, by resizing the queue depth
> as q->nr_requests simply.
> 
> > 
> > This is much easier to support with split hardware and scheduler tags...
> > 
> > >>> Secondly, when there is enough tags available, it might hurt
> > >>> performance if we don't use them all.
> > >>
> > >> That's mostly bogus. Crazy large tag depths have only one use case -
> > >> synthetic peak performance benchmarks from manufacturers. We don't want
> > >> to allow really deep queues. Nothing good comes from that, just a lot of
> > >> pain and latency issues.
> > > 
> > > Given device provides so high queue depth, it might be reasonable to just
> > > allow to use them up. For example of NVMe, once mq scheduler is enabled,
> > > the actual size of device tag space is just 256 at default, even though
> > > the hardware provides very big tag space(>= 10K).
> > 
> > Correct.
> > 
> > > The problem is that lifetime of sched tag is same with request's
> > > lifetime(from submission to completion), and it covers lifetime of
> > > device tag.  In theory sched tag should have been freed just after
> > > the rq is dispatched to driver. Unfortunately we can't do that because
> > > request is allocated from sched tag set.
> > 
> > Yep
> 
> Request copying like what your first post of mq scheduler patch did
> may fix this issue, and in that way we can make both two tag space
> independent, but with extra cost. What do you think about this approach?

Or something like the following (I am on a trip now, and it is totally
un-tested)?

---

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 8f72f16b498a..ce6bb849a395 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -855,6 +855,17 @@ static inline unsigned int queued_to_index(unsigned int queued)
 	return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
 }
 
+static void blk_mq_replace_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
+{
+	int sched_tag = rq->internal_tag;
+	struct request *rq_for_replacement = hctx->tags->static_rqs[rq->tag];
+
+	hctx->sched_tags->static_rqs[rq->internal_tag] = rq_for_replacement;
+
+	blk_mq_put_tag(hctx, hctx->sched_tags, rq->mq_ctx, rq->internal_tag);
+	rq->internal_tag = -1;
+}
+
 bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
 			   bool wait)
 {
@@ -878,7 +889,9 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
 			rq->rq_flags |= RQF_MQ_INFLIGHT;
 			atomic_inc(&data.hctx->nr_active);
 		}
 		data.hctx->tags->rqs[rq->tag] = rq;
+		blk_mq_replace_rq(data.hctx, rq);
 	}
 
 done:
@@ -887,38 +900,6 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
 	return rq->tag != -1;
 }
 
-static void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
-				    struct request *rq)
-{
-	blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag);
-	rq->tag = -1;
-
-	if (rq->rq_flags & RQF_MQ_INFLIGHT) {
-		rq->rq_flags &= ~RQF_MQ_INFLIGHT;
-		atomic_dec(&hctx->nr_active);
-	}
-}
-
-static void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx,
-				       struct request *rq)
-{
-	if (rq->tag == -1 || rq->internal_tag == -1)
-		return;
-
-	__blk_mq_put_driver_tag(hctx, rq);
-}
-
-static void blk_mq_put_driver_tag(struct request *rq)
-{
-	struct blk_mq_hw_ctx *hctx;
-
-	if (rq->tag == -1 || rq->internal_tag == -1)
-		return;
-
-	hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu);
-	__blk_mq_put_driver_tag(hctx, rq);
-}
-
 /*
  * If we fail getting a driver tag because all the driver tags are already
  * assigned and on the dispatch list, BUT the first entry does not have a
@@ -1041,7 +1022,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
 			queued++;
 			break;
 		case BLK_MQ_RQ_QUEUE_BUSY:
-			blk_mq_put_driver_tag_hctx(hctx, rq);
 			list_add(&rq->queuelist, list);
 			__blk_mq_requeue_request(rq);
 			break;
@@ -1069,7 +1049,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
 		 * tag for the next request already, free it again.
 		 */
 		rq = list_first_entry(list, struct request, queuelist);
-		blk_mq_put_driver_tag(rq);
 
 		spin_lock(&hctx->lock);
 		list_splice_init(list, &hctx->dispatch);
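
For clarity, here is the new blk_mq_replace_rq() from the diff above,
restated with comments describing its apparent intent. This is only a
reading of the untested diff, not verified behavior:

static void blk_mq_replace_rq(struct blk_mq_hw_ctx *hctx, struct request *rq)
{
	int sched_tag = rq->internal_tag;

	/*
	 * 'rq' was allocated from the sched_tags static pool but now owns
	 * driver tag rq->tag.  Park the driver tag's own static request in
	 * the sched_tags slot being vacated, so that slot still points at a
	 * valid request for the next scheduler-tag allocation.
	 */
	hctx->sched_tags->static_rqs[sched_tag] = hctx->tags->static_rqs[rq->tag];

	/*
	 * The sched tag no longer has to live until completion: release it
	 * here, so only the driver tag pins tag space while the request is
	 * in flight.
	 */
	blk_mq_put_tag(hctx, hctx->sched_tags, rq->mq_ctx, sched_tag);
	rq->internal_tag = -1;
}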

Thanks,
Ming

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [PATCH 1/4] blk-mq: introduce BLK_MQ_F_SCHED_USE_HW_TAG
  2017-05-04 14:06               ` Jens Axboe
  2017-05-05 22:54                   ` Ming Lei
@ 2017-05-10  7:25                 ` Ming Lei
  1 sibling, 0 replies; 57+ messages in thread
From: Ming Lei @ 2017-05-10  7:25 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Ming Lei, Omar Sandoval, linux-block, Christoph Hellwig, Omar Sandoval

Hi Jens,

On Thu, May 04, 2017 at 08:06:15AM -0600, Jens Axboe wrote:
...
> 
> No we do not. 256 is a LOT. I realize most of the devices expose 64K *
> num_hw_queues of depth. Expecting to utilize all that is insane.
> Internally, these devices have nowhere near that amount of parallelism.
> Hence we'd go well beyond the latency knee in the curve if we just allow
> tons of writeback to queue up, for example. Reaching peak performance on
> these devices do not require more than 256 requests, in fact it can be
> done much sooner. For a default setting, I'd actually argue that 256 is
> too much, and that we should set it lower.

After studying SSDs and NVMe a bit, I think your point that '256 is a LOT'
is correct:

1) Inside an SSD, the number of channels isn't big, often about 10 in a
high-end SSD, so 256 should be big enough to maximize the utilization of
each channel, even when multi-bank, multi-die or multi-plane layouts are
considered.

2) For NVMe, the IO queue depth (size) is at most 64K according to the
spec, and the driver currently limits it to at most 1024. The queue
itself is allocated entirely from system memory, so a big queue size
doesn't increase NVMe chip cost.

So I think we can respect .nr_requests by resizing the hw tags in
blk_mq_init_sched(), as suggested by Omar. If you don't object to that,
I will send out V3 soon.
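
A rough, untested sketch of that direction, for illustration only:
blk_mq_shrink_hw_tags() is a placeholder name for whatever tag-resizing
primitive V3 would actually call, not an existing helper.

static int blk_mq_sched_apply_nr_requests(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	int i, ret;

	queue_for_each_hw_ctx(q, hctx, i) {
		if (!(hctx->flags & BLK_MQ_F_SCHED_USE_HW_TAG))
			continue;

		/* shrink (never grow) the shared hw tag depth to nr_requests */
		ret = blk_mq_shrink_hw_tags(hctx, min_t(unsigned int,
				q->nr_requests, hctx->tags->nr_tags));
		if (ret)
			return ret;
	}
	return 0;
}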

Thanks,
Ming

^ permalink raw reply	[flat|nested] 57+ messages in thread

end of thread, other threads:[~2017-05-10  7:25 UTC | newest]

Thread overview: 57+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-04-28 15:15 [PATCH 0/4] blk-mq: support to use hw tag for scheduling Ming Lei
2017-04-28 15:15 ` [PATCH 1/4] blk-mq: introduce BLK_MQ_F_SCHED_USE_HW_TAG Ming Lei
2017-05-03 16:21   ` Omar Sandoval
2017-05-03 16:46   ` Omar Sandoval
2017-05-03 20:13     ` Ming Lei
2017-05-03 21:40       ` Omar Sandoval
2017-05-04  2:01         ` Ming Lei
2017-05-04  2:13           ` Jens Axboe
2017-05-04  2:51             ` Ming Lei
2017-05-04 14:06               ` Jens Axboe
2017-05-05 22:54                 ` Ming Lei
2017-05-05 22:54                   ` Ming Lei
2017-05-05 23:33                   ` Ming Lei
2017-05-10  7:25                 ` Ming Lei
2017-04-28 15:15 ` [PATCH 2/4] blk-mq: introduce blk_mq_get_queue_depth() Ming Lei
2017-04-28 18:23   ` Jens Axboe
2017-04-29  9:55     ` Ming Lei
2017-05-03 16:55   ` Omar Sandoval
2017-05-04  2:10     ` Ming Lei
2017-04-28 15:15 ` [PATCH 3/4] blk-mq: use hw tag for scheduling if hw tag space is big enough Ming Lei
2017-04-28 18:09   ` Bart Van Assche
2017-04-29 10:35     ` Ming Lei
2017-05-01 15:06       ` Bart Van Assche
2017-05-02  3:49         ` Omar Sandoval
2017-05-02  8:46         ` Ming Lei
2017-04-28 18:22   ` Jens Axboe
2017-04-28 20:11     ` Bart Van Assche
2017-04-29 10:59     ` Ming Lei
2017-05-03 16:29   ` Omar Sandoval
2017-05-03 16:55     ` Ming Lei
2017-05-03 17:00       ` Omar Sandoval
2017-05-03 17:33         ` Ming Lei
2017-04-28 15:15 ` [PATCH 4/4] blk-mq: dump new introduced flag of BLK_MQ_F_SCHED_USE_HW_TAG Ming Lei
2017-04-28 18:10   ` Bart Van Assche
2017-04-29 11:00     ` Ming Lei
2017-04-28 20:29 ` [PATCH 0/4] blk-mq: support to use hw tag for scheduling Jens Axboe
2017-05-03  4:03   ` Ming Lei
2017-05-03 14:08     ` Jens Axboe
2017-05-03 14:10       ` Jens Axboe
2017-05-03 15:03         ` Ming Lei
2017-05-03 15:08           ` Jens Axboe
2017-05-03 15:38             ` Ming Lei
2017-05-03 16:06               ` Omar Sandoval
2017-05-03 16:21                 ` Ming Lei
2017-05-03 16:52               ` Ming Lei
2017-05-03 17:03                 ` Ming Lei
2017-05-03 17:07                   ` Jens Axboe
2017-05-03 17:15                     ` Bart Van Assche
2017-05-03 17:24                       ` Jens Axboe
2017-05-03 17:35                         ` Bart Van Assche
2017-05-03 17:40                           ` Jens Axboe
2017-05-03 17:43                             ` Bart Van Assche
2017-05-03 17:08                 ` Bart Van Assche
2017-05-03 17:11                   ` Jens Axboe
2017-05-03 17:19                   ` Ming Lei
2017-05-03 17:41                     ` Bart Van Assche
