Linux-Block Archive on lore.kernel.org
 help / color / Atom feed
* [PATCH] blk-mq: Wait for hctx inflight requests on CPU unplug
@ 2019-05-17  9:14 Ming Lei
  2019-05-21  7:40 ` Christoph Hellwig
  2019-05-21 13:50 ` John Garry
  0 siblings, 2 replies; 11+ messages in thread
From: Ming Lei @ 2019-05-17  9:14 UTC (permalink / raw)
  To: Jens Axboe
  Cc: linux-block, Ming Lei, Christoph Hellwig, Bart Van Assche,
	Hannes Reinecke, John Garry, Keith Busch, Thomas Gleixner

Managed interrupts can not migrate affinity when their CPUs are offline.
If the CPU is allowed to shutdown before they're returned, commands
dispatched to managed queues won't be able to complete through their
irq handlers.

Wait in the cpu hotplug handler until all inflight requests on the tags
are completed or a timeout expires. Wait once per tag set, so we can
save time in the case of shared tags.

Based on the following patch from Keith, and use simple delay-spin
instead.

https://lore.kernel.org/linux-block/20190405215920.27085-1-keith.busch@intel.com/

Cc: Christoph Hellwig <hch@lst.de>
Cc: Bart Van Assche <bvanassche@acm.org>
Cc: Hannes Reinecke <hare@suse.com>
Cc: John Garry <john.garry@huawei.com>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-mq-tag.c |  2 +-
 block/blk-mq-tag.h |  5 ++++
 block/blk-mq.c     | 65 +++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 70 insertions(+), 2 deletions(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 7513c8eaabee..b24334f99c5d 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -332,7 +332,7 @@ static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt,
  *		true to continue iterating tags, false to stop.
  * @priv:	Will be passed as second argument to @fn.
  */
-static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
+void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
 		busy_tag_iter_fn *fn, void *priv)
 {
 	if (tags->nr_reserved_tags)
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 61deab0b5a5a..f8de50485b42 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -19,6 +19,9 @@ struct blk_mq_tags {
 	struct request **rqs;
 	struct request **static_rqs;
 	struct list_head page_list;
+
+	#define BLK_MQ_TAGS_DRAINED           0
+	unsigned long flags;
 };
 
 
@@ -35,6 +38,8 @@ extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
 extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
 void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
 		void *priv);
+void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
+		busy_tag_iter_fn *fn, void *priv);
 
 static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt,
 						 struct blk_mq_hw_ctx *hctx)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 08a6248d8536..d1d1b1a9628f 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2214,6 +2214,60 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 	return -ENOMEM;
 }
 
+static int blk_mq_hctx_notify_prepare(unsigned int cpu, struct hlist_node *node)
+{
+	struct blk_mq_hw_ctx	*hctx;
+	struct blk_mq_tags	*tags;
+
+	hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
+	tags = hctx->tags;
+
+	if (tags)
+		clear_bit(BLK_MQ_TAGS_DRAINED, &tags->flags);
+
+	return 0;
+}
+
+static bool blk_mq_count_inflight_rq(struct request *rq, void *data,
+				     bool reserved)
+{
+	if (blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
+		(*(unsigned long *)data)++;
+
+	return true;
+}
+
+static unsigned blk_mq_tags_inflight_rqs(struct blk_mq_tags *tags)
+{
+	unsigned long cnt = 0;
+
+	blk_mq_all_tag_busy_iter(tags, blk_mq_count_inflight_rq, &cnt);
+
+	return cnt;
+}
+
+static void blk_mq_drain_inflight_rqs(struct blk_mq_tags *tags, int dead_cpu)
+{
+	unsigned long msecs_left = 1000 * 5;
+
+	if (!tags)
+		return;
+
+	if (test_and_set_bit(BLK_MQ_TAGS_DRAINED, &tags->flags))
+		return;
+
+	while (msecs_left > 0) {
+		if (!blk_mq_tags_inflight_rqs(tags))
+			break;
+		msleep(5);
+		msecs_left -= 5;
+	}
+
+	if (msecs_left > 0)
+		printk(KERN_WARNING "requests not completed from dead "
+				"CPU %d\n", dead_cpu);
+}
+
 /*
  * 'cpu' is going away. splice any existing rq_list entries from this
  * software queue to the hw queue dispatch list, and ensure that it
@@ -2245,6 +2299,14 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
 	spin_unlock(&hctx->lock);
 
 	blk_mq_run_hw_queue(hctx, true);
+
+	/*
+	 * Interrupt for this queue will be shutdown, so wait until all
+	 * requests from this hw queue is done or timeout.
+	 */
+	if (cpumask_first_and(hctx->cpumask, cpu_online_mask) >= nr_cpu_ids)
+		blk_mq_drain_inflight_rqs(hctx->tags, cpu);
+
 	return 0;
 }
 
@@ -3540,7 +3602,8 @@ EXPORT_SYMBOL(blk_mq_rq_cpu);
 
 static int __init blk_mq_init(void)
 {
-	cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
+	cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead",
+				blk_mq_hctx_notify_prepare,
 				blk_mq_hctx_notify_dead);
 	return 0;
 }
-- 
2.20.1


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] blk-mq: Wait for hctx inflight requests on CPU unplug
  2019-05-17  9:14 [PATCH] blk-mq: Wait for hctx inflight requests on CPU unplug Ming Lei
@ 2019-05-21  7:40 ` Christoph Hellwig
  2019-05-21  8:03   ` Ming Lei
  2019-05-21 13:50 ` John Garry
  1 sibling, 1 reply; 11+ messages in thread
From: Christoph Hellwig @ 2019-05-21  7:40 UTC (permalink / raw)
  To: Ming Lei
  Cc: Jens Axboe, linux-block, Christoph Hellwig, Bart Van Assche,
	Hannes Reinecke, John Garry, Keith Busch, Thomas Gleixner

> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index 7513c8eaabee..b24334f99c5d 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -332,7 +332,7 @@ static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt,
>   *		true to continue iterating tags, false to stop.
>   * @priv:	Will be passed as second argument to @fn.
>   */
> -static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
> +void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
>  		busy_tag_iter_fn *fn, void *priv)

How about moving blk_mq_tags_inflight_rqs to blk-mq-tag.c instead?

> +	#define BLK_MQ_TAGS_DRAINED           0

Please don't indent #defines.

>  
> +static int blk_mq_hctx_notify_prepare(unsigned int cpu, struct hlist_node *node)
> +{
> +	struct blk_mq_hw_ctx	*hctx;
> +	struct blk_mq_tags	*tags;
> +
> +	tags = hctx->tags;
> +
> +	if (tags)
> +		clear_bit(BLK_MQ_TAGS_DRAINED, &tags->flags);
> +
> +	return 0;

I'd write this as:

{
	struct blk_mq_hw_ctx	*hctx = 
		hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);

	if (hctx->tags)
		clear_bit(BLK_MQ_TAGS_DRAINED, &hctx->tags->flags);
	return 0;
}

> +static void blk_mq_drain_inflight_rqs(struct blk_mq_tags *tags, int dead_cpu)
> +{
> +	unsigned long msecs_left = 1000 * 5;
> +
> +	if (!tags)
> +		return;
> +
> +	if (test_and_set_bit(BLK_MQ_TAGS_DRAINED, &tags->flags))
> +		return;
> +
> +	while (msecs_left > 0) {
> +		if (!blk_mq_tags_inflight_rqs(tags))
> +			break;
> +		msleep(5);
> +		msecs_left -= 5;
> +	}
> +
> +	if (msecs_left > 0)
> +		printk(KERN_WARNING "requests not completed from dead "
> +				"CPU %d\n", dead_cpu);
> +}

Isn't this condition inverted?  If we break out early msecs_left will
be larger than 0 and we are fine.

Why not:

	for (attempts = 0; attempts < 1000; attempts++) {
		if (!blk_mq_tags_inflight_rqs(tags))
			return;
	}

	...

But more importantly I'm not sure we can just give up that easily.
Shouldn't we at lest wait the same timeout we otherwise have for
requests, and if the command isn't finished in time kick off error
handling?

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] blk-mq: Wait for hctx inflight requests on CPU unplug
  2019-05-21  7:40 ` Christoph Hellwig
@ 2019-05-21  8:03   ` Ming Lei
  0 siblings, 0 replies; 11+ messages in thread
From: Ming Lei @ 2019-05-21  8:03 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, linux-block, Bart Van Assche, Hannes Reinecke,
	John Garry, Keith Busch, Thomas Gleixner

On Tue, May 21, 2019 at 09:40:19AM +0200, Christoph Hellwig wrote:
> > diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> > index 7513c8eaabee..b24334f99c5d 100644
> > --- a/block/blk-mq-tag.c
> > +++ b/block/blk-mq-tag.c
> > @@ -332,7 +332,7 @@ static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt,
> >   *		true to continue iterating tags, false to stop.
> >   * @priv:	Will be passed as second argument to @fn.
> >   */
> > -static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
> > +void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
> >  		busy_tag_iter_fn *fn, void *priv)
> 
> How about moving blk_mq_tags_inflight_rqs to blk-mq-tag.c instead?

OK.

> 
> > +	#define BLK_MQ_TAGS_DRAINED           0
> 
> Please don't indent #defines.

OK.

> 
> >  
> > +static int blk_mq_hctx_notify_prepare(unsigned int cpu, struct hlist_node *node)
> > +{
> > +	struct blk_mq_hw_ctx	*hctx;
> > +	struct blk_mq_tags	*tags;
> > +
> > +	tags = hctx->tags;
> > +
> > +	if (tags)
> > +		clear_bit(BLK_MQ_TAGS_DRAINED, &tags->flags);
> > +
> > +	return 0;
> 
> I'd write this as:
> 
> {
> 	struct blk_mq_hw_ctx	*hctx = 
> 		hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
> 
> 	if (hctx->tags)
> 		clear_bit(BLK_MQ_TAGS_DRAINED, &hctx->tags->flags);
> 	return 0;
> }

OK.

> 
> > +static void blk_mq_drain_inflight_rqs(struct blk_mq_tags *tags, int dead_cpu)
> > +{
> > +	unsigned long msecs_left = 1000 * 5;
> > +
> > +	if (!tags)
> > +		return;
> > +
> > +	if (test_and_set_bit(BLK_MQ_TAGS_DRAINED, &tags->flags))
> > +		return;
> > +
> > +	while (msecs_left > 0) {
> > +		if (!blk_mq_tags_inflight_rqs(tags))
> > +			break;
> > +		msleep(5);
> > +		msecs_left -= 5;
> > +	}
> > +
> > +	if (msecs_left > 0)
> > +		printk(KERN_WARNING "requests not completed from dead "
> > +				"CPU %d\n", dead_cpu);
> > +}
> 
> Isn't this condition inverted?  If we break out early msecs_left will
> be larger than 0 and we are fine.

Yeah, I saw that just after the patch was posted.

> 
> Why not:
> 
> 	for (attempts = 0; attempts < 1000; attempts++) {
> 		if (!blk_mq_tags_inflight_rqs(tags))
> 			return;
> 	}

Fine.

> 
> 	...
> 
> But more importantly I'm not sure we can just give up that easily.
> Shouldn't we at lest wait the same timeout we otherwise have for
> requests, and if the command isn't finished in time kick off error
> handling?

The default 30sec timeout is too long here, and may cause new bug
report on CPU hotplug.

Also it makes no difference by waiting 30sec, given timeout
handler will be triggered when request is really timed out.

However, one problem is that some drivers may simply return RESET_TIMER
in their timeout handler, then no simple solution for these drivers.


Thanks,
Ming

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] blk-mq: Wait for hctx inflight requests on CPU unplug
  2019-05-17  9:14 [PATCH] blk-mq: Wait for hctx inflight requests on CPU unplug Ming Lei
  2019-05-21  7:40 ` Christoph Hellwig
@ 2019-05-21 13:50 ` John Garry
  2019-05-22  1:56   ` Ming Lei
  1 sibling, 1 reply; 11+ messages in thread
From: John Garry @ 2019-05-21 13:50 UTC (permalink / raw)
  To: Ming Lei, Jens Axboe
  Cc: linux-block, Christoph Hellwig, Bart Van Assche, Hannes Reinecke,
	Keith Busch, Thomas Gleixner, linux-scsi

On 17/05/2019 10:14, Ming Lei wrote:
> Managed interrupts can not migrate affinity when their CPUs are offline.
> If the CPU is allowed to shutdown before they're returned, commands
> dispatched to managed queues won't be able to complete through their
> irq handlers.
>
> Wait in cpu hotplug handler until all inflight requests on the tags
> are completed or timeout. Wait once for each tags, so we can save time
> in case of shared tags.
>
> Based on the following patch from Keith, and use simple delay-spin
> instead.
>
> https://lore.kernel.org/linux-block/20190405215920.27085-1-keith.busch@intel.com/
>
> Cc: Christoph Hellwig <hch@lst.de>
> Cc: Bart Van Assche <bvanassche@acm.org>
> Cc: Hannes Reinecke <hare@suse.com>
> Cc: John Garry <john.garry@huawei.com>
> Cc: Keith Busch <keith.busch@intel.com>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
>  block/blk-mq-tag.c |  2 +-
>  block/blk-mq-tag.h |  5 ++++
>  block/blk-mq.c     | 65 +++++++++++++++++++++++++++++++++++++++++++++-
>  3 files changed, 70 insertions(+), 2 deletions(-)
>
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index 7513c8eaabee..b24334f99c5d 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -332,7 +332,7 @@ static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt,
>   *		true to continue iterating tags, false to stop.
>   * @priv:	Will be passed as second argument to @fn.
>   */
> -static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
> +void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
>  		busy_tag_iter_fn *fn, void *priv)
>  {
>  	if (tags->nr_reserved_tags)
> diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
> index 61deab0b5a5a..f8de50485b42 100644
> --- a/block/blk-mq-tag.h
> +++ b/block/blk-mq-tag.h
> @@ -19,6 +19,9 @@ struct blk_mq_tags {
>  	struct request **rqs;
>  	struct request **static_rqs;
>  	struct list_head page_list;
> +
> +	#define BLK_MQ_TAGS_DRAINED           0
> +	unsigned long flags;
>  };
>
>
> @@ -35,6 +38,8 @@ extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
>  extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
>  void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
>  		void *priv);
> +void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
> +		busy_tag_iter_fn *fn, void *priv);
>
>  static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt,
>  						 struct blk_mq_hw_ctx *hctx)
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 08a6248d8536..d1d1b1a9628f 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -2214,6 +2214,60 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
>  	return -ENOMEM;
>  }
>
> +static int blk_mq_hctx_notify_prepare(unsigned int cpu, struct hlist_node *node)
> +{
> +	struct blk_mq_hw_ctx	*hctx;
> +	struct blk_mq_tags	*tags;
> +
> +	hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
> +	tags = hctx->tags;
> +
> +	if (tags)
> +		clear_bit(BLK_MQ_TAGS_DRAINED, &tags->flags);
> +

Hi Ming,

Thanks for the effort here.

I would like to make an assertion on a related topic, which I hope you 
can comment on:

For this drain mechanism to work, the blk_mq_hw_ctx’s (and related cpu 
masks) for a request queue are required to match the hw queues used in 
the LLDD (if using managed interrupts).

In other words, a SCSI LLDD needs to expose all hw queues for this to work.

The reason I say this is because if the LLDD does not expose the hw 
queues and manages them internally - as some SCSI LLDDs do - yet uses 
managed interrupts to spread the hw queue MSI vectors across all CPUs, 
then we still only have a single blk_mq_hw_ctx per rq with a cpumask 
covering all cpus, which is not what we would want.

Cheers,
John

> +	return 0;
> +}
> +
> +static bool blk_mq_count_inflight_rq(struct request *rq, void *data,
> +				     bool reserved)
> +{
> +	if (blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
> +		(*(unsigned long *)data)++;
> +
> +	return true;
> +}
> +
> +static unsigned blk_mq_tags_inflight_rqs(struct blk_mq_tags *tags)
> +{
> +	unsigned long cnt = 0;
> +
> +	blk_mq_all_tag_busy_iter(tags, blk_mq_count_inflight_rq, &cnt);
> +
> +	return cnt;
> +}
> +
> +static void blk_mq_drain_inflight_rqs(struct blk_mq_tags *tags, int dead_cpu)
> +{
> +	unsigned long msecs_left = 1000 * 5;
> +
> +	if (!tags)
> +		return;
> +
> +	if (test_and_set_bit(BLK_MQ_TAGS_DRAINED, &tags->flags))
> +		return;
> +
> +	while (msecs_left > 0) {
> +		if (!blk_mq_tags_inflight_rqs(tags))
> +			break;
> +		msleep(5);
> +		msecs_left -= 5;
> +	}
> +
> +	if (msecs_left > 0)
> +		printk(KERN_WARNING "requests not completed from dead "
> +				"CPU %d\n", dead_cpu);
> +}
> +
>  /*
>   * 'cpu' is going away. splice any existing rq_list entries from this
>   * software queue to the hw queue dispatch list, and ensure that it
> @@ -2245,6 +2299,14 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
>  	spin_unlock(&hctx->lock);
>
>  	blk_mq_run_hw_queue(hctx, true);
> +
> +	/*
> +	 * Interrupt for this queue will be shutdown, so wait until all
> +	 * requests from this hw queue is done or timeout.
> +	 */
> +	if (cpumask_first_and(hctx->cpumask, cpu_online_mask) >= nr_cpu_ids)
> +		blk_mq_drain_inflight_rqs(hctx->tags, cpu);
> +
>  	return 0;
>  }
>
> @@ -3540,7 +3602,8 @@ EXPORT_SYMBOL(blk_mq_rq_cpu);
>
>  static int __init blk_mq_init(void)
>  {
> -	cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
> +	cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead",
> +				blk_mq_hctx_notify_prepare,
>  				blk_mq_hctx_notify_dead);
>  	return 0;
>  }
>



^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] blk-mq: Wait for hctx inflight requests on CPU unplug
  2019-05-21 13:50 ` John Garry
@ 2019-05-22  1:56   ` Ming Lei
  2019-05-22  9:06     ` John Garry
  0 siblings, 1 reply; 11+ messages in thread
From: Ming Lei @ 2019-05-22  1:56 UTC (permalink / raw)
  To: John Garry
  Cc: Jens Axboe, linux-block, Christoph Hellwig, Bart Van Assche,
	Hannes Reinecke, Keith Busch, Thomas Gleixner, linux-scsi,
	Kashyap Desai

Hi John,

On Tue, May 21, 2019 at 02:50:06PM +0100, John Garry wrote:
> On 17/05/2019 10:14, Ming Lei wrote:
> > Managed interrupts can not migrate affinity when their CPUs are offline.
> > If the CPU is allowed to shutdown before they're returned, commands
> > dispatched to managed queues won't be able to complete through their
> > irq handlers.
> > 
> > Wait in cpu hotplug handler until all inflight requests on the tags
> > are completed or timeout. Wait once for each tags, so we can save time
> > in case of shared tags.
> > 
> > Based on the following patch from Keith, and use simple delay-spin
> > instead.
> > 
> > https://lore.kernel.org/linux-block/20190405215920.27085-1-keith.busch@intel.com/
> > 
> > Cc: Christoph Hellwig <hch@lst.de>
> > Cc: Bart Van Assche <bvanassche@acm.org>
> > Cc: Hannes Reinecke <hare@suse.com>
> > Cc: John Garry <john.garry@huawei.com>
> > Cc: Keith Busch <keith.busch@intel.com>
> > Cc: Thomas Gleixner <tglx@linutronix.de>
> > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> > ---
> >  block/blk-mq-tag.c |  2 +-
> >  block/blk-mq-tag.h |  5 ++++
> >  block/blk-mq.c     | 65 +++++++++++++++++++++++++++++++++++++++++++++-
> >  3 files changed, 70 insertions(+), 2 deletions(-)
> > 
> > diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> > index 7513c8eaabee..b24334f99c5d 100644
> > --- a/block/blk-mq-tag.c
> > +++ b/block/blk-mq-tag.c
> > @@ -332,7 +332,7 @@ static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt,
> >   *		true to continue iterating tags, false to stop.
> >   * @priv:	Will be passed as second argument to @fn.
> >   */
> > -static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
> > +void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
> >  		busy_tag_iter_fn *fn, void *priv)
> >  {
> >  	if (tags->nr_reserved_tags)
> > diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
> > index 61deab0b5a5a..f8de50485b42 100644
> > --- a/block/blk-mq-tag.h
> > +++ b/block/blk-mq-tag.h
> > @@ -19,6 +19,9 @@ struct blk_mq_tags {
> >  	struct request **rqs;
> >  	struct request **static_rqs;
> >  	struct list_head page_list;
> > +
> > +	#define BLK_MQ_TAGS_DRAINED           0
> > +	unsigned long flags;
> >  };
> > 
> > 
> > @@ -35,6 +38,8 @@ extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
> >  extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
> >  void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
> >  		void *priv);
> > +void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
> > +		busy_tag_iter_fn *fn, void *priv);
> > 
> >  static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt,
> >  						 struct blk_mq_hw_ctx *hctx)
> > diff --git a/block/blk-mq.c b/block/blk-mq.c
> > index 08a6248d8536..d1d1b1a9628f 100644
> > --- a/block/blk-mq.c
> > +++ b/block/blk-mq.c
> > @@ -2214,6 +2214,60 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
> >  	return -ENOMEM;
> >  }
> > 
> > +static int blk_mq_hctx_notify_prepare(unsigned int cpu, struct hlist_node *node)
> > +{
> > +	struct blk_mq_hw_ctx	*hctx;
> > +	struct blk_mq_tags	*tags;
> > +
> > +	hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
> > +	tags = hctx->tags;
> > +
> > +	if (tags)
> > +		clear_bit(BLK_MQ_TAGS_DRAINED, &tags->flags);
> > +
> 
> Hi Ming,
> 
> Thanks for the effort here.
> 
> I would like to make an assertion on a related topic, which I hope you can
> comment on:
> 
> For this drain mechanism to work, the blk_mq_hw_ctx’s (and related cpu
> masks) for a request queue are required to match the hw queues used in the
> LLDD (if using managed interrupts).
> 
> In others words, a SCSI LLDD needs to expose all hw queues for this to work.
> 
> The reason I say this is because if the LLDD does not expose the hw queues
> and manages them internally - as some SCSI LLDDs do - yet uses managed
> interrupts to spread the hw queue MSI vectors across all CPUs, then we still
> only have a single blk_mq_hw_ctx per rq with a cpumask covering all cpus,
> which is not what we would want.

Good catch!

This drain mechanism won't address the issue for these SCSI LLDDs in which:

	1) blk_mq_hw_ctx serves as submission hw queue

	2) one private reply queue serves as completion queue, for which one
	MSI vector with cpumask is setup via pci_alloc_irq_vectors_affinity(PCI_IRQ_AFFINITY).

What we should only drain is the completion queue if all its mapped
CPUs are offline.

Looks you suggest to expose all completion(reply) queues as 'struct blk_mq_hw_ctx',
which may involve another, harder problem: how to split the single
hostwide tags into each reply queue. I'd rather not work towards that
direction because:

1) it is very hard to partition global resources into several parts,
especially it is hard to make every part happy.

2) sbitmap is smart/efficient enough for this global allocation

3) no obvious improvement is obtained from the resource partition, according
to previous experiment result done by Kashyap.

I think we could implement the drain mechanism in the following way:

1) if 'struct blk_mq_hw_ctx' serves as completion queue, use the
approach in the patch

2) otherwise:
- introduce one callback of .prep_queue_dead(hctx, down_cpu) to
'struct blk_mq_ops'

- call .prep_queue_dead from blk_mq_hctx_notify_dead() 

3) inside .prep_queue_dead():
- the driver checks if all mapped CPU on the completion queue is offline
- if yes, wait for in-flight requests originated from all CPUs mapped to
this completion queue, and it can be implemented as one block layer API

Any comments on the above approach?

Thanks,
Ming

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] blk-mq: Wait for hctx inflight requests on CPU unplug
  2019-05-22  1:56   ` Ming Lei
@ 2019-05-22  9:06     ` John Garry
  2019-05-22  9:47       ` Hannes Reinecke
  2019-05-22 10:01       ` Ming Lei
  0 siblings, 2 replies; 11+ messages in thread
From: John Garry @ 2019-05-22  9:06 UTC (permalink / raw)
  To: Ming Lei
  Cc: Jens Axboe, linux-block, Christoph Hellwig, Bart Van Assche,
	Hannes Reinecke, Keith Busch, Thomas Gleixner, linux-scsi,
	Kashyap Desai, chenxiang

>>>
>>> +static int blk_mq_hctx_notify_prepare(unsigned int cpu, struct hlist_node *node)
>>> +{
>>> +	struct blk_mq_hw_ctx	*hctx;
>>> +	struct blk_mq_tags	*tags;
>>> +
>>> +	hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
>>> +	tags = hctx->tags;
>>> +
>>> +	if (tags)
>>> +		clear_bit(BLK_MQ_TAGS_DRAINED, &tags->flags);
>>> +
>>
>> Hi Ming,
>>
>> Thanks for the effort here.
>>
>> I would like to make an assertion on a related topic, which I hope you can
>> comment on:
>>
>> For this drain mechanism to work, the blk_mq_hw_ctx’s (and related cpu
>> masks) for a request queue are required to match the hw queues used in the
>> LLDD (if using managed interrupts).
>>
>> In others words, a SCSI LLDD needs to expose all hw queues for this to work.
>>
>> The reason I say this is because if the LLDD does not expose the hw queues
>> and manages them internally - as some SCSI LLDDs do - yet uses managed
>> interrupts to spread the hw queue MSI vectors across all CPUs, then we still
>> only have a single blk_mq_hw_ctx per rq with a cpumask covering all cpus,
>> which is not what we would want.
>

Hi Ming,

> Good catch!
>
> This drain mechanism won't address the issue for these SCSI LLDDs in which:
>
> 	1) blk_mq_hw_ctx serves as submission hw queue
>
> 	2) one private reply queue serves as completion queue, for which one
> 	MSI vector with cpumask is setup via pci_alloc_irq_vectors_affinity(PCI_IRQ_AFFINITY).
>
> What we should only drain is the completion queue if all its mapped
> CPUs are offline.
>
> Looks you suggest to expose all completion(reply) queues as 'struct blk_mq_hw_ctx',
> which may involve in another more hard problem:  how to split the single
> hostwide tags into each reply queue.

Yes, and this is what I expecting to hear Re. hostwide tags.

I'd rather not work towards that
> direction because:
>
> 1) it is very hard to partition global resources into several parts,
> especially it is hard to make every part happy.
>
> 2) sbitmap is smart/efficient enough for this global allocation
>
> 3) no obvious improvement is obtained from the resource partition, according
> to previous experiment result done by Kashyap.

I'd like to also do the test.

However I would need to forward port the patchset, which no longer 
cleanly applies (I was referring to this 
https://lore.kernel.org/linux-block/20180205152035.15016-1-ming.lei@redhat.com/). 
Any help with that would be appreciated.

>
> I think we could implement the drain mechanism in the following way:
>
> 1) if 'struct blk_mq_hw_ctx' serves as completion queue, use the
> approach in the patch

Maybe the gain of exposing multiple queues+managed interrupts outweighs 
the loss in the LLDD of having to generate this unique tag with sbitmap; 
I know that we did not use sbitmap ever in the LLDD for generating the 
tag when testing previously. However I'm still not too hopeful.

>
> 2) otherwise:
> - introduce one callback of .prep_queue_dead(hctx, down_cpu) to
> 'struct blk_mq_ops'

This would not be allowed to block, right?

>
> - call .prep_queue_dead from blk_mq_hctx_notify_dead()
>
> 3) inside .prep_queue_dead():
> - the driver checks if all mapped CPU on the completion queue is offline
> - if yes, wait for in-flight requests originated from all CPUs mapped to
> this completion queue, and it can be implemented as one block layer API

That could work. However I think that someone may ask why the LLDD just 
doesn't register for the CPU hotplug event itself (which I would really 
rather avoid), instead of being relayed the info from the block layer.

>
> Any comments on the above approach?
>
> Thanks,
> Ming

Cheers,
John

>
> .
>



^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] blk-mq: Wait for hctx inflight requests on CPU unplug
  2019-05-22  9:06     ` John Garry
@ 2019-05-22  9:47       ` Hannes Reinecke
  2019-05-22 10:31         ` Ming Lei
  2019-05-22 12:30         ` John Garry
  2019-05-22 10:01       ` Ming Lei
  1 sibling, 2 replies; 11+ messages in thread
From: Hannes Reinecke @ 2019-05-22  9:47 UTC (permalink / raw)
  To: John Garry, Ming Lei
  Cc: Jens Axboe, linux-block, Christoph Hellwig, Bart Van Assche,
	Hannes Reinecke, Keith Busch, Thomas Gleixner, linux-scsi,
	Kashyap Desai, chenxiang

On 5/22/19 11:06 AM, John Garry wrote:
>>>>
>>>> +static int blk_mq_hctx_notify_prepare(unsigned int cpu, struct 
>>>> hlist_node *node)
>>>> +{
>>>> +    struct blk_mq_hw_ctx    *hctx;
>>>> +    struct blk_mq_tags    *tags;
>>>> +
>>>> +    hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
>>>> +    tags = hctx->tags;
>>>> +
>>>> +    if (tags)
>>>> +        clear_bit(BLK_MQ_TAGS_DRAINED, &tags->flags);
>>>> +
>>>
>>> Hi Ming,
>>>
>>> Thanks for the effort here.
>>>
>>> I would like to make an assertion on a related topic, which I hope 
>>> you can
>>> comment on:
>>>
>>> For this drain mechanism to work, the blk_mq_hw_ctx’s (and related cpu
>>> masks) for a request queue are required to match the hw queues used 
>>> in the
>>> LLDD (if using managed interrupts).
>>>
>>> In others words, a SCSI LLDD needs to expose all hw queues for this 
>>> to work.
>>>
More importantly, the SCSI LLDD needs to be _able_ to expose one hw 
queue per CPU.
Which cannot be taken for granted; especially in larger machines it's 
relatively easy to have more CPUs than MSI-X vectores...

>>> The reason I say this is because if the LLDD does not expose the hw 
>>> queues
>>> and manages them internally - as some SCSI LLDDs do - yet uses managed
>>> interrupts to spread the hw queue MSI vectors across all CPUs, then 
>>> we still
>>> only have a single blk_mq_hw_ctx per rq with a cpumask covering all 
>>> cpus,
>>> which is not what we would want.
>>
> 
> Hi Ming,
> 
>> Good catch!
>>
>> This drain mechanism won't address the issue for these SCSI LLDDs in 
>> which:
>>
>>     1) blk_mq_hw_ctx serves as submission hw queue
>>
>>     2) one private reply queue serves as completion queue, for which one
>>     MSI vector with cpumask is setup via 
>> pci_alloc_irq_vectors_affinity(PCI_IRQ_AFFINITY).
>>
>> What we should only drain is the completion queue if all its mapped
>> CPUs are offline.
>>
Hmm. That's a very unusual setup. Typically it's the other way round; 
SCSI LLDDs have several completion queues (as these are mapped to MSI-X 
vectors), but only one submission queue.
(Classical examples are mptsas, megaraid_sas, aacraid, and hpsa)

But I still do think we need to handle this case; the HBA might not 
expose enough MSI-X vectors/hw queues for us to map to all CPUs.
In which case we'd be running into the same situation.

And I do think we _need_ to drain the associated completion queue as 
soon as _any_ CPU in that set is unplugged; otherwise we can't ensure that 
any interrupt for pending I/O will _not_ arrive at the dead CPU.

And yes, this would amount to quiesce the HBA completely if only one 
queue is exposed. But there's no way around this; the alternative would 
be to code a fallback patch in each driver to catch missing completions.
Which would actually be an interface change, requiring each vendor / 
maintainer to change their driver. Not very nice.

>> Looks you suggest to expose all completion(reply) queues as 'struct 
>> blk_mq_hw_ctx',
>> which may involve in another more hard problem:  how to split the single
>> hostwide tags into each reply queue.
> 
> Yes, and this is what I expecting to hear Re. hostwide tags.
> 
But this case is handled already; things like lpfc and qla2xxx have been 
converted to this model (exposing all hw queues, and use a host-wide 
tagmap).

So from that side there is not really an issue.

I even provided patchset to convert megaraid_sas (cf 'megaraid_sas: 
enable blk-mq for fusion'); you might want to have a look there to see 
how it can be done.

> I'd rather not work towards that
>> direction because:
>>
>> 1) it is very hard to partition global resources into several parts,
>> especially it is hard to make every part happy.
>>
>> 2) sbitmap is smart/efficient enough for this global allocation
>>
>> 3) no obvious improvement is obtained from the resource partition, 
>> according
>> to previous experiment result done by Kashyap.
> 
> I'd like to also do the test.
> 
> However I would need to forward port the patchset, which no longer 
> cleanly applies (I was referring to this 
> https://lore.kernel.org/linux-block/20180205152035.15016-1-ming.lei@redhat.com/). 
> Any help with that would be appreciated.
> 
If you would post it on the mailing list (or send it to me) I can have a 
look. Converting SAS is on my list of things to do, anyway.

>>
>> I think we could implement the drain mechanism in the following way:
>>
>> 1) if 'struct blk_mq_hw_ctx' serves as completion queue, use the
>> approach in the patch
> 
> Maybe the gain of exposing multiple queues+managed interrupts outweighs 
> the loss in the LLDD of having to generate this unique tag with sbitmap; 
> I know that we did not use sbitmap ever in the LLDD for generating the 
> tag when testing previously. However I'm still not too hopeful.
> 
Thing is, the tag _is_ already generated by the time the command is 
passed to the LLDD. So there is no overhead; you just need to establish 
a 1:1 mapping between SCSI cmds from the midlayer and your internal 
commands.

Which is where the problem starts: if you have to use the same command 
pool for internal commands you have to set some tags aside to avoid a 
clash with the tags generated from the block layer.
That's easily done, but if you do that quiescing is getting harder, as 
then the block layer wouldn't know about these internal commands.
This is what I'm trying to address with my patchset to use private tags 
in SCSI, as then the block layer maintains all tags, and is able to 
figure out if the queue really is quiesced.
(And I really need to post my patchset).

>>
>> 2) otherwise:
>> - introduce one callback of .prep_queue_dead(hctx, down_cpu) to
>> 'struct blk_mq_ops'
> 
> This would not be allowed to block, right?
> 
>>
>> - call .prep_queue_dead from blk_mq_hctx_notify_dead()
>>
>> 3) inside .prep_queue_dead():
>> - the driver checks if all mapped CPU on the completion queue is offline
>> - if yes, wait for in-flight requests originated from all CPUs mapped to
>> this completion queue, and it can be implemented as one block layer API
> 
> That could work. However I think that someone may ask why the LLDD just 
> doesn't register for the CPU hotplug event itself (which I would really 
> rather avoid), instead of being relayed the info from the block layer.
> 
Again; what would you do if not all CPUs from a pool are gone?
You still might be getting interrupts for non-associated interrupts, and 
quite some drivers are unhappy under these circumstances.
Hence I guess it'll be better to quiesce the queue as soon as _any_ CPU 
from the pool is gone.

Plus we could be doing this from the block layer without any callbacks 
from the driver...

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		   Teamlead Storage & Networking
hare@suse.de			               +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Mary Higgins, Sri Rasiah
HRB 21284 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] blk-mq: Wait for for hctx inflight requests on CPU unplug
  2019-05-22  9:06     ` John Garry
  2019-05-22  9:47       ` Hannes Reinecke
@ 2019-05-22 10:01       ` Ming Lei
  2019-05-22 12:21         ` John Garry
  1 sibling, 1 reply; 11+ messages in thread
From: Ming Lei @ 2019-05-22 10:01 UTC (permalink / raw)
  To: John Garry
  Cc: Jens Axboe, linux-block, Christoph Hellwig, Bart Van Assche,
	Hannes Reinecke, Keith Busch, Thomas Gleixner, linux-scsi,
	Kashyap Desai, chenxiang

On Wed, May 22, 2019 at 10:06:16AM +0100, John Garry wrote:
> > > > 
> > > > +static int blk_mq_hctx_notify_prepare(unsigned int cpu, struct hlist_node *node)
> > > > +{
> > > > +	struct blk_mq_hw_ctx	*hctx;
> > > > +	struct blk_mq_tags	*tags;
> > > > +
> > > > +	hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
> > > > +	tags = hctx->tags;
> > > > +
> > > > +	if (tags)
> > > > +		clear_bit(BLK_MQ_TAGS_DRAINED, &tags->flags);
> > > > +
> > > 
> > > Hi Ming,
> > > 
> > > Thanks for the effort here.
> > > 
> > > I would like to make an assertion on a related topic, which I hope you can
> > > comment on:
> > > 
> > > For this drain mechanism to work, the blk_mq_hw_ctx’s (and related cpu
> > > masks) for a request queue are required to match the hw queues used in the
> > > LLDD (if using managed interrupts).
> > > 
> > > In other words, a SCSI LLDD needs to expose all hw queues for this to work.
> > > 
> > > The reason I say this is because if the LLDD does not expose the hw queues
> > > and manages them internally - as some SCSI LLDDs do - yet uses managed
> > > interrupts to spread the hw queue MSI vectors across all CPUs, then we still
> > > only have a single blk_mq_hw_ctx per rq with a cpumask covering all cpus,
> > > which is not what we would want.
> > 
> 
> Hi Ming,
> 
> > Good catch!
> > 
> > This drain mechanism won't address the issue for these SCSI LLDDs in which:
> > 
> > 	1) blk_mq_hw_ctx serves as submission hw queue
> > 
> > 	2) one private reply queue serves as completion queue, for which one
> > 	MSI vector with cpumask is setup via pci_alloc_irq_vectors_affinity(PCI_IRQ_AFFINITY).
> > 
> > What we should only drain is the completion queue if all its mapped
> > CPUs are offline.
> > 
> > Looks you suggest to expose all completion(reply) queues as 'struct blk_mq_hw_ctx',
> > which may involve in another more hard problem:  how to split the single
> > hostwide tags into each reply queue.
> 
> Yes, and this is what I expecting to hear Re. hostwide tags.
> 
> I'd rather not work towards that
> > direction because:
> > 
> > 1) it is very hard to partition global resources into several parts,
> > especially it is hard to make every part happy.
> > 
> > 2) sbitmap is smart/efficient enough for this global allocation
> > 
> > 3) no obvious improvement is obtained from the resource partition, according
> > to previous experiment result done by Kashyap.
> 
> I'd like to also do the test.
> 
> However I would need to forward port the patchset, which no longer cleanly
> applies (I was referring to this https://lore.kernel.org/linux-block/20180205152035.15016-1-ming.lei@redhat.com/).
> Any help with that would be appreciated.

The queue type change causes patches not applied any more.

Could you just test the patch against v4.15 and see if there is any
improvement?

Even if it may improve performance on hisi_sas, I still suggest to not
use that approach to solve the issue for draining in-flight requests when
all CPUs of one hw queue becomes offline, since this way might hurt
performance on other drivers.

> 
> > 
> > I think we could implement the drain mechanism in the following way:
> > 
> > 1) if 'struct blk_mq_hw_ctx' serves as completion queue, use the
> > approach in the patch
> 
> Maybe the gain of exposing multiple queues+managed interrupts outweighs the
> loss in the LLDD of having to generate this unique tag with sbitmap; I know

The unique tag has zero cost, see blk_mq_unique_tag().

> that we did not use sbitmap ever in the LLDD for generating the tag when
> testing previously. However I'm still not too hopeful.
> 
> > 
> > 2) otherwise:
> > - introduce one callback of .prep_queue_dead(hctx, down_cpu) to
> > 'struct blk_mq_ops'
> 
> This would not be allowed to block, right?

It is allowed to block in CPU hotplug handler.

> 
> > 
> > - call .prep_queue_dead from blk_mq_hctx_notify_dead()
> > 
> > 3) inside .prep_queue_dead():
> > - the driver checks if all mapped CPU on the completion queue is offline
> > - if yes, wait for in-flight requests originated from all CPUs mapped to
> > this completion queue, and it can be implemented as one block layer API
> 
> That could work. However I think that someone may ask why the LLDD just
> doesn't register for the CPU hotplug event itself (which I would really
> rather avoid), instead of being relayed the info from the block layer.

.prep_queue_dead() is run from blk-mq's CPU hotplug handler.

I also think of abstracting completion queue in blk-mq for hpsa,
hisi_sas_v3_hw and  mpt3sas, but that can't cover to drain device's internal
command, so looks it is inevitable for us to introduce driver callback.


Thanks,
Ming

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] blk-mq: Wait for for hctx inflight requests on CPU unplug
  2019-05-22  9:47       ` Hannes Reinecke
@ 2019-05-22 10:31         ` Ming Lei
  2019-05-22 12:30         ` John Garry
  1 sibling, 0 replies; 11+ messages in thread
From: Ming Lei @ 2019-05-22 10:31 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: John Garry, Jens Axboe, linux-block, Christoph Hellwig,
	Bart Van Assche, Hannes Reinecke, Keith Busch, Thomas Gleixner,
	linux-scsi, Kashyap Desai, chenxiang

Hi Hannes,

On Wed, May 22, 2019 at 11:47:49AM +0200, Hannes Reinecke wrote:
> On 5/22/19 11:06 AM, John Garry wrote:
> > > > > 
> > > > > +static int blk_mq_hctx_notify_prepare(unsigned int cpu,
> > > > > struct hlist_node *node)
> > > > > +{
> > > > > +    struct blk_mq_hw_ctx    *hctx;
> > > > > +    struct blk_mq_tags    *tags;
> > > > > +
> > > > > +    hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
> > > > > +    tags = hctx->tags;
> > > > > +
> > > > > +    if (tags)
> > > > > +        clear_bit(BLK_MQ_TAGS_DRAINED, &tags->flags);
> > > > > +
> > > > 
> > > > Hi Ming,
> > > > 
> > > > Thanks for the effort here.
> > > > 
> > > > I would like to make an assertion on a related topic, which I
> > > > hope you can
> > > > comment on:
> > > > 
> > > > For this drain mechanism to work, the blk_mq_hw_ctx’s (and related cpu
> > > > masks) for a request queue are required to match the hw queues
> > > > used in the
> > > > LLDD (if using managed interrupts).
> > > > 
> > > > In other words, a SCSI LLDD needs to expose all hw queues for
> > > > this to work.
> > > > 
> More importantly, the SCSI LLDD needs to be _able_ to expose one hw queue
> per CPU.
> Which cannot be taken for granted; especially in larger machines it's
> relatively easy to have more CPUs than MSI-X vectors...
> 
> > > > The reason I say this is because if the LLDD does not expose the
> > > > hw queues
> > > > and manages them internally - as some SCSI LLDDs do - yet uses managed
> > > > interrupts to spread the hw queue MSI vectors across all CPUs,
> > > > then we still
> > > > only have a single blk_mq_hw_ctx per rq with a cpumask covering
> > > > all cpus,
> > > > which is not what we would want.
> > > 
> > 
> > Hi Ming,
> > 
> > > Good catch!
> > > 
> > > This drain mechanism won't address the issue for these SCSI LLDDs in
> > > which:
> > > 
> > >     1) blk_mq_hw_ctx serves as submission hw queue
> > > 
> > >     2) one private reply queue serves as completion queue, for which one
> > >     MSI vector with cpumask is setup via
> > > pci_alloc_irq_vectors_affinity(PCI_IRQ_AFFINITY).
> > > 
> > > What we should only drain is the completion queue if all its mapped
> > > CPUs are offline.
> > > 
> Hmm. That's a very unusual setup. Typically it's the other way round; SCSI
> LLDDs have several completion queues (as these are mapped to MSI-X vectors),
> but only one submission queue.
> (Classical examples are mptsas, megaraid_sas, aacraid, and hpsa)

Right, that is exactly what we are discussing, one instance of
blk_mq_hw_ctx, and lots of reply queue, each reply queue is mapped to
one MSI-X vector.

> 
> But I still do think we need to handle this case; the HBA might not expose
> enough MSI-X vectors/hw queues for us to map to all CPUs.
> In which case we'd be running into the same situation.

PCI_IRQ_AFFINITY guarantees that all possible CPUs are covered no matter
how many MSI-X vectors there are.

> 
> And I do think we _need_ to drain the associated completion queue as soon as
> _any_ CPU in that set is plugged; otherwise we can't ensure that any
> interrupt for pending I/O will _not_ arrive at the dead CPU.

The genirq code supposes to handle the case well if there is at least
one CPU available for handling the managed IRQ.

However, if it can't, the approach I suggested can cover this case well,
given driver can drain any request, included internal commands which
is sent via the reply queue.

> 
> And yes, this would amount to quiesce the HBA completely if only one queue
> is exposed. But there's no way around this; the alternative would be to code
> a fallback patch in each driver to catch missing completions.
> Which would actually be an interface change, requiring each vendor /
> maintainer to change their driver. Not very nice.

We still can solve the issue by exposing one submission queue(hctx) like the
current code.

Driver can drain requests in the introduced callback of .prep_queue_dead().

> 
> > > Looks you suggest to expose all completion(reply) queues as 'struct
> > > blk_mq_hw_ctx',
> > > which may involve in another more hard problem:  how to split the single
> > > hostwide tags into each reply queue.
> > 
> > Yes, and this is what I expecting to hear Re. hostwide tags.
> > 
> But this case is handled already; things like lpfc and qla2xxx have been
> converted to this model (exposing all hw queues, and use a host-wide
> tagmap).

I remember that the performance won't be good from your initial test
on this way.

https://marc.info/?t=149130779400002&r=1&w=2

I am just wondering how both lpfc or qla2xx share the hostwide tags
among all hctxs, given not see such code in blk-mq, could you explain
it a bit?

> 
> So from that side there is not really an issue.
> 
> I even provided patchset to convert megaraid_sas (cf 'megaraid_sas: enable
> blk-mq for fusion'); you might want to have a look there to see how it can
> be done.
> 
> > I'd rather not work towards that
> > > direction because:
> > > 
> > > 1) it is very hard to partition global resources into several parts,
> > > especially it is hard to make every part happy.
> > > 
> > > 2) sbitmap is smart/efficient enough for this global allocation
> > > 
> > > 3) no obvious improvement is obtained from the resource partition,
> > > according
> > > to previous experiment result done by Kashyap.
> > 
> > I'd like to also do the test.
> > 
> > However I would need to forward port the patchset, which no longer
> > cleanly applies (I was referring to this https://lore.kernel.org/linux-block/20180205152035.15016-1-ming.lei@redhat.com/).
> > Any help with that would be appreciated.
> > 
> If you would post it on the mailing list (or send it to me) I can have a
> look. Converting SAS is on my list of things to do, anyway.
> 
> > > 
> > > I think we could implement the drain mechanism in the following way:
> > > 
> > > 1) if 'struct blk_mq_hw_ctx' serves as completion queue, use the
> > > approach in the patch
> > 
> > Maybe the gain of exposing multiple queues+managed interrupts outweighs
> > the loss in the LLDD of having to generate this unique tag with sbitmap;
> > I know that we did not use sbitmap ever in the LLDD for generating the
> > tag when testing previously. However I'm still not too hopeful.
> > 
> Thing is, the tag _is_ already generated by the time the command is passed
> to the LLDD. So there is no overhead; you just need to establish a 1:1
> mapping between SCSI cmds from the midlayer and your internal commands.
> 
> Which is where the problem starts: if you have to use the same command pool
> for internal commands you have to set some tags aside to avoid a clash with
> the tags generated from the block layer.
> That's easily done, but if you do that quiescing is getting harder, as then
> the block layer wouldn't know about these internal commands.
> This is what I'm trying to address with my patchset to use private tags in
> SCSI, as then the block layer maintains all tags, and is able to figure out
> if the queue really is quiesced.
> (And I really need to post my patchset).
> 
> > > 
> > > 2) otherwise:
> > > - introduce one callback of .prep_queue_dead(hctx, down_cpu) to
> > > 'struct blk_mq_ops'
> > 
> > This would not be allowed to block, right?
> > 
> > > 
> > > - call .prep_queue_dead from blk_mq_hctx_notify_dead()
> > > 
> > > 3) inside .prep_queue_dead():
> > > - the driver checks if all mapped CPU on the completion queue is offline
> > > - if yes, wait for in-flight requests originated from all CPUs mapped to
> > > this completion queue, and it can be implemented as one block layer API
> > 
> > That could work. However I think that someone may ask why the LLDD just
> > doesn't register for the CPU hotplug event itself (which I would really
> > rather avoid), instead of being relayed the info from the block layer.
> > 
> Again; what would you do if not all CPUs from a pool are gone?
> You still might be getting interrupts for non-associated interrupts, and
> quite some drivers are unhappy under these circumstances.
> Hence I guess it'll be better to quiesce the queue as soon as _any_ CPU from
> the pool is gone.

Why? If there is CPU alive for the queue, the queue should work for
that CPU. 

Also if there is one CPU alive for the queue, you can't quiesce the
queue easily, given we have to allow the alive CPU to submit IO.

Also we can't use queue freeze in CPU hotplug handler, that may cause queue
dependency issue.

> 
> Plus we could be doing this from the block layer without any callbacks from
> the driver...

That depends if all drivers can avoid to send internal command via managed IRQ,
at least now, almost all hpsa, mpt3sas and hisi_sas may do that, cause
the reply queue is selected simply by passing 'raw_processor_id()'.

I am not sure if private tags can help on this issue, what we need to do
is to drain and in-flight requests(internal or io) from one reply queue(
MSI-X vector). That means private tags still have to deal with the reply
queue mapping.

Thanks,
Ming

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] blk-mq: Wait for for hctx inflight requests on CPU unplug
  2019-05-22 10:01       ` Ming Lei
@ 2019-05-22 12:21         ` John Garry
  0 siblings, 0 replies; 11+ messages in thread
From: John Garry @ 2019-05-22 12:21 UTC (permalink / raw)
  To: Ming Lei
  Cc: Jens Axboe, linux-block, Christoph Hellwig, Bart Van Assche,
	Hannes Reinecke, Keith Busch, Thomas Gleixner, linux-scsi,
	Kashyap Desai, chenxiang


>> I'd rather not work towards that
>>> direction because:
>>>
>>> 1) it is very hard to partition global resources into several parts,
>>> especially it is hard to make every part happy.
>>>
>>> 2) sbitmap is smart/efficient enough for this global allocation
>>>
>>> 3) no obvious improvement is obtained from the resource partition, according
>>> to previous experiment result done by Kashyap.
>>
>> I'd like to also do the test.
>>
>> However I would need to forward port the patchset, which no longer cleanly
>> applies (I was referring to this https://lore.kernel.org/linux-block/20180205152035.15016-1-ming.lei@redhat.com/).
>> Any help with that would be appreciated.
>
> The queue type change causes patches not applied any more.
>
> Could you just test the patch against v4.15 and see if there is any
> improvement?
>

I'd rather test against latest mainline, but if it is too difficult then 
I can backport LLDD stuff and test against 4.15. It may take a while.

> Even if it may improve performance on hisi_sas, I still suggest to not
> use that approach to solve the issue for draining in-flight requests when
> all CPUs of one hw queue becomes offline, since this way might hurt
> performance on other drivers.
>
>>
>>>
>>> I think we could implement the drain mechanism in the following way:
>>>
>>> 1) if 'struct blk_mq_hw_ctx' serves as completion queue, use the
>>> approach in the patch
>>
>> Maybe the gain of exposing multiple queues+managed interrupts outweighs the
>> loss in the LLDD of having to generate this unique tag with sbitmap; I know
>
> The unique tag has zero cost, see blk_mq_unique_tag().

But we want a tag unique in range [0, host tag count), which 
blk_mq_unique_tag() does not provide.

>
>> that we did not use sbitmap ever in the LLDD for generating the tag when
>> testing previously. However I'm still not too hopeful.
>>
>>>
>>> 2) otherwise:
>>> - introduce one callback of .prep_queue_dead(hctx, down_cpu) to
>>> 'struct blk_mq_ops'
>>
>> This would not be allowed to block, right?
>
> It is allowed to block in CPU hotplug handler.
>
>>
>>>
>>> - call .prep_queue_dead from blk_mq_hctx_notify_dead()
>>>
>>> 3) inside .prep_queue_dead():
>>> - the driver checks if all mapped CPU on the completion queue is offline
>>> - if yes, wait for in-flight requests originated from all CPUs mapped to
>>> this completion queue, and it can be implemented as one block layer API
>>
>> That could work. However I think that someone may ask why the LLDD just
>> doesn't register for the CPU hotplug event itself (which I would really
>> rather avoid), instead of being relayed the info from the block layer.
>
> .prep_queue_dead() is run from blk-mq's CPU hotplug handler.
>
> I also think of abstracting completion queue in blk-mq for hpsa,
> hisi_sas_v3_hw and  mpt3sas, but that can't cover to drain device's internal
> command, so looks it is inevitable for us to introduce driver callback.
>

On topic of internal commands, I assume that the approach would be to 
reserve tags in blk_mq_tag_set.reserved_tags (currently not set in 
scsi_mq_setup_tags()), and the LLDD would use 
blk_mq_alloc_request(,,BLK_MQ_REQ_RESERVED) to get a tag.

I guess that this may be Hannes' idea also (see "as then the block layer 
maintains all tags, and is able to figure out if the queue really is 
quiesced").

Thanks,
John

>
> Thanks,
> Ming
>
> .
>



^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] blk-mq: Wait for for hctx inflight requests on CPU unplug
  2019-05-22  9:47       ` Hannes Reinecke
  2019-05-22 10:31         ` Ming Lei
@ 2019-05-22 12:30         ` John Garry
  1 sibling, 0 replies; 11+ messages in thread
From: John Garry @ 2019-05-22 12:30 UTC (permalink / raw)
  To: Hannes Reinecke, Ming Lei
  Cc: Jens Axboe, linux-block, Christoph Hellwig, Bart Van Assche,
	Hannes Reinecke, Keith Busch, Thomas Gleixner, linux-scsi,
	Kashyap Desai, chenxiang

>
> But I still do think we need to handle this case; the HBA might not
> expose enough MSI-X vectors/hw queues for us to map to all CPUs.
> In which case we'd be running into the same situation.
>
> And I do think we _need_ to drain the associated completion queue as
> soon as _any_ CPU in that set is plugged; otherwise we can't ensure that
> any interrupt for pending I/O will _not_ arrive at the dead CPU.

Really? I did not think that it was possible for this to happen.

>
> And yes, this would amount to quiesce the HBA completely if only one
> queue is exposed. But there's no way around this; the alternative would
> be to code a fallback patch in each driver to catch missing completions.
> Which would actually be an interface change, requiring each vendor /
> maintainer to change their driver. Not very nice.
>
>>> Looks you suggest to expose all completion(reply) queues as 'struct
>>> blk_mq_hw_ctx',
>>> which may involve in another more hard problem:  how to split the single
>>> hostwide tags into each reply queue.
>>
>> Yes, and this is what I expecting to hear Re. hostwide tags.
>>
> But this case is handled already; things like lpfc and qla2xxx have been
> converted to this model (exposing all hw queues, and use a host-wide
> tagmap).
>
> So from that side there is not really an issue.
>
> I even provided patchset to convert megaraid_sas (cf 'megaraid_sas:
> enable blk-mq for fusion'); you might want to have a look there to see
> how it can be done.

ok, I'll have a search.

>
>> I'd rather not work towards that
>>> direction because:
>>>
>>> 1) it is very hard to partition global resources into several parts,
>>> especially it is hard to make every part happy.
>>>
>>> 2) sbitmap is smart/efficient enough for this global allocation
>>>
>>> 3) no obvious improvement is obtained from the resource partition,
>>> according
>>> to previous experiment result done by Kashyap.
>>
>> I'd like to also do the test.
>>
>> However I would need to forward port the patchset, which no longer
>> cleanly applies (I was referring to this
>> https://lore.kernel.org/linux-block/20180205152035.15016-1-ming.lei@redhat.com/).
>> Any help with that would be appreciated.
>>
> If you would post it on the mailing list (or send it to me) I can have a
> look. Converting SAS is on my list of things to do, anyway.
>

ok

>>>
>>> I think we could implement the drain mechanism in the following way:
>>>
>>> 1) if 'struct blk_mq_hw_ctx' serves as completion queue, use the
>>> approach in the patch
>>
>> Maybe the gain of exposing multiple queues+managed interrupts
>> outweighs the loss in the LLDD of having to generate this unique tag
>> with sbitmap; I know that we did not use sbitmap ever in the LLDD for
>> generating the tag when testing previously. However I'm still not too
>> hopeful.
>>
> Thing is, the tag _is_ already generated by the time the command is
> passed to the LLDD. So there is no overhead; you just need to establish
> a 1:1 mapping between SCSI cmds from the midlayer and your internal
> commands.
>
> Which is where the problem starts: if you have to use the same command
> pool for internal commands you have to set some tags aside to avoid a
> clash with the tags generated from the block layer.
> That's easily done, but if you do that quiescing is getting harder, as
> then the block layer wouldn't know about these internal commands.
> This is what I'm trying to address with my patchset to use private tags
> in SCSI, as then the block layer maintains all tags, and is able to
> figure out if the queue really is quiesced.
> (And I really need to post my patchset).

Ack

>
>>>
>>> 2) otherwise:
>>> - introduce one callback of .prep_queue_dead(hctx, down_cpu) to
>>> 'struct blk_mq_ops'
>>
>> This would not be allowed to block, right?
>>
>>>
>>> - call .prep_queue_dead from blk_mq_hctx_notify_dead()
>>>
>>> 3) inside .prep_queue_dead():
>>> - the driver checks if all mapped CPU on the completion queue is offline
>>> - if yes, wait for in-flight requests originated from all CPUs mapped to
>>> this completion queue, and it can be implemented as one block layer API
>>
>> That could work. However I think that someone may ask why the LLDD
>> just doesn't register for the CPU hotplug event itself (which I would
>> really rather avoid), instead of being relayed the info from the block
>> layer.
>>
> Again; what would you do if not all CPUs from a pool are gone?
> You still might be getting interrupts for non-associated interrupts, and
> quite some drivers are unhappy under these circumstances.
> Hence I guess it'll be better to quiesce the queue as soon as _any_ CPU
> from the pool is gone.
>
> Plus we could be doing this from the block layer without any callbacks
> from the driver...
>
> Cheers,
>
> Hannes

Thanks,
John




^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, back to index

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-05-17  9:14 [PATCH] blk-mq: Wait for for hctx inflight requests on CPU unplug Ming Lei
2019-05-21  7:40 ` Christoph Hellwig
2019-05-21  8:03   ` Ming Lei
2019-05-21 13:50 ` John Garry
2019-05-22  1:56   ` Ming Lei
2019-05-22  9:06     ` John Garry
2019-05-22  9:47       ` Hannes Reinecke
2019-05-22 10:31         ` Ming Lei
2019-05-22 12:30         ` John Garry
2019-05-22 10:01       ` Ming Lei
2019-05-22 12:21         ` John Garry

Linux-Block Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-block/0 linux-block/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-block linux-block/ https://lore.kernel.org/linux-block \
		linux-block@vger.kernel.org linux-block@archiver.kernel.org
	public-inbox-index linux-block


Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-block


AGPL code for this site: git clone https://public-inbox.org/ public-inbox