Linux-Block Archive on lore.kernel.org
 help / color / Atom feed
* [PATCH v5 0/2] improve nvme quiesce time for large amount of namespaces
@ 2020-07-27 23:10 Sagi Grimberg
  2020-07-27 23:10 ` [PATCH v5 1/2] blk-mq: add tagset quiesce interface Sagi Grimberg
  2020-07-27 23:10 ` [PATCH v5 2/2] nvme: use blk_mq_[un]quiesce_tagset Sagi Grimberg
  0 siblings, 2 replies; 40+ messages in thread
From: Sagi Grimberg @ 2020-07-27 23:10 UTC (permalink / raw)
  To: linux-nvme, Christoph Hellwig, Keith Busch, Jens Axboe
  Cc: linux-block, Ming Lin, Chao Leng

This set improves the quiesce time when using a large set of
namespaces, which also improves I/O failover time in a multipath environment.

We improve for both non-blocking hctxs and blocking hctxs introducing
blk_mq_[un]quiesce_tagset which works on all request queues over a given
tagset in parallel (which is the case in nvme) for both tagset types (blocking
and non-blocking);

Changes from v4:
- introduce blk_mq_[un]quiesce_tagset as one interface
- move nvme core to use this interface

Changes from v3:
- make hctx->rcu_sync dynamically allocated from the heap instead
  of a static member function

Changes from v2:
- made blk_mq_quiesce_queue_async operate on both blocking and
  non-blocking hctxs.
- removed separation between blocking vs. non-blocking queues
- dropeed patch from Chao
- dropped nvme-rdma test patch

Changes from v1:
- trivial typo fixes

*** SUBJECT HERE ***

*** BLURB HERE ***

Sagi Grimberg (2):
  blk-mq: add tagset quiesce interface
  nvme: use blk_mq_[un]quiesce_tagset

 block/blk-mq.c           | 66 ++++++++++++++++++++++++++++++++++++++++
 drivers/nvme/host/core.c | 14 ++-------
 include/linux/blk-mq.h   |  4 +++
 3 files changed, 72 insertions(+), 12 deletions(-)

-- 
2.25.1


^ permalink raw reply	[flat|nested] 40+ messages in thread

* [PATCH v5 1/2] blk-mq: add tagset quiesce interface
  2020-07-27 23:10 [PATCH v5 0/2] improve nvme quiesce time for large amount of namespaces Sagi Grimberg
@ 2020-07-27 23:10 ` Sagi Grimberg
  2020-07-27 23:32   ` Keith Busch
                     ` (2 more replies)
  2020-07-27 23:10 ` [PATCH v5 2/2] nvme: use blk_mq_[un]quiesce_tagset Sagi Grimberg
  1 sibling, 3 replies; 40+ messages in thread
From: Sagi Grimberg @ 2020-07-27 23:10 UTC (permalink / raw)
  To: linux-nvme, Christoph Hellwig, Keith Busch, Jens Axboe
  Cc: linux-block, Ming Lin, Chao Leng

drivers that have shared tagsets may need to quiesce potentially a lot
of request queues that all share a single tagset (e.g. nvme). Add an interface
to quiesce all the queues on a given tagset. This interface is useful because
it can speedup the quiesce by doing it in parallel.

For tagsets that have BLK_MQ_F_BLOCKING set, we use call_srcu to all hctxs
in parallel such that all of them wait for the same rcu elapsed period with
a per-hctx heap allocated rcu_synchronize. for tagsets that don't have
BLK_MQ_F_BLOCKING set, we simply call a single synchronize_rcu as this is
sufficient.

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 block/blk-mq.c         | 66 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/blk-mq.h |  4 +++
 2 files changed, 70 insertions(+)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index abcf590f6238..c37e37354330 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -209,6 +209,42 @@ void blk_mq_quiesce_queue_nowait(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
 
+static void blk_mq_quiesce_blocking_queue_async(struct request_queue *q)
+{
+	struct blk_mq_hw_ctx *hctx;
+	unsigned int i;
+
+	blk_mq_quiesce_queue_nowait(q);
+
+	queue_for_each_hw_ctx(q, hctx, i) {
+		WARN_ON_ONCE(!(hctx->flags & BLK_MQ_F_BLOCKING));
+		hctx->rcu_sync = kmalloc(sizeof(*hctx->rcu_sync), GFP_KERNEL);
+		if (!hctx->rcu_sync)
+			continue;
+
+		init_completion(&hctx->rcu_sync->completion);
+		init_rcu_head(&hctx->rcu_sync->head);
+		call_srcu(hctx->srcu, &hctx->rcu_sync->head,
+				wakeme_after_rcu);
+	}
+}
+
+static void blk_mq_quiesce_blocking_queue_async_wait(struct request_queue *q)
+{
+	struct blk_mq_hw_ctx *hctx;
+	unsigned int i;
+
+	queue_for_each_hw_ctx(q, hctx, i) {
+		WARN_ON_ONCE(!(hctx->flags & BLK_MQ_F_BLOCKING));
+		if (!hctx->rcu_sync) {
+			synchronize_srcu(hctx->srcu);
+			continue;
+		}
+		wait_for_completion(&hctx->rcu_sync->completion);
+		destroy_rcu_head(&hctx->rcu_sync->head);
+	}
+}
+
 /**
  * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
  * @q: request queue.
@@ -2884,6 +2920,36 @@ static void queue_set_hctx_shared(struct request_queue *q, bool shared)
 	}
 }
 
+void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set)
+{
+	struct request_queue *q;
+
+	mutex_lock(&set->tag_list_lock);
+	if (set->flags & BLK_MQ_F_BLOCKING) {
+		list_for_each_entry(q, &set->tag_list, tag_set_list)
+			blk_mq_quiesce_blocking_queue_async(q);
+		list_for_each_entry(q, &set->tag_list, tag_set_list)
+			blk_mq_quiesce_blocking_queue_async_wait(q);
+	} else {
+		list_for_each_entry(q, &set->tag_list, tag_set_list)
+			blk_mq_quiesce_queue_nowait(q);
+		synchronize_rcu();
+	}
+	mutex_unlock(&set->tag_list_lock);
+}
+EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset);
+
+void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set)
+{
+	struct request_queue *q;
+
+	mutex_lock(&set->tag_list_lock);
+	list_for_each_entry(q, &set->tag_list, tag_set_list)
+		blk_mq_unquiesce_queue(q);
+	mutex_unlock(&set->tag_list_lock);
+}
+EXPORT_SYMBOL_GPL(blk_mq_unquiesce_tagset);
+
 static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set,
 					bool shared)
 {
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 23230c1d031e..a85f2dedc947 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -5,6 +5,7 @@
 #include <linux/blkdev.h>
 #include <linux/sbitmap.h>
 #include <linux/srcu.h>
+#include <linux/rcupdate_wait.h>
 
 struct blk_mq_tags;
 struct blk_flush_queue;
@@ -170,6 +171,7 @@ struct blk_mq_hw_ctx {
 	 */
 	struct list_head	hctx_list;
 
+	struct rcu_synchronize	*rcu_sync;
 	/**
 	 * @srcu: Sleepable RCU. Use as lock when type of the hardware queue is
 	 * blocking (BLK_MQ_F_BLOCKING). Must be the last member - see also
@@ -532,6 +534,8 @@ int blk_mq_map_queues(struct blk_mq_queue_map *qmap);
 void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);
 
 void blk_mq_quiesce_queue_nowait(struct request_queue *q);
+void blk_mq_quiesce_tagset(struct request_queue *q);
+void blk_mq_unquiesce_tagset(struct request_queue *q);
 
 unsigned int blk_mq_rq_cpu(struct request *rq);
 
-- 
2.25.1


^ permalink raw reply	[flat|nested] 40+ messages in thread

* [PATCH v5 2/2] nvme: use blk_mq_[un]quiesce_tagset
  2020-07-27 23:10 [PATCH v5 0/2] improve nvme quiesce time for large amount of namespaces Sagi Grimberg
  2020-07-27 23:10 ` [PATCH v5 1/2] blk-mq: add tagset quiesce interface Sagi Grimberg
@ 2020-07-27 23:10 ` Sagi Grimberg
  2020-07-28  0:54   ` Sagi Grimberg
  1 sibling, 1 reply; 40+ messages in thread
From: Sagi Grimberg @ 2020-07-27 23:10 UTC (permalink / raw)
  To: linux-nvme, Christoph Hellwig, Keith Busch, Jens Axboe
  Cc: linux-block, Ming Lin, Chao Leng

All controller namespaces share the same tagset, so we
can use this interface which does the optimal operation
for parallel quiesce based on the tagset type (e.g.
blocking tagsets and non-blocking tagsets).

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/nvme/host/core.c | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 05aa568a60af..c41df20996d7 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4557,23 +4557,13 @@ EXPORT_SYMBOL_GPL(nvme_start_freeze);
 
 void nvme_stop_queues(struct nvme_ctrl *ctrl)
 {
-	struct nvme_ns *ns;
-
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
-		blk_mq_quiesce_queue(ns->queue);
-	up_read(&ctrl->namespaces_rwsem);
+	blk_mq_quiesce_tagset(ctrl->tagset);
 }
 EXPORT_SYMBOL_GPL(nvme_stop_queues);
 
 void nvme_start_queues(struct nvme_ctrl *ctrl)
 {
-	struct nvme_ns *ns;
-
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
-		blk_mq_unquiesce_queue(ns->queue);
-	up_read(&ctrl->namespaces_rwsem);
+	blk_mq_unquiesce_tagset(ctrl->tagset);
 }
 EXPORT_SYMBOL_GPL(nvme_start_queues);
 
-- 
2.25.1


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v5 1/2] blk-mq: add tagset quiesce interface
  2020-07-27 23:10 ` [PATCH v5 1/2] blk-mq: add tagset quiesce interface Sagi Grimberg
@ 2020-07-27 23:32   ` Keith Busch
  2020-07-28  0:12     ` Sagi Grimberg
  2020-07-28  1:40   ` Ming Lei
  2020-07-28  7:18   ` Christoph Hellwig
  2 siblings, 1 reply; 40+ messages in thread
From: Keith Busch @ 2020-07-27 23:32 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: linux-nvme, Christoph Hellwig, Jens Axboe, linux-block, Ming Lin,
	Chao Leng

On Mon, Jul 27, 2020 at 04:10:21PM -0700, Sagi Grimberg wrote:
> +static void blk_mq_quiesce_blocking_queue_async_wait(struct request_queue *q)
> +{
> +	struct blk_mq_hw_ctx *hctx;
> +	unsigned int i;
> +
> +	queue_for_each_hw_ctx(q, hctx, i) {
> +		WARN_ON_ONCE(!(hctx->flags & BLK_MQ_F_BLOCKING));
> +		if (!hctx->rcu_sync) {
> +			synchronize_srcu(hctx->srcu);
> +			continue;
> +		}
> +		wait_for_completion(&hctx->rcu_sync->completion);
> +		destroy_rcu_head(&hctx->rcu_sync->head);

Leaking rcu_sync, and not clearing it across quiesce contexts. Needs:

		kfree(hctx->rcu_sync);
		hctx->rcu_sync = NULL;

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v5 1/2] blk-mq: add tagset quiesce interface
  2020-07-27 23:32   ` Keith Busch
@ 2020-07-28  0:12     ` Sagi Grimberg
  0 siblings, 0 replies; 40+ messages in thread
From: Sagi Grimberg @ 2020-07-28  0:12 UTC (permalink / raw)
  To: Keith Busch
  Cc: linux-nvme, Christoph Hellwig, Jens Axboe, linux-block, Ming Lin,
	Chao Leng


>> +static void blk_mq_quiesce_blocking_queue_async_wait(struct request_queue *q)
>> +{
>> +	struct blk_mq_hw_ctx *hctx;
>> +	unsigned int i;
>> +
>> +	queue_for_each_hw_ctx(q, hctx, i) {
>> +		WARN_ON_ONCE(!(hctx->flags & BLK_MQ_F_BLOCKING));
>> +		if (!hctx->rcu_sync) {
>> +			synchronize_srcu(hctx->srcu);
>> +			continue;
>> +		}
>> +		wait_for_completion(&hctx->rcu_sync->completion);
>> +		destroy_rcu_head(&hctx->rcu_sync->head);
> 
> Leaking rcu_sync, and not clearing it across quiesce contexts. Needs:
> 
> 		kfree(hctx->rcu_sync);
> 		hctx->rcu_sync = NULL;

Good catch, will wait for some more review and submit a v6.

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v5 2/2] nvme: use blk_mq_[un]quiesce_tagset
  2020-07-27 23:10 ` [PATCH v5 2/2] nvme: use blk_mq_[un]quiesce_tagset Sagi Grimberg
@ 2020-07-28  0:54   ` Sagi Grimberg
  2020-07-28  3:21     ` Chao Leng
  0 siblings, 1 reply; 40+ messages in thread
From: Sagi Grimberg @ 2020-07-28  0:54 UTC (permalink / raw)
  To: linux-nvme, Christoph Hellwig, Keith Busch, Jens Axboe
  Cc: linux-block, Ming Lin, Chao Leng



On 7/27/20 4:10 PM, Sagi Grimberg wrote:
> All controller namespaces share the same tagset, so we
> can use this interface which does the optimal operation
> for parallel quiesce based on the tagset type (e.g.
> blocking tagsets and non-blocking tagsets).
> 
> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
> ---
>   drivers/nvme/host/core.c | 14 ++------------
>   1 file changed, 2 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 05aa568a60af..c41df20996d7 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -4557,23 +4557,13 @@ EXPORT_SYMBOL_GPL(nvme_start_freeze);
>   
>   void nvme_stop_queues(struct nvme_ctrl *ctrl)
>   {
> -	struct nvme_ns *ns;
> -
> -	down_read(&ctrl->namespaces_rwsem);
> -	list_for_each_entry(ns, &ctrl->namespaces, list)
> -		blk_mq_quiesce_queue(ns->queue);
> -	up_read(&ctrl->namespaces_rwsem);
> +	blk_mq_quiesce_tagset(ctrl->tagset);

Rrr.. this one is slightly annoying. We have the connect_q in
fabrics that we use to issue the connect command which is now
quiesced too...

If we will use this interface, we can unquiesce it right away,
but that seems kinda backwards...

--
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 05aa568a60af..70af0ff63e7f 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4557,23 +4557,15 @@ EXPORT_SYMBOL_GPL(nvme_start_freeze);

  void nvme_stop_queues(struct nvme_ctrl *ctrl)
  {
-       struct nvme_ns *ns;
-
-       down_read(&ctrl->namespaces_rwsem);
-       list_for_each_entry(ns, &ctrl->namespaces, list)
-               blk_mq_quiesce_queue(ns->queue);
-       up_read(&ctrl->namespaces_rwsem);
+       blk_mq_quiesce_tagset(ctrl->tagset);
+       if (ctrl->connect_q)
+               blk_mq_unquiesce_queue(ctrl->connect_q);
  }
  EXPORT_SYMBOL_GPL(nvme_stop_queues);
--

Thoughts?

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v5 1/2] blk-mq: add tagset quiesce interface
  2020-07-27 23:10 ` [PATCH v5 1/2] blk-mq: add tagset quiesce interface Sagi Grimberg
  2020-07-27 23:32   ` Keith Busch
@ 2020-07-28  1:40   ` Ming Lei
  2020-07-28  1:51     ` Jens Axboe
  2020-07-28  3:25     ` Sagi Grimberg
  2020-07-28  7:18   ` Christoph Hellwig
  2 siblings, 2 replies; 40+ messages in thread
From: Ming Lei @ 2020-07-28  1:40 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: linux-nvme, Christoph Hellwig, Keith Busch, Jens Axboe,
	linux-block, Ming Lin, Chao Leng

On Mon, Jul 27, 2020 at 04:10:21PM -0700, Sagi Grimberg wrote:
> drivers that have shared tagsets may need to quiesce potentially a lot
> of request queues that all share a single tagset (e.g. nvme). Add an interface
> to quiesce all the queues on a given tagset. This interface is useful because
> it can speedup the quiesce by doing it in parallel.
> 
> For tagsets that have BLK_MQ_F_BLOCKING set, we use call_srcu to all hctxs
> in parallel such that all of them wait for the same rcu elapsed period with
> a per-hctx heap allocated rcu_synchronize. for tagsets that don't have
> BLK_MQ_F_BLOCKING set, we simply call a single synchronize_rcu as this is
> sufficient.
> 
> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
> ---
>  block/blk-mq.c         | 66 ++++++++++++++++++++++++++++++++++++++++++
>  include/linux/blk-mq.h |  4 +++
>  2 files changed, 70 insertions(+)
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index abcf590f6238..c37e37354330 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -209,6 +209,42 @@ void blk_mq_quiesce_queue_nowait(struct request_queue *q)
>  }
>  EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
>  
> +static void blk_mq_quiesce_blocking_queue_async(struct request_queue *q)
> +{
> +	struct blk_mq_hw_ctx *hctx;
> +	unsigned int i;
> +
> +	blk_mq_quiesce_queue_nowait(q);
> +
> +	queue_for_each_hw_ctx(q, hctx, i) {
> +		WARN_ON_ONCE(!(hctx->flags & BLK_MQ_F_BLOCKING));
> +		hctx->rcu_sync = kmalloc(sizeof(*hctx->rcu_sync), GFP_KERNEL);
> +		if (!hctx->rcu_sync)
> +			continue;

This approach of quiesce/unquiesce tagset is good abstraction.

Just one more thing, please allocate a rcu_sync array because hctx is supposed
to not store scratch stuff.


thanks, 
Ming


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v5 1/2] blk-mq: add tagset quiesce interface
  2020-07-28  1:40   ` Ming Lei
@ 2020-07-28  1:51     ` Jens Axboe
  2020-07-28  2:17       ` Ming Lei
  2020-07-28  3:25     ` Sagi Grimberg
  1 sibling, 1 reply; 40+ messages in thread
From: Jens Axboe @ 2020-07-28  1:51 UTC (permalink / raw)
  To: Ming Lei, Sagi Grimberg
  Cc: linux-nvme, Christoph Hellwig, Keith Busch, linux-block,
	Ming Lin, Chao Leng

On 7/27/20 7:40 PM, Ming Lei wrote:
> On Mon, Jul 27, 2020 at 04:10:21PM -0700, Sagi Grimberg wrote:
>> drivers that have shared tagsets may need to quiesce potentially a lot
>> of request queues that all share a single tagset (e.g. nvme). Add an interface
>> to quiesce all the queues on a given tagset. This interface is useful because
>> it can speedup the quiesce by doing it in parallel.
>>
>> For tagsets that have BLK_MQ_F_BLOCKING set, we use call_srcu to all hctxs
>> in parallel such that all of them wait for the same rcu elapsed period with
>> a per-hctx heap allocated rcu_synchronize. for tagsets that don't have
>> BLK_MQ_F_BLOCKING set, we simply call a single synchronize_rcu as this is
>> sufficient.
>>
>> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
>> ---
>>  block/blk-mq.c         | 66 ++++++++++++++++++++++++++++++++++++++++++
>>  include/linux/blk-mq.h |  4 +++
>>  2 files changed, 70 insertions(+)
>>
>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>> index abcf590f6238..c37e37354330 100644
>> --- a/block/blk-mq.c
>> +++ b/block/blk-mq.c
>> @@ -209,6 +209,42 @@ void blk_mq_quiesce_queue_nowait(struct request_queue *q)
>>  }
>>  EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
>>  
>> +static void blk_mq_quiesce_blocking_queue_async(struct request_queue *q)
>> +{
>> +	struct blk_mq_hw_ctx *hctx;
>> +	unsigned int i;
>> +
>> +	blk_mq_quiesce_queue_nowait(q);
>> +
>> +	queue_for_each_hw_ctx(q, hctx, i) {
>> +		WARN_ON_ONCE(!(hctx->flags & BLK_MQ_F_BLOCKING));
>> +		hctx->rcu_sync = kmalloc(sizeof(*hctx->rcu_sync), GFP_KERNEL);
>> +		if (!hctx->rcu_sync)
>> +			continue;
> 
> This approach of quiesce/unquiesce tagset is good abstraction.
> 
> Just one more thing, please allocate a rcu_sync array because hctx is
> supposed to not store scratch stuff.

I'd be all for not stuffing this in the hctx, but how would that work?
The only thing I can think of that would work reliably is batching the
queue+wait into units of N. We could potentially have many thousands of
queues, and it could get iffy (and/or unreliable) in terms of allocation
size. Looks like rcu_synchronize is 48-bytes on my local install, and it
doesn't take a lot of devices at current CPU counts to make an alloc
covering all of it huge. Let's say 64 threads, and 32 devices, then
we're already at 64*32*48 bytes which is an order 5 allocation. Not
friendly, and not going to be reliable when you need it. And if we start
batching in reasonable counts, then we're _almost_ back to doing a queue
or two at the time... 32 * 48 is 1536 bytes, so we could only do two at
the time for single page allocations.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v5 1/2] blk-mq: add tagset quiesce interface
  2020-07-28  1:51     ` Jens Axboe
@ 2020-07-28  2:17       ` Ming Lei
  2020-07-28  2:23         ` Jens Axboe
  0 siblings, 1 reply; 40+ messages in thread
From: Ming Lei @ 2020-07-28  2:17 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Sagi Grimberg, linux-nvme, Christoph Hellwig, Keith Busch,
	linux-block, Ming Lin, Chao Leng

On Mon, Jul 27, 2020 at 07:51:16PM -0600, Jens Axboe wrote:
> On 7/27/20 7:40 PM, Ming Lei wrote:
> > On Mon, Jul 27, 2020 at 04:10:21PM -0700, Sagi Grimberg wrote:
> >> drivers that have shared tagsets may need to quiesce potentially a lot
> >> of request queues that all share a single tagset (e.g. nvme). Add an interface
> >> to quiesce all the queues on a given tagset. This interface is useful because
> >> it can speedup the quiesce by doing it in parallel.
> >>
> >> For tagsets that have BLK_MQ_F_BLOCKING set, we use call_srcu to all hctxs
> >> in parallel such that all of them wait for the same rcu elapsed period with
> >> a per-hctx heap allocated rcu_synchronize. for tagsets that don't have
> >> BLK_MQ_F_BLOCKING set, we simply call a single synchronize_rcu as this is
> >> sufficient.
> >>
> >> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
> >> ---
> >>  block/blk-mq.c         | 66 ++++++++++++++++++++++++++++++++++++++++++
> >>  include/linux/blk-mq.h |  4 +++
> >>  2 files changed, 70 insertions(+)
> >>
> >> diff --git a/block/blk-mq.c b/block/blk-mq.c
> >> index abcf590f6238..c37e37354330 100644
> >> --- a/block/blk-mq.c
> >> +++ b/block/blk-mq.c
> >> @@ -209,6 +209,42 @@ void blk_mq_quiesce_queue_nowait(struct request_queue *q)
> >>  }
> >>  EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
> >>  
> >> +static void blk_mq_quiesce_blocking_queue_async(struct request_queue *q)
> >> +{
> >> +	struct blk_mq_hw_ctx *hctx;
> >> +	unsigned int i;
> >> +
> >> +	blk_mq_quiesce_queue_nowait(q);
> >> +
> >> +	queue_for_each_hw_ctx(q, hctx, i) {
> >> +		WARN_ON_ONCE(!(hctx->flags & BLK_MQ_F_BLOCKING));
> >> +		hctx->rcu_sync = kmalloc(sizeof(*hctx->rcu_sync), GFP_KERNEL);
> >> +		if (!hctx->rcu_sync)
> >> +			continue;
> > 
> > This approach of quiesce/unquiesce tagset is good abstraction.
> > 
> > Just one more thing, please allocate a rcu_sync array because hctx is
> > supposed to not store scratch stuff.
> 
> I'd be all for not stuffing this in the hctx, but how would that work?
> The only thing I can think of that would work reliably is batching the
> queue+wait into units of N. We could potentially have many thousands of
> queues, and it could get iffy (and/or unreliable) in terms of allocation
> size. Looks like rcu_synchronize is 48-bytes on my local install, and it
> doesn't take a lot of devices at current CPU counts to make an alloc
> covering all of it huge. Let's say 64 threads, and 32 devices, then
> we're already at 64*32*48 bytes which is an order 5 allocation. Not
> friendly, and not going to be reliable when you need it. And if we start
> batching in reasonable counts, then we're _almost_ back to doing a queue
> or two at the time... 32 * 48 is 1536 bytes, so we could only do two at
> the time for single page allocations.

We can convert to order 0 allocation by one extra indirect array. 


Thanks,
Ming


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v5 1/2] blk-mq: add tagset quiesce interface
  2020-07-28  2:17       ` Ming Lei
@ 2020-07-28  2:23         ` Jens Axboe
  2020-07-28  2:28           ` Ming Lei
  0 siblings, 1 reply; 40+ messages in thread
From: Jens Axboe @ 2020-07-28  2:23 UTC (permalink / raw)
  To: Ming Lei
  Cc: Sagi Grimberg, linux-nvme, Christoph Hellwig, Keith Busch,
	linux-block, Ming Lin, Chao Leng

On 7/27/20 8:17 PM, Ming Lei wrote:
> On Mon, Jul 27, 2020 at 07:51:16PM -0600, Jens Axboe wrote:
>> On 7/27/20 7:40 PM, Ming Lei wrote:
>>> On Mon, Jul 27, 2020 at 04:10:21PM -0700, Sagi Grimberg wrote:
>>>> drivers that have shared tagsets may need to quiesce potentially a lot
>>>> of request queues that all share a single tagset (e.g. nvme). Add an interface
>>>> to quiesce all the queues on a given tagset. This interface is useful because
>>>> it can speedup the quiesce by doing it in parallel.
>>>>
>>>> For tagsets that have BLK_MQ_F_BLOCKING set, we use call_srcu to all hctxs
>>>> in parallel such that all of them wait for the same rcu elapsed period with
>>>> a per-hctx heap allocated rcu_synchronize. for tagsets that don't have
>>>> BLK_MQ_F_BLOCKING set, we simply call a single synchronize_rcu as this is
>>>> sufficient.
>>>>
>>>> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
>>>> ---
>>>>  block/blk-mq.c         | 66 ++++++++++++++++++++++++++++++++++++++++++
>>>>  include/linux/blk-mq.h |  4 +++
>>>>  2 files changed, 70 insertions(+)
>>>>
>>>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>>>> index abcf590f6238..c37e37354330 100644
>>>> --- a/block/blk-mq.c
>>>> +++ b/block/blk-mq.c
>>>> @@ -209,6 +209,42 @@ void blk_mq_quiesce_queue_nowait(struct request_queue *q)
>>>>  }
>>>>  EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
>>>>  
>>>> +static void blk_mq_quiesce_blocking_queue_async(struct request_queue *q)
>>>> +{
>>>> +	struct blk_mq_hw_ctx *hctx;
>>>> +	unsigned int i;
>>>> +
>>>> +	blk_mq_quiesce_queue_nowait(q);
>>>> +
>>>> +	queue_for_each_hw_ctx(q, hctx, i) {
>>>> +		WARN_ON_ONCE(!(hctx->flags & BLK_MQ_F_BLOCKING));
>>>> +		hctx->rcu_sync = kmalloc(sizeof(*hctx->rcu_sync), GFP_KERNEL);
>>>> +		if (!hctx->rcu_sync)
>>>> +			continue;
>>>
>>> This approach of quiesce/unquiesce tagset is good abstraction.
>>>
>>> Just one more thing, please allocate a rcu_sync array because hctx is
>>> supposed to not store scratch stuff.
>>
>> I'd be all for not stuffing this in the hctx, but how would that work?
>> The only thing I can think of that would work reliably is batching the
>> queue+wait into units of N. We could potentially have many thousands of
>> queues, and it could get iffy (and/or unreliable) in terms of allocation
>> size. Looks like rcu_synchronize is 48-bytes on my local install, and it
>> doesn't take a lot of devices at current CPU counts to make an alloc
>> covering all of it huge. Let's say 64 threads, and 32 devices, then
>> we're already at 64*32*48 bytes which is an order 5 allocation. Not
>> friendly, and not going to be reliable when you need it. And if we start
>> batching in reasonable counts, then we're _almost_ back to doing a queue
>> or two at the time... 32 * 48 is 1536 bytes, so we could only do two at
>> the time for single page allocations.
> 
> We can convert to order 0 allocation by one extra indirect array. 

I guess that could work, and would just be one extra alloc + free if we
still retain the batch. That'd take it to 16 devices (at 32 CPUs) per
round, potentially way less of course if we have more CPUs. So still
somewhat limiting, rather than do all at once.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v5 1/2] blk-mq: add tagset quiesce interface
  2020-07-28  2:23         ` Jens Axboe
@ 2020-07-28  2:28           ` Ming Lei
  2020-07-28  2:32             ` Jens Axboe
  0 siblings, 1 reply; 40+ messages in thread
From: Ming Lei @ 2020-07-28  2:28 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Sagi Grimberg, linux-nvme, Christoph Hellwig, Keith Busch,
	linux-block, Ming Lin, Chao Leng

On Mon, Jul 27, 2020 at 08:23:15PM -0600, Jens Axboe wrote:
> On 7/27/20 8:17 PM, Ming Lei wrote:
> > On Mon, Jul 27, 2020 at 07:51:16PM -0600, Jens Axboe wrote:
> >> On 7/27/20 7:40 PM, Ming Lei wrote:
> >>> On Mon, Jul 27, 2020 at 04:10:21PM -0700, Sagi Grimberg wrote:
> >>>> drivers that have shared tagsets may need to quiesce potentially a lot
> >>>> of request queues that all share a single tagset (e.g. nvme). Add an interface
> >>>> to quiesce all the queues on a given tagset. This interface is useful because
> >>>> it can speedup the quiesce by doing it in parallel.
> >>>>
> >>>> For tagsets that have BLK_MQ_F_BLOCKING set, we use call_srcu to all hctxs
> >>>> in parallel such that all of them wait for the same rcu elapsed period with
> >>>> a per-hctx heap allocated rcu_synchronize. for tagsets that don't have
> >>>> BLK_MQ_F_BLOCKING set, we simply call a single synchronize_rcu as this is
> >>>> sufficient.
> >>>>
> >>>> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
> >>>> ---
> >>>>  block/blk-mq.c         | 66 ++++++++++++++++++++++++++++++++++++++++++
> >>>>  include/linux/blk-mq.h |  4 +++
> >>>>  2 files changed, 70 insertions(+)
> >>>>
> >>>> diff --git a/block/blk-mq.c b/block/blk-mq.c
> >>>> index abcf590f6238..c37e37354330 100644
> >>>> --- a/block/blk-mq.c
> >>>> +++ b/block/blk-mq.c
> >>>> @@ -209,6 +209,42 @@ void blk_mq_quiesce_queue_nowait(struct request_queue *q)
> >>>>  }
> >>>>  EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
> >>>>  
> >>>> +static void blk_mq_quiesce_blocking_queue_async(struct request_queue *q)
> >>>> +{
> >>>> +	struct blk_mq_hw_ctx *hctx;
> >>>> +	unsigned int i;
> >>>> +
> >>>> +	blk_mq_quiesce_queue_nowait(q);
> >>>> +
> >>>> +	queue_for_each_hw_ctx(q, hctx, i) {
> >>>> +		WARN_ON_ONCE(!(hctx->flags & BLK_MQ_F_BLOCKING));
> >>>> +		hctx->rcu_sync = kmalloc(sizeof(*hctx->rcu_sync), GFP_KERNEL);
> >>>> +		if (!hctx->rcu_sync)
> >>>> +			continue;
> >>>
> >>> This approach of quiesce/unquiesce tagset is good abstraction.
> >>>
> >>> Just one more thing, please allocate a rcu_sync array because hctx is
> >>> supposed to not store scratch stuff.
> >>
> >> I'd be all for not stuffing this in the hctx, but how would that work?
> >> The only thing I can think of that would work reliably is batching the
> >> queue+wait into units of N. We could potentially have many thousands of
> >> queues, and it could get iffy (and/or unreliable) in terms of allocation
> >> size. Looks like rcu_synchronize is 48-bytes on my local install, and it
> >> doesn't take a lot of devices at current CPU counts to make an alloc
> >> covering all of it huge. Let's say 64 threads, and 32 devices, then
> >> we're already at 64*32*48 bytes which is an order 5 allocation. Not
> >> friendly, and not going to be reliable when you need it. And if we start
> >> batching in reasonable counts, then we're _almost_ back to doing a queue
> >> or two at the time... 32 * 48 is 1536 bytes, so we could only do two at
> >> the time for single page allocations.
> > 
> > We can convert to order 0 allocation by one extra indirect array. 
> 
> I guess that could work, and would just be one extra alloc + free if we
> still retain the batch. That'd take it to 16 devices (at 32 CPUs) per
> round, potentially way less of course if we have more CPUs. So still
> somewhat limiting, rather than do all at once.

With the approach in blk_mq_alloc_rqs(), each allocated page can be added
to one list, so the indirect array can be saved. Then it is possible to allocate
for any size queues/devices since every allocation is just for single page
in case that it is needed, even no pre-calculation is required.


Thanks,
Ming


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v5 1/2] blk-mq: add tagset quiesce interface
  2020-07-28  2:28           ` Ming Lei
@ 2020-07-28  2:32             ` Jens Axboe
  2020-07-28  3:29               ` Sagi Grimberg
  0 siblings, 1 reply; 40+ messages in thread
From: Jens Axboe @ 2020-07-28  2:32 UTC (permalink / raw)
  To: Ming Lei
  Cc: Sagi Grimberg, linux-nvme, Christoph Hellwig, Keith Busch,
	linux-block, Ming Lin, Chao Leng

On 7/27/20 8:28 PM, Ming Lei wrote:
> On Mon, Jul 27, 2020 at 08:23:15PM -0600, Jens Axboe wrote:
>> On 7/27/20 8:17 PM, Ming Lei wrote:
>>> On Mon, Jul 27, 2020 at 07:51:16PM -0600, Jens Axboe wrote:
>>>> On 7/27/20 7:40 PM, Ming Lei wrote:
>>>>> On Mon, Jul 27, 2020 at 04:10:21PM -0700, Sagi Grimberg wrote:
>>>>>> drivers that have shared tagsets may need to quiesce potentially a lot
>>>>>> of request queues that all share a single tagset (e.g. nvme). Add an interface
>>>>>> to quiesce all the queues on a given tagset. This interface is useful because
>>>>>> it can speedup the quiesce by doing it in parallel.
>>>>>>
>>>>>> For tagsets that have BLK_MQ_F_BLOCKING set, we use call_srcu to all hctxs
>>>>>> in parallel such that all of them wait for the same rcu elapsed period with
>>>>>> a per-hctx heap allocated rcu_synchronize. for tagsets that don't have
>>>>>> BLK_MQ_F_BLOCKING set, we simply call a single synchronize_rcu as this is
>>>>>> sufficient.
>>>>>>
>>>>>> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
>>>>>> ---
>>>>>>  block/blk-mq.c         | 66 ++++++++++++++++++++++++++++++++++++++++++
>>>>>>  include/linux/blk-mq.h |  4 +++
>>>>>>  2 files changed, 70 insertions(+)
>>>>>>
>>>>>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>>>>>> index abcf590f6238..c37e37354330 100644
>>>>>> --- a/block/blk-mq.c
>>>>>> +++ b/block/blk-mq.c
>>>>>> @@ -209,6 +209,42 @@ void blk_mq_quiesce_queue_nowait(struct request_queue *q)
>>>>>>  }
>>>>>>  EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
>>>>>>  
>>>>>> +static void blk_mq_quiesce_blocking_queue_async(struct request_queue *q)
>>>>>> +{
>>>>>> +	struct blk_mq_hw_ctx *hctx;
>>>>>> +	unsigned int i;
>>>>>> +
>>>>>> +	blk_mq_quiesce_queue_nowait(q);
>>>>>> +
>>>>>> +	queue_for_each_hw_ctx(q, hctx, i) {
>>>>>> +		WARN_ON_ONCE(!(hctx->flags & BLK_MQ_F_BLOCKING));
>>>>>> +		hctx->rcu_sync = kmalloc(sizeof(*hctx->rcu_sync), GFP_KERNEL);
>>>>>> +		if (!hctx->rcu_sync)
>>>>>> +			continue;
>>>>>
>>>>> This approach of quiesce/unquiesce tagset is good abstraction.
>>>>>
>>>>> Just one more thing, please allocate a rcu_sync array because hctx is
>>>>> supposed to not store scratch stuff.
>>>>
>>>> I'd be all for not stuffing this in the hctx, but how would that work?
>>>> The only thing I can think of that would work reliably is batching the
>>>> queue+wait into units of N. We could potentially have many thousands of
>>>> queues, and it could get iffy (and/or unreliable) in terms of allocation
>>>> size. Looks like rcu_synchronize is 48-bytes on my local install, and it
>>>> doesn't take a lot of devices at current CPU counts to make an alloc
>>>> covering all of it huge. Let's say 64 threads, and 32 devices, then
>>>> we're already at 64*32*48 bytes which is an order 5 allocation. Not
>>>> friendly, and not going to be reliable when you need it. And if we start
>>>> batching in reasonable counts, then we're _almost_ back to doing a queue
>>>> or two at the time... 32 * 48 is 1536 bytes, so we could only do two at
>>>> the time for single page allocations.
>>>
>>> We can convert to order 0 allocation by one extra indirect array. 
>>
>> I guess that could work, and would just be one extra alloc + free if we
>> still retain the batch. That'd take it to 16 devices (at 32 CPUs) per
>> round, potentially way less of course if we have more CPUs. So still
>> somewhat limiting, rather than do all at once.
> 
> With the approach in blk_mq_alloc_rqs(), each allocated page can be
> added to one list, so the indirect array can be saved. Then it is
> possible to allocate for any size queues/devices since every
> allocation is just for single page in case that it is needed, even no
> pre-calculation is required.

As long as we watch the complexity, don't think we need to go overboard
here in the risk of adding issues for the failure path. But yes, we
could use the same trick I did in blk_mq_alloc_rqs() and just alloc
pages as we go.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v5 2/2] nvme: use blk_mq_[un]quiesce_tagset
  2020-07-28  0:54   ` Sagi Grimberg
@ 2020-07-28  3:21     ` Chao Leng
  2020-07-28  3:34       ` Sagi Grimberg
  0 siblings, 1 reply; 40+ messages in thread
From: Chao Leng @ 2020-07-28  3:21 UTC (permalink / raw)
  To: Sagi Grimberg, linux-nvme, Christoph Hellwig, Keith Busch, Jens Axboe
  Cc: linux-block, Ming Lin



On 2020/7/28 8:54, Sagi Grimberg wrote:
> 
> 
> On 7/27/20 4:10 PM, Sagi Grimberg wrote:
>> All controller namespaces share the same tagset, so we
>> can use this interface which does the optimal operation
>> for parallel quiesce based on the tagset type (e.g.
>> blocking tagsets and non-blocking tagsets).
>>
>> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
>> ---
>>   drivers/nvme/host/core.c | 14 ++------------
>>   1 file changed, 2 insertions(+), 12 deletions(-)
>>
>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>> index 05aa568a60af..c41df20996d7 100644
>> --- a/drivers/nvme/host/core.c
>> +++ b/drivers/nvme/host/core.c
>> @@ -4557,23 +4557,13 @@ EXPORT_SYMBOL_GPL(nvme_start_freeze);
>>   void nvme_stop_queues(struct nvme_ctrl *ctrl)
>>   {
>> -    struct nvme_ns *ns;
>> -
>> -    down_read(&ctrl->namespaces_rwsem);
>> -    list_for_each_entry(ns, &ctrl->namespaces, list)
>> -        blk_mq_quiesce_queue(ns->queue);
>> -    up_read(&ctrl->namespaces_rwsem);
>> +    blk_mq_quiesce_tagset(ctrl->tagset);
> 
> Rrr.. this one is slightly annoying. We have the connect_q in
> fabrics that we use to issue the connect command which is now
> quiesced too...
> 
> If we will use this interface, we can unquiesce it right away,
> but that seems kinda backwards..Io queue and admin queue has different treat mechanism, introduce
blk_mq_quiesce_tagset may make the mechanism unclear. So this is
probably not a good choice.

> 
> -- 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 05aa568a60af..70af0ff63e7f 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -4557,23 +4557,15 @@ EXPORT_SYMBOL_GPL(nvme_start_freeze);
> 
>   void nvme_stop_queues(struct nvme_ctrl *ctrl)
>   {
> -       struct nvme_ns *ns;
> -
> -       down_read(&ctrl->namespaces_rwsem);
> -       list_for_each_entry(ns, &ctrl->namespaces, list)
> -               blk_mq_quiesce_queue(ns->queue);
> -       up_read(&ctrl->namespaces_rwsem);
> +       blk_mq_quiesce_tagset(ctrl->tagset);
> +       if (ctrl->connect_q)
> +               blk_mq_unquiesce_queue(ctrl->connect_q);
>   }
>   EXPORT_SYMBOL_GPL(nvme_stop_queues);
> -- 
> 
> Thoughts?
> .

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v5 1/2] blk-mq: add tagset quiesce interface
  2020-07-28  1:40   ` Ming Lei
  2020-07-28  1:51     ` Jens Axboe
@ 2020-07-28  3:25     ` Sagi Grimberg
  1 sibling, 0 replies; 40+ messages in thread
From: Sagi Grimberg @ 2020-07-28  3:25 UTC (permalink / raw)
  To: Ming Lei
  Cc: linux-nvme, Christoph Hellwig, Keith Busch, Jens Axboe,
	linux-block, Ming Lin, Chao Leng


>> drivers that have shared tagsets may need to quiesce potentially a lot
>> of request queues that all share a single tagset (e.g. nvme). Add an interface
>> to quiesce all the queues on a given tagset. This interface is useful because
>> it can speedup the quiesce by doing it in parallel.
>>
>> For tagsets that have BLK_MQ_F_BLOCKING set, we use call_srcu to all hctxs
>> in parallel such that all of them wait for the same rcu elapsed period with
>> a per-hctx heap allocated rcu_synchronize. for tagsets that don't have
>> BLK_MQ_F_BLOCKING set, we simply call a single synchronize_rcu as this is
>> sufficient.
>>
>> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
>> ---
>>   block/blk-mq.c         | 66 ++++++++++++++++++++++++++++++++++++++++++
>>   include/linux/blk-mq.h |  4 +++
>>   2 files changed, 70 insertions(+)
>>
>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>> index abcf590f6238..c37e37354330 100644
>> --- a/block/blk-mq.c
>> +++ b/block/blk-mq.c
>> @@ -209,6 +209,42 @@ void blk_mq_quiesce_queue_nowait(struct request_queue *q)
>>   }
>>   EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
>>   
>> +static void blk_mq_quiesce_blocking_queue_async(struct request_queue *q)
>> +{
>> +	struct blk_mq_hw_ctx *hctx;
>> +	unsigned int i;
>> +
>> +	blk_mq_quiesce_queue_nowait(q);
>> +
>> +	queue_for_each_hw_ctx(q, hctx, i) {
>> +		WARN_ON_ONCE(!(hctx->flags & BLK_MQ_F_BLOCKING));
>> +		hctx->rcu_sync = kmalloc(sizeof(*hctx->rcu_sync), GFP_KERNEL);
>> +		if (!hctx->rcu_sync)
>> +			continue;
> 
> This approach of quiesce/unquiesce tagset is good abstraction.

I'm thinking I don't want to use this interface, it quiesce the
connect_q hctxs as well and then I need to unquiesce them which is
awkward...

> Just one more thing, please allocate a rcu_sync array because hctx is supposed
> to not store scratch stuff.

Sorry, I object to any interface that returns back the rcu_sync
allocation. Its just awkward. really..

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v5 1/2] blk-mq: add tagset quiesce interface
  2020-07-28  2:32             ` Jens Axboe
@ 2020-07-28  3:29               ` Sagi Grimberg
  0 siblings, 0 replies; 40+ messages in thread
From: Sagi Grimberg @ 2020-07-28  3:29 UTC (permalink / raw)
  To: Jens Axboe, Ming Lei
  Cc: linux-nvme, Christoph Hellwig, Keith Busch, linux-block,
	Ming Lin, Chao Leng


>>>>>>> +static void blk_mq_quiesce_blocking_queue_async(struct request_queue *q)
>>>>>>> +{
>>>>>>> +	struct blk_mq_hw_ctx *hctx;
>>>>>>> +	unsigned int i;
>>>>>>> +
>>>>>>> +	blk_mq_quiesce_queue_nowait(q);
>>>>>>> +
>>>>>>> +	queue_for_each_hw_ctx(q, hctx, i) {
>>>>>>> +		WARN_ON_ONCE(!(hctx->flags & BLK_MQ_F_BLOCKING));
>>>>>>> +		hctx->rcu_sync = kmalloc(sizeof(*hctx->rcu_sync), GFP_KERNEL);
>>>>>>> +		if (!hctx->rcu_sync)
>>>>>>> +			continue;
>>>>>>
>>>>>> This approach of quiesce/unquiesce tagset is good abstraction.
>>>>>>
>>>>>> Just one more thing, please allocate a rcu_sync array because hctx is
>>>>>> supposed to not store scratch stuff.
>>>>>
>>>>> I'd be all for not stuffing this in the hctx, but how would that work?
>>>>> The only thing I can think of that would work reliably is batching the
>>>>> queue+wait into units of N. We could potentially have many thousands of
>>>>> queues, and it could get iffy (and/or unreliable) in terms of allocation
>>>>> size. Looks like rcu_synchronize is 48-bytes on my local install, and it
>>>>> doesn't take a lot of devices at current CPU counts to make an alloc
>>>>> covering all of it huge. Let's say 64 threads, and 32 devices, then
>>>>> we're already at 64*32*48 bytes which is an order 5 allocation. Not
>>>>> friendly, and not going to be reliable when you need it. And if we start
>>>>> batching in reasonable counts, then we're _almost_ back to doing a queue
>>>>> or two at the time... 32 * 48 is 1536 bytes, so we could only do two at
>>>>> the time for single page allocations.
>>>>
>>>> We can convert to order 0 allocation by one extra indirect array.
>>>
>>> I guess that could work, and would just be one extra alloc + free if we
>>> still retain the batch. That'd take it to 16 devices (at 32 CPUs) per
>>> round, potentially way less of course if we have more CPUs. So still
>>> somewhat limiting, rather than do all at once.
>>
>> With the approach in blk_mq_alloc_rqs(), each allocated page can be
>> added to one list, so the indirect array can be saved. Then it is
>> possible to allocate for any size queues/devices since every
>> allocation is just for single page in case that it is needed, even no
>> pre-calculation is required.
> 
> As long as we watch the complexity, don't think we need to go overboard
> here in the risk of adding issues for the failure path.

No we don't. I prefer not to do it. And if this turns out to be that bad
we can later convert it to a complicated page vector.

I'll move forward with this approach.

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v5 2/2] nvme: use blk_mq_[un]quiesce_tagset
  2020-07-28  3:21     ` Chao Leng
@ 2020-07-28  3:34       ` Sagi Grimberg
  2020-07-28  3:51         ` Chao Leng
  0 siblings, 1 reply; 40+ messages in thread
From: Sagi Grimberg @ 2020-07-28  3:34 UTC (permalink / raw)
  To: Chao Leng, linux-nvme, Christoph Hellwig, Keith Busch, Jens Axboe
  Cc: linux-block, Ming Lin


>>> All controller namespaces share the same tagset, so we
>>> can use this interface which does the optimal operation
>>> for parallel quiesce based on the tagset type (e.g.
>>> blocking tagsets and non-blocking tagsets).
>>>
>>> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
>>> ---
>>>   drivers/nvme/host/core.c | 14 ++------------
>>>   1 file changed, 2 insertions(+), 12 deletions(-)
>>>
>>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>>> index 05aa568a60af..c41df20996d7 100644
>>> --- a/drivers/nvme/host/core.c
>>> +++ b/drivers/nvme/host/core.c
>>> @@ -4557,23 +4557,13 @@ EXPORT_SYMBOL_GPL(nvme_start_freeze);
>>>   void nvme_stop_queues(struct nvme_ctrl *ctrl)
>>>   {
>>> -    struct nvme_ns *ns;
>>> -
>>> -    down_read(&ctrl->namespaces_rwsem);
>>> -    list_for_each_entry(ns, &ctrl->namespaces, list)
>>> -        blk_mq_quiesce_queue(ns->queue);
>>> -    up_read(&ctrl->namespaces_rwsem);
>>> +    blk_mq_quiesce_tagset(ctrl->tagset);
>>
>> Rrr.. this one is slightly annoying. We have the connect_q in
>> fabrics that we use to issue the connect command which is now
>> quiesced too...
>>
>> If we will use this interface, we can unquiesce it right away,
>> but that seems kinda backwards..Io queue and admin queue has different 
>> treat mechanism, introduce
> blk_mq_quiesce_tagset may make the mechanism unclear. So this is
> probably not a good choice.

The meaning of blk_mq_quiesce_tagset is clear, the awkward thing is
that we need to unquiesce the connect_q after blk_mq_quiesce_tagset
quiesced it.

I'm thinking of resorting from this abstraction...

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v5 2/2] nvme: use blk_mq_[un]quiesce_tagset
  2020-07-28  3:34       ` Sagi Grimberg
@ 2020-07-28  3:51         ` Chao Leng
  0 siblings, 0 replies; 40+ messages in thread
From: Chao Leng @ 2020-07-28  3:51 UTC (permalink / raw)
  To: Sagi Grimberg, linux-nvme, Christoph Hellwig, Keith Busch, Jens Axboe
  Cc: linux-block, Ming Lin



On 2020/7/28 11:34, Sagi Grimberg wrote:
> 
>>>> All controller namespaces share the same tagset, so we
>>>> can use this interface which does the optimal operation
>>>> for parallel quiesce based on the tagset type (e.g.
>>>> blocking tagsets and non-blocking tagsets).
>>>>
>>>> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
>>>> ---
>>>>   drivers/nvme/host/core.c | 14 ++------------
>>>>   1 file changed, 2 insertions(+), 12 deletions(-)
>>>>
>>>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>>>> index 05aa568a60af..c41df20996d7 100644
>>>> --- a/drivers/nvme/host/core.c
>>>> +++ b/drivers/nvme/host/core.c
>>>> @@ -4557,23 +4557,13 @@ EXPORT_SYMBOL_GPL(nvme_start_freeze);
>>>>   void nvme_stop_queues(struct nvme_ctrl *ctrl)
>>>>   {
>>>> -    struct nvme_ns *ns;
>>>> -
>>>> -    down_read(&ctrl->namespaces_rwsem);
>>>> -    list_for_each_entry(ns, &ctrl->namespaces, list)
>>>> -        blk_mq_quiesce_queue(ns->queue);
>>>> -    up_read(&ctrl->namespaces_rwsem);
>>>> +    blk_mq_quiesce_tagset(ctrl->tagset);
>>>
>>> Rrr.. this one is slightly annoying. We have the connect_q in
>>> fabrics that we use to issue the connect command which is now
>>> quiesced too...
>>>
>>> If we will use this interface, we can unquiesce it right away,
>>> but that seems kinda backwards..Io queue and admin queue has different treat mechanism, introduce
>> blk_mq_quiesce_tagset may make the mechanism unclear. So this is
>> probably not a good choice.
> 
> The meaning of blk_mq_quiesce_tagset is clear, the awkward thing is
> that we need to unquiesce the connect_q after blk_mq_quiesce_tagset
> quiesced it.
Io queue and admin queue has different treat mechanism. If we switch to
blk_mq_quiesce_tagset, maybe we need do more extras such as unquiesce
the connect_q, thus the mechanism maybe unclear. Surely if we introduce
blk_mq_quiesce_tagset for new feature, the abstraction is great.
> 
> I'm thinking of resorting from this abstraction...
> .

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v5 1/2] blk-mq: add tagset quiesce interface
  2020-07-27 23:10 ` [PATCH v5 1/2] blk-mq: add tagset quiesce interface Sagi Grimberg
  2020-07-27 23:32   ` Keith Busch
  2020-07-28  1:40   ` Ming Lei
@ 2020-07-28  7:18   ` Christoph Hellwig
  2020-07-28  7:48     ` Sagi Grimberg
  2020-07-28  9:16     ` Ming Lei
  2 siblings, 2 replies; 40+ messages in thread
From: Christoph Hellwig @ 2020-07-28  7:18 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: linux-nvme, Christoph Hellwig, Keith Busch, Jens Axboe,
	linux-block, Ming Lin, Chao Leng, Paul E. McKenney

I like the tagset based interface.  But the idea of doing a per-hctx
allocation and wait doesn't seem very scalable.

Paul, do you have any good idea for an interface that waits on
multiple srcu heads?  As far as I can tell we could just have a single
global completion and counter, and each call_srcu would just just
decrement it and then the final one would do the wakeup.  It would just
be great to figure out a way to keep the struct rcu_synchronize and
counter on stack to avoid an allocation.

But if we can't do with an on-stack object I'd much rather just embedd
the rcu_head in the hw_ctx.

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v5 1/2] blk-mq: add tagset quiesce interface
  2020-07-28  7:18   ` Christoph Hellwig
@ 2020-07-28  7:48     ` Sagi Grimberg
  2020-07-28  9:16     ` Ming Lei
  1 sibling, 0 replies; 40+ messages in thread
From: Sagi Grimberg @ 2020-07-28  7:48 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-nvme, Keith Busch, Jens Axboe, linux-block, Ming Lin,
	Chao Leng, Paul E. McKenney


> I like the tagset based interface.

Even that we need to unquiesce back the connect_q?
See my comment on v5. Kinda bothers me...

   But the idea of doing a per-hctx
> allocation and wait doesn't seem very scalable.

I belong to this camp too..

> Paul, do you have any good idea for an interface that waits on
> multiple srcu heads?  As far as I can tell we could just have a single
> global completion and counter, and each call_srcu would just just
> decrement it and then the final one would do the wakeup.  It would just
> be great to figure out a way to keep the struct rcu_synchronize and
> counter on stack to avoid an allocation.
> 
> But if we can't do with an on-stack object I'd much rather just embedd
> the rcu_head in the hw_ctx.

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v5 1/2] blk-mq: add tagset quiesce interface
  2020-07-28  7:18   ` Christoph Hellwig
  2020-07-28  7:48     ` Sagi Grimberg
@ 2020-07-28  9:16     ` Ming Lei
  2020-07-28  9:24       ` Sagi Grimberg
  1 sibling, 1 reply; 40+ messages in thread
From: Ming Lei @ 2020-07-28  9:16 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Sagi Grimberg, Jens Axboe, Paul E. McKenney, linux-nvme,
	linux-block, Chao Leng, Keith Busch, Ming Lin

On Tue, Jul 28, 2020 at 09:18:59AM +0200, Christoph Hellwig wrote:
> I like the tagset based interface.  But the idea of doing a per-hctx
> allocation and wait doesn't seem very scalable.
> 
> Paul, do you have any good idea for an interface that waits on
> multiple srcu heads?  As far as I can tell we could just have a single
> global completion and counter, and each call_srcu would just just
> decrement it and then the final one would do the wakeup.  It would just
> be great to figure out a way to keep the struct rcu_synchronize and
> counter on stack to avoid an allocation.
> 
> But if we can't do with an on-stack object I'd much rather just embedd
> the rcu_head in the hw_ctx.

I think we can do that, please see the following patch which is against Sagi's V5:

diff --git a/block/blk-mq.c b/block/blk-mq.c
index c3856377b961..fc46e77460f1 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -27,6 +27,7 @@
 #include <linux/crash_dump.h>
 #include <linux/prefetch.h>
 #include <linux/blk-crypto.h>
+#include <linux/rcupdate_wait.h>
 
 #include <trace/events/block.h>
 
@@ -209,6 +210,50 @@ void blk_mq_quiesce_queue_nowait(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
 
+struct blk_mq_srcu_sync {
+	struct rcu_synchronize srcu_sync;
+	atomic_t count;
+};
+
+static void blk_mq_srcu_sync_init(struct blk_mq_srcu_sync *sync, int count)
+{
+	init_completion(&sync->srcu_sync.completion);
+	init_rcu_head(&sync->srcu_sync.head);
+
+	atomic_set(&sync->count, count);
+}
+
+static void blk_mq_srcu_sync_wait(struct blk_mq_srcu_sync *sync)
+{
+	wait_for_completion(&sync->srcu_sync.completion);
+	destroy_rcu_head_on_stack(&sync->srcu_sync.head);
+}
+
+static void blk_mq_wakeme_after_rcu(struct rcu_head *head)
+{
+	struct blk_mq_srcu_sync *sync;
+
+	sync = container_of(head, struct blk_mq_srcu_sync, srcu_sync.head);
+
+	if (atomic_dec_and_test(&sync->count))
+		complete(&sync->srcu_sync.completion);
+}
+
+static void blk_mq_quiesce_blocking_queue_async(struct request_queue *q,
+		struct blk_mq_srcu_sync *sync)
+{
+	struct blk_mq_hw_ctx *hctx;
+	unsigned int i;
+
+	blk_mq_quiesce_queue_nowait(q);
+
+	queue_for_each_hw_ctx(q, hctx, i) {
+		WARN_ON_ONCE(!(hctx->flags & BLK_MQ_F_BLOCKING));
+		call_srcu(hctx->srcu, &sync->srcu_sync.head,
+				blk_mq_wakeme_after_rcu);
+	}
+}
+
 /**
  * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
  * @q: request queue.
@@ -2880,6 +2925,45 @@ static void queue_set_hctx_shared(struct request_queue *q, bool shared)
 	}
 }
 
+void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set)
+{
+	struct request_queue *q;
+
+	mutex_lock(&set->tag_list_lock);
+	if (set->flags & BLK_MQ_F_BLOCKING) {
+		struct blk_mq_srcu_sync sync;
+		int count = 0;
+
+		list_for_each_entry(q, &set->tag_list, tag_set_list)
+			count++;
+
+		blk_mq_srcu_sync_init(&sync, count);
+
+		list_for_each_entry(q, &set->tag_list, tag_set_list)
+			blk_mq_quiesce_blocking_queue_async(q, &sync);
+
+		blk_mq_srcu_sync_wait(&sync);
+
+	} else {
+		list_for_each_entry(q, &set->tag_list, tag_set_list)
+			blk_mq_quiesce_queue_nowait(q);
+		synchronize_rcu();
+	}
+	mutex_unlock(&set->tag_list_lock);
+}
+EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset);
+
+void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set)
+{
+	struct request_queue *q;
+
+	mutex_lock(&set->tag_list_lock);
+	list_for_each_entry(q, &set->tag_list, tag_set_list)
+		blk_mq_unquiesce_queue(q);
+	mutex_unlock(&set->tag_list_lock);
+}
+EXPORT_SYMBOL_GPL(blk_mq_unquiesce_tagset);
+
 static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set,
 					bool shared)
 {
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 23230c1d031e..d5e0974a1dcc 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -532,6 +532,8 @@ int blk_mq_map_queues(struct blk_mq_queue_map *qmap);
 void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);
 
 void blk_mq_quiesce_queue_nowait(struct request_queue *q);
+void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set);
+void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set);
 
 unsigned int blk_mq_rq_cpu(struct request *rq);
 


-- 
Ming


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v5 1/2] blk-mq: add tagset quiesce interface
  2020-07-28  9:16     ` Ming Lei
@ 2020-07-28  9:24       ` Sagi Grimberg
  2020-07-28  9:33         ` Ming Lei
  2020-07-28 13:54         ` Paul E. McKenney
  0 siblings, 2 replies; 40+ messages in thread
From: Sagi Grimberg @ 2020-07-28  9:24 UTC (permalink / raw)
  To: Ming Lei, Christoph Hellwig
  Cc: Jens Axboe, Paul E. McKenney, linux-nvme, linux-block, Chao Leng,
	Keith Busch, Ming Lin


>> I like the tagset based interface.  But the idea of doing a per-hctx
>> allocation and wait doesn't seem very scalable.
>>
>> Paul, do you have any good idea for an interface that waits on
>> multiple srcu heads?  As far as I can tell we could just have a single
>> global completion and counter, and each call_srcu would just just
>> decrement it and then the final one would do the wakeup.  It would just
>> be great to figure out a way to keep the struct rcu_synchronize and
>> counter on stack to avoid an allocation.
>>
>> But if we can't do with an on-stack object I'd much rather just embedd
>> the rcu_head in the hw_ctx.
> 
> I think we can do that, please see the following patch which is against Sagi's V5:

I don't think you can send a single rcu_head to multiple call_srcu calls.

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v5 1/2] blk-mq: add tagset quiesce interface
  2020-07-28  9:24       ` Sagi Grimberg
@ 2020-07-28  9:33         ` Ming Lei
  2020-07-28  9:37           ` Sagi Grimberg
  2020-07-28 13:54         ` Paul E. McKenney
  1 sibling, 1 reply; 40+ messages in thread
From: Ming Lei @ 2020-07-28  9:33 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Christoph Hellwig, Jens Axboe, Paul E. McKenney, linux-nvme,
	linux-block, Chao Leng, Keith Busch, Ming Lin

On Tue, Jul 28, 2020 at 02:24:38AM -0700, Sagi Grimberg wrote:
> 
> > > I like the tagset based interface.  But the idea of doing a per-hctx
> > > allocation and wait doesn't seem very scalable.
> > > 
> > > Paul, do you have any good idea for an interface that waits on
> > > multiple srcu heads?  As far as I can tell we could just have a single
> > > global completion and counter, and each call_srcu would just just
> > > decrement it and then the final one would do the wakeup.  It would just
> > > be great to figure out a way to keep the struct rcu_synchronize and
> > > counter on stack to avoid an allocation.
> > > 
> > > But if we can't do with an on-stack object I'd much rather just embedd
> > > the rcu_head in the hw_ctx.
> > 
> > I think we can do that, please see the following patch which is against Sagi's V5:
> 
> I don't think you can send a single rcu_head to multiple call_srcu calls.

OK, then one variant is to put the rcu_head into blk_mq_hw_ctx, and put
rcu_synchronize into blk_mq_tag_set.


Thanks,
Ming


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v5 1/2] blk-mq: add tagset quiesce interface
  2020-07-28  9:33         ` Ming Lei
@ 2020-07-28  9:37           ` Sagi Grimberg
  2020-07-28  9:43             ` Sagi Grimberg
  2020-07-28 10:58             ` Christoph Hellwig
  0 siblings, 2 replies; 40+ messages in thread
From: Sagi Grimberg @ 2020-07-28  9:37 UTC (permalink / raw)
  To: Ming Lei
  Cc: Christoph Hellwig, Jens Axboe, Paul E. McKenney, linux-nvme,
	linux-block, Chao Leng, Keith Busch, Ming Lin


>>>> I like the tagset based interface.  But the idea of doing a per-hctx
>>>> allocation and wait doesn't seem very scalable.
>>>>
>>>> Paul, do you have any good idea for an interface that waits on
>>>> multiple srcu heads?  As far as I can tell we could just have a single
>>>> global completion and counter, and each call_srcu would just just
>>>> decrement it and then the final one would do the wakeup.  It would just
>>>> be great to figure out a way to keep the struct rcu_synchronize and
>>>> counter on stack to avoid an allocation.
>>>>
>>>> But if we can't do with an on-stack object I'd much rather just embedd
>>>> the rcu_head in the hw_ctx.
>>>
>>> I think we can do that, please see the following patch which is against Sagi's V5:
>>
>> I don't think you can send a single rcu_head to multiple call_srcu calls.
> 
> OK, then one variant is to put the rcu_head into blk_mq_hw_ctx, and put
> rcu_synchronize into blk_mq_tag_set.

I can cook up a spin, but I still hate the fact that I have a queue that
ends up quiesced which I didn't want it to...

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v5 1/2] blk-mq: add tagset quiesce interface
  2020-07-28  9:37           ` Sagi Grimberg
@ 2020-07-28  9:43             ` Sagi Grimberg
  2020-07-28 10:10               ` Ming Lei
  2020-07-28 10:58             ` Christoph Hellwig
  1 sibling, 1 reply; 40+ messages in thread
From: Sagi Grimberg @ 2020-07-28  9:43 UTC (permalink / raw)
  To: Ming Lei
  Cc: Christoph Hellwig, Jens Axboe, Paul E. McKenney, linux-nvme,
	linux-block, Chao Leng, Keith Busch, Ming Lin


>>>>> I like the tagset based interface.  But the idea of doing a per-hctx
>>>>> allocation and wait doesn't seem very scalable.
>>>>>
>>>>> Paul, do you have any good idea for an interface that waits on
>>>>> multiple srcu heads?  As far as I can tell we could just have a single
>>>>> global completion and counter, and each call_srcu would just just
>>>>> decrement it and then the final one would do the wakeup.  It would 
>>>>> just
>>>>> be great to figure out a way to keep the struct rcu_synchronize and
>>>>> counter on stack to avoid an allocation.
>>>>>
>>>>> But if we can't do with an on-stack object I'd much rather just embedd
>>>>> the rcu_head in the hw_ctx.
>>>>
>>>> I think we can do that, please see the following patch which is 
>>>> against Sagi's V5:
>>>
>>> I don't think you can send a single rcu_head to multiple call_srcu 
>>> calls.
>>
>> OK, then one variant is to put the rcu_head into blk_mq_hw_ctx, and put
>> rcu_synchronize into blk_mq_tag_set.
> 
> I can cook up a spin,

Nope.. spoke too soon, the rcu_head needs to be in a context that has
access to the counter (which is what you called blk_mq_srcu_sync).
you want to add also a pointer to hctx? that is almost as big as
rcu_synchronize...

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v5 1/2] blk-mq: add tagset quiesce interface
  2020-07-28  9:43             ` Sagi Grimberg
@ 2020-07-28 10:10               ` Ming Lei
  2020-07-28 10:57                 ` Christoph Hellwig
  2020-07-28 14:13                 ` Paul E. McKenney
  0 siblings, 2 replies; 40+ messages in thread
From: Ming Lei @ 2020-07-28 10:10 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Jens Axboe, Paul E. McKenney, linux-nvme, linux-block, Chao Leng,
	Keith Busch, Ming Lin, Christoph Hellwig

On Tue, Jul 28, 2020 at 02:43:06AM -0700, Sagi Grimberg wrote:
> 
> > > > > > I like the tagset based interface.  But the idea of doing a per-hctx
> > > > > > allocation and wait doesn't seem very scalable.
> > > > > > 
> > > > > > Paul, do you have any good idea for an interface that waits on
> > > > > > multiple srcu heads?  As far as I can tell we could just have a single
> > > > > > global completion and counter, and each call_srcu would just just
> > > > > > decrement it and then the final one would do the
> > > > > > wakeup.  It would just
> > > > > > be great to figure out a way to keep the struct rcu_synchronize and
> > > > > > counter on stack to avoid an allocation.
> > > > > > 
> > > > > > But if we can't do with an on-stack object I'd much rather just embedd
> > > > > > the rcu_head in the hw_ctx.
> > > > > 
> > > > > I think we can do that, please see the following patch which
> > > > > is against Sagi's V5:
> > > > 
> > > > I don't think you can send a single rcu_head to multiple
> > > > call_srcu calls.
> > > 
> > > OK, then one variant is to put the rcu_head into blk_mq_hw_ctx, and put
> > > rcu_synchronize into blk_mq_tag_set.
> > 
> > I can cook up a spin,
> 
> Nope.. spoke too soon, the rcu_head needs to be in a context that has
> access to the counter (which is what you called blk_mq_srcu_sync).
> you want to add also a pointer to hctx? that is almost as big as
> rcu_synchronize...

We can just put rcu_head into hctx, and put the count & completion into
tag_set, and the tagset can be retrieved via hctx, something like the
following patch:

diff --git a/block/blk-mq.c b/block/blk-mq.c
index c3856377b961..129665da4dbd 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -27,6 +27,7 @@
 #include <linux/crash_dump.h>
 #include <linux/prefetch.h>
 #include <linux/blk-crypto.h>
+#include <linux/rcupdate_wait.h>
 
 #include <trace/events/block.h>
 
@@ -209,6 +210,34 @@ void blk_mq_quiesce_queue_nowait(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
 
+static void blk_mq_wakeme_after_rcu(struct rcu_head *head)
+{
+
+	struct blk_mq_srcu_struct *srcu = container_of(head,
+			struct blk_mq_srcu_struct, head);
+	struct blk_mq_hw_ctx *hctx = (void *)srcu -
+		sizeof(struct blk_mq_hw_ctx);
+	struct blk_mq_tag_set *set = hctx->queue->tag_set;
+
+	if (atomic_dec_and_test(&set->quiesce_count))
+		complete(&set->quiesce_completion);
+}
+
+static void blk_mq_quiesce_blocking_queue_async(struct request_queue *q)
+{
+	struct blk_mq_hw_ctx *hctx;
+	unsigned int i;
+
+	blk_mq_quiesce_queue_nowait(q);
+
+	queue_for_each_hw_ctx(q, hctx, i) {
+		WARN_ON_ONCE(!(hctx->flags & BLK_MQ_F_BLOCKING));
+		init_rcu_head(&hctx->srcu->head);
+		call_srcu(&hctx->srcu->srcu, &hctx->srcu->head,
+				blk_mq_wakeme_after_rcu);
+	}
+}
+
 /**
  * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
  * @q: request queue.
@@ -228,7 +257,7 @@ void blk_mq_quiesce_queue(struct request_queue *q)
 
 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (hctx->flags & BLK_MQ_F_BLOCKING)
-			synchronize_srcu(hctx->srcu);
+			synchronize_srcu(&hctx->srcu->srcu);
 		else
 			rcu = true;
 	}
@@ -700,23 +729,23 @@ void blk_mq_complete_request(struct request *rq)
 EXPORT_SYMBOL(blk_mq_complete_request);
 
 static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
-	__releases(hctx->srcu)
+	__releases(&hctx->srcu->srcu)
 {
 	if (!(hctx->flags & BLK_MQ_F_BLOCKING))
 		rcu_read_unlock();
 	else
-		srcu_read_unlock(hctx->srcu, srcu_idx);
+		srcu_read_unlock(&hctx->srcu->srcu, srcu_idx);
 }
 
 static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
-	__acquires(hctx->srcu)
+	__acquires(&hctx->srcu->srcu)
 {
 	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
 		/* shut up gcc false positive */
 		*srcu_idx = 0;
 		rcu_read_lock();
 	} else
-		*srcu_idx = srcu_read_lock(hctx->srcu);
+		*srcu_idx = srcu_read_lock(&hctx->srcu->srcu);
 }
 
 /**
@@ -2599,7 +2628,7 @@ static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
 		     sizeof(struct blk_mq_hw_ctx));
 
 	if (tag_set->flags & BLK_MQ_F_BLOCKING)
-		hw_ctx_size += sizeof(struct srcu_struct);
+		hw_ctx_size += sizeof(struct blk_mq_srcu_struct);
 
 	return hw_ctx_size;
 }
@@ -2684,7 +2713,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
 		goto free_bitmap;
 
 	if (hctx->flags & BLK_MQ_F_BLOCKING)
-		init_srcu_struct(hctx->srcu);
+		init_srcu_struct(&hctx->srcu->srcu);
 	blk_mq_hctx_kobj_init(hctx);
 
 	return hctx;
@@ -2880,6 +2909,43 @@ static void queue_set_hctx_shared(struct request_queue *q, bool shared)
 	}
 }
 
+void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set)
+{
+	struct request_queue *q;
+
+	mutex_lock(&set->tag_list_lock);
+	if (set->flags & BLK_MQ_F_BLOCKING) {
+		int count = 0;
+
+		list_for_each_entry(q, &set->tag_list, tag_set_list)
+			count++;
+
+		atomic_set(&set->quiesce_count, count);
+		init_completion(&set->quiesce_completion);
+
+		list_for_each_entry(q, &set->tag_list, tag_set_list)
+			blk_mq_quiesce_blocking_queue_async(q);
+		wait_for_completion(&set->quiesce_completion);
+	} else {
+		list_for_each_entry(q, &set->tag_list, tag_set_list)
+			blk_mq_quiesce_queue_nowait(q);
+		synchronize_rcu();
+	}
+	mutex_unlock(&set->tag_list_lock);
+}
+EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset);
+
+void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set)
+{
+	struct request_queue *q;
+
+	mutex_lock(&set->tag_list_lock);
+	list_for_each_entry(q, &set->tag_list, tag_set_list)
+		blk_mq_unquiesce_queue(q);
+	mutex_unlock(&set->tag_list_lock);
+}
+EXPORT_SYMBOL_GPL(blk_mq_unquiesce_tagset);
+
 static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set,
 					bool shared)
 {
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 23230c1d031e..9ef7fdb809a7 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -9,6 +9,11 @@
 struct blk_mq_tags;
 struct blk_flush_queue;
 
+struct blk_mq_srcu_struct {
+	struct srcu_struct srcu;
+	struct rcu_head    head;
+};
+
 /**
  * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware
  * block device
@@ -175,7 +180,7 @@ struct blk_mq_hw_ctx {
 	 * blocking (BLK_MQ_F_BLOCKING). Must be the last member - see also
 	 * blk_mq_hw_ctx_size().
 	 */
-	struct srcu_struct	srcu[];
+	struct blk_mq_srcu_struct	srcu[];
 };
 
 /**
@@ -254,6 +259,9 @@ struct blk_mq_tag_set {
 
 	struct mutex		tag_list_lock;
 	struct list_head	tag_list;
+
+	struct completion	quiesce_completion;
+	atomic_t		quiesce_count;
 };
 
 /**
@@ -532,6 +540,8 @@ int blk_mq_map_queues(struct blk_mq_queue_map *qmap);
 void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);
 
 void blk_mq_quiesce_queue_nowait(struct request_queue *q);
+void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set);
+void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set);
 
 unsigned int blk_mq_rq_cpu(struct request *rq);
 

-- 
Ming


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v5 1/2] blk-mq: add tagset quiesce interface
  2020-07-28 10:10               ` Ming Lei
@ 2020-07-28 10:57                 ` Christoph Hellwig
  2020-07-28 14:13                 ` Paul E. McKenney
  1 sibling, 0 replies; 40+ messages in thread
From: Christoph Hellwig @ 2020-07-28 10:57 UTC (permalink / raw)
  To: Ming Lei
  Cc: Sagi Grimberg, Jens Axboe, Paul E. McKenney, linux-nvme,
	linux-block, Chao Leng, Keith Busch, Ming Lin, Christoph Hellwig

On Tue, Jul 28, 2020 at 06:10:42PM +0800, Ming Lei wrote:
> +	struct blk_mq_hw_ctx *hctx = (void *)srcu -
> +		sizeof(struct blk_mq_hw_ctx);

I think this should use container_of.

> +{
> +	struct blk_mq_hw_ctx *hctx;
> +	unsigned int i;
> +
> +	blk_mq_quiesce_queue_nowait(q);
> +
> +	queue_for_each_hw_ctx(q, hctx, i) {
> +		WARN_ON_ONCE(!(hctx->flags & BLK_MQ_F_BLOCKING));
> +		init_rcu_head(&hctx->srcu->head);

We only need to initialize the rcu head once when we allocate the hctx.

> +	mutex_lock(&set->tag_list_lock);
> +	if (set->flags & BLK_MQ_F_BLOCKING) {
> +		int count = 0;
> +
> +		list_for_each_entry(q, &set->tag_list, tag_set_list)
> +			count++;
> +
> +		atomic_set(&set->quiesce_count, count);
> +		init_completion(&set->quiesce_completion);
> +
> +		list_for_each_entry(q, &set->tag_list, tag_set_list)
> +			blk_mq_quiesce_blocking_queue_async(q);
> +		wait_for_completion(&set->quiesce_completion);
> +	} else {
> +		list_for_each_entry(q, &set->tag_list, tag_set_list)
> +			blk_mq_quiesce_queue_nowait(q);
> +		synchronize_rcu();
> +	}
> +	mutex_unlock(&set->tag_list_lock);

It would be great to do the wait_for_completion and synchronize_rcu
outside the lock.  Also I think the blocking case probably wants to be
split into a separate little helper.

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v5 1/2] blk-mq: add tagset quiesce interface
  2020-07-28  9:37           ` Sagi Grimberg
  2020-07-28  9:43             ` Sagi Grimberg
@ 2020-07-28 10:58             ` Christoph Hellwig
  2020-07-28 16:25               ` Sagi Grimberg
  1 sibling, 1 reply; 40+ messages in thread
From: Christoph Hellwig @ 2020-07-28 10:58 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Ming Lei, Christoph Hellwig, Jens Axboe, Paul E. McKenney,
	linux-nvme, linux-block, Chao Leng, Keith Busch, Ming Lin

On Tue, Jul 28, 2020 at 02:37:15AM -0700, Sagi Grimberg wrote:
>
>>>>> I like the tagset based interface.  But the idea of doing a per-hctx
>>>>> allocation and wait doesn't seem very scalable.
>>>>>
>>>>> Paul, do you have any good idea for an interface that waits on
>>>>> multiple srcu heads?  As far as I can tell we could just have a single
>>>>> global completion and counter, and each call_srcu would just just
>>>>> decrement it and then the final one would do the wakeup.  It would just
>>>>> be great to figure out a way to keep the struct rcu_synchronize and
>>>>> counter on stack to avoid an allocation.
>>>>>
>>>>> But if we can't do with an on-stack object I'd much rather just embedd
>>>>> the rcu_head in the hw_ctx.
>>>>
>>>> I think we can do that, please see the following patch which is against Sagi's V5:
>>>
>>> I don't think you can send a single rcu_head to multiple call_srcu calls.
>>
>> OK, then one variant is to put the rcu_head into blk_mq_hw_ctx, and put
>> rcu_synchronize into blk_mq_tag_set.
>
> I can cook up a spin, but I still hate the fact that I have a queue that
> ends up quiesced which I didn't want it to...

Why do we care so much about the connect_q?  Especially if we generalize
it into a passthru queue that will absolutely need the quiesce hopefully
soon.

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v5 1/2] blk-mq: add tagset quiesce interface
  2020-07-28  9:24       ` Sagi Grimberg
  2020-07-28  9:33         ` Ming Lei
@ 2020-07-28 13:54         ` Paul E. McKenney
  2020-07-28 23:46           ` Sagi Grimberg
  1 sibling, 1 reply; 40+ messages in thread
From: Paul E. McKenney @ 2020-07-28 13:54 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Ming Lei, Christoph Hellwig, Jens Axboe, linux-nvme, linux-block,
	Chao Leng, Keith Busch, Ming Lin

On Tue, Jul 28, 2020 at 02:24:38AM -0700, Sagi Grimberg wrote:
> 
> > > I like the tagset based interface.  But the idea of doing a per-hctx
> > > allocation and wait doesn't seem very scalable.
> > > 
> > > Paul, do you have any good idea for an interface that waits on
> > > multiple srcu heads?  As far as I can tell we could just have a single
> > > global completion and counter, and each call_srcu would just just
> > > decrement it and then the final one would do the wakeup.  It would just
> > > be great to figure out a way to keep the struct rcu_synchronize and
> > > counter on stack to avoid an allocation.
> > > 
> > > But if we can't do with an on-stack object I'd much rather just embedd
> > > the rcu_head in the hw_ctx.
> > 
> > I think we can do that, please see the following patch which is against Sagi's V5:
> 
> I don't think you can send a single rcu_head to multiple call_srcu calls.

Indeed you cannot.  And if you build with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y
it will yell at you when you try.

You -can- pass on-stack rcu_head structures to call_srcu(), though,
if that helps.  You of course must have some way of waiting for the
callback to be invoked before exiting that function.  This should be
easy for me to package into an API, maybe using one of the existing
reference-counting APIs.

So, do you have a separate stack frame for each of the desired call_srcu()
invocations?  If not, do you know at build time how many rcu_head
structures you need?  If the answer to both of these is "no", then
it is likely that there needs to be an rcu_head in each of the relevant
data structures, as was noted earlier in this thread.

Yeah, I should go read the code.  But I would need to know where it is
and it is still early in the morning over here!  ;-)

I probably should also have read the remainder of the thread before
replying, as well.  But what is the fun in that?

							Thanx, Paul

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v5 1/2] blk-mq: add tagset quiesce interface
  2020-07-28 10:10               ` Ming Lei
  2020-07-28 10:57                 ` Christoph Hellwig
@ 2020-07-28 14:13                 ` Paul E. McKenney
  1 sibling, 0 replies; 40+ messages in thread
From: Paul E. McKenney @ 2020-07-28 14:13 UTC (permalink / raw)
  To: Ming Lei
  Cc: Sagi Grimberg, Jens Axboe, linux-nvme, linux-block, Chao Leng,
	Keith Busch, Ming Lin, Christoph Hellwig

On Tue, Jul 28, 2020 at 06:10:42PM +0800, Ming Lei wrote:
> On Tue, Jul 28, 2020 at 02:43:06AM -0700, Sagi Grimberg wrote:
> > 
> > > > > > > I like the tagset based interface.  But the idea of doing a per-hctx
> > > > > > > allocation and wait doesn't seem very scalable.
> > > > > > > 
> > > > > > > Paul, do you have any good idea for an interface that waits on
> > > > > > > multiple srcu heads?  As far as I can tell we could just have a single
> > > > > > > global completion and counter, and each call_srcu would just just
> > > > > > > decrement it and then the final one would do the
> > > > > > > wakeup.  It would just
> > > > > > > be great to figure out a way to keep the struct rcu_synchronize and
> > > > > > > counter on stack to avoid an allocation.
> > > > > > > 
> > > > > > > But if we can't do with an on-stack object I'd much rather just embedd
> > > > > > > the rcu_head in the hw_ctx.
> > > > > > 
> > > > > > I think we can do that, please see the following patch which
> > > > > > is against Sagi's V5:
> > > > > 
> > > > > I don't think you can send a single rcu_head to multiple
> > > > > call_srcu calls.
> > > > 
> > > > OK, then one variant is to put the rcu_head into blk_mq_hw_ctx, and put
> > > > rcu_synchronize into blk_mq_tag_set.
> > > 
> > > I can cook up a spin,
> > 
> > Nope.. spoke too soon, the rcu_head needs to be in a context that has
> > access to the counter (which is what you called blk_mq_srcu_sync).
> > you want to add also a pointer to hctx? that is almost as big as
> > rcu_synchronize...
> 
> We can just put rcu_head into hctx, and put the count & completion into
> tag_set, and the tagset can be retrieved via hctx, something like the
> following patch:

A few questions and comments below, hopefully helpful ones.

							Thanx, Paul

> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index c3856377b961..129665da4dbd 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -27,6 +27,7 @@
>  #include <linux/crash_dump.h>
>  #include <linux/prefetch.h>
>  #include <linux/blk-crypto.h>
> +#include <linux/rcupdate_wait.h>
>  
>  #include <trace/events/block.h>
>  
> @@ -209,6 +210,34 @@ void blk_mq_quiesce_queue_nowait(struct request_queue *q)
>  }
>  EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
>  
> +static void blk_mq_wakeme_after_rcu(struct rcu_head *head)
> +{
> +
> +	struct blk_mq_srcu_struct *srcu = container_of(head,
> +			struct blk_mq_srcu_struct, head);
> +	struct blk_mq_hw_ctx *hctx = (void *)srcu -
> +		sizeof(struct blk_mq_hw_ctx);
> +	struct blk_mq_tag_set *set = hctx->queue->tag_set;
> +
> +	if (atomic_dec_and_test(&set->quiesce_count))
> +		complete(&set->quiesce_completion);
> +}
> +
> +static void blk_mq_quiesce_blocking_queue_async(struct request_queue *q)
> +{
> +	struct blk_mq_hw_ctx *hctx;
> +	unsigned int i;
> +
> +	blk_mq_quiesce_queue_nowait(q);
> +
> +	queue_for_each_hw_ctx(q, hctx, i) {

This loop, combined with the second list_for_each_entry() in
blk_mq_quiesce_tagset(), needs to have the name number of iterations as
the first list_for_each_entry() in blk_mq_quiesce_tagset().

This might be the case, but it is not obvious to me.  Then again, I
freely admit that I don't know this code.

> +		WARN_ON_ONCE(!(hctx->flags & BLK_MQ_F_BLOCKING));
> +		init_rcu_head(&hctx->srcu->head);
> +		call_srcu(&hctx->srcu->srcu, &hctx->srcu->head,
> +				blk_mq_wakeme_after_rcu);

So the SRCU readers are specific to a given blk_mq_hw_ctx?  This looks
OK if so.

> +	}
> +}
> +
>  /**
>   * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
>   * @q: request queue.
> @@ -228,7 +257,7 @@ void blk_mq_quiesce_queue(struct request_queue *q)
>  
>  	queue_for_each_hw_ctx(q, hctx, i) {
>  		if (hctx->flags & BLK_MQ_F_BLOCKING)
> -			synchronize_srcu(hctx->srcu);
> +			synchronize_srcu(&hctx->srcu->srcu);

This waits only for an SRCU grace period (that is, for any relevant
hctx_unlock() calls), not for the invocation of any callbacks from any
prior call_srcu().  Is this what you need here?

The ->srcu->srcu looks like it might become confusing, but I don't
have any specific suggstions.

>  		else
>  			rcu = true;
>  	}
> @@ -700,23 +729,23 @@ void blk_mq_complete_request(struct request *rq)
>  EXPORT_SYMBOL(blk_mq_complete_request);
>  
>  static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
> -	__releases(hctx->srcu)
> +	__releases(&hctx->srcu->srcu)
>  {
>  	if (!(hctx->flags & BLK_MQ_F_BLOCKING))
>  		rcu_read_unlock();
>  	else
> -		srcu_read_unlock(hctx->srcu, srcu_idx);
> +		srcu_read_unlock(&hctx->srcu->srcu, srcu_idx);

Oh, either RCU or SRCU.  Got it!

>  }
>  
>  static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
> -	__acquires(hctx->srcu)
> +	__acquires(&hctx->srcu->srcu)
>  {
>  	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
>  		/* shut up gcc false positive */
>  		*srcu_idx = 0;
>  		rcu_read_lock();
>  	} else
> -		*srcu_idx = srcu_read_lock(hctx->srcu);
> +		*srcu_idx = srcu_read_lock(&hctx->srcu->srcu);
>  }
>  
>  /**
> @@ -2599,7 +2628,7 @@ static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
>  		     sizeof(struct blk_mq_hw_ctx));
>  
>  	if (tag_set->flags & BLK_MQ_F_BLOCKING)
> -		hw_ctx_size += sizeof(struct srcu_struct);
> +		hw_ctx_size += sizeof(struct blk_mq_srcu_struct);
>  
>  	return hw_ctx_size;
>  }
> @@ -2684,7 +2713,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
>  		goto free_bitmap;
>  
>  	if (hctx->flags & BLK_MQ_F_BLOCKING)
> -		init_srcu_struct(hctx->srcu);
> +		init_srcu_struct(&hctx->srcu->srcu);
>  	blk_mq_hctx_kobj_init(hctx);
>  
>  	return hctx;
> @@ -2880,6 +2909,43 @@ static void queue_set_hctx_shared(struct request_queue *q, bool shared)
>  	}
>  }
>  
> +void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set)
> +{
> +	struct request_queue *q;
> +
> +	mutex_lock(&set->tag_list_lock);
> +	if (set->flags & BLK_MQ_F_BLOCKING) {
> +		int count = 0;
> +
> +		list_for_each_entry(q, &set->tag_list, tag_set_list)
> +			count++;
> +
> +		atomic_set(&set->quiesce_count, count);
> +		init_completion(&set->quiesce_completion);
> +
> +		list_for_each_entry(q, &set->tag_list, tag_set_list)
> +			blk_mq_quiesce_blocking_queue_async(q);

So Christoph would like the mutex_unlock() up here?

> +		wait_for_completion(&set->quiesce_completion);
> +	} else {
> +		list_for_each_entry(q, &set->tag_list, tag_set_list)
> +			blk_mq_quiesce_queue_nowait(q);

And up here?

> +		synchronize_rcu();
> +	}
> +	mutex_unlock(&set->tag_list_lock);
> +}
> +EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset);
> +
> +void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set)
> +{
> +	struct request_queue *q;
> +
> +	mutex_lock(&set->tag_list_lock);
> +	list_for_each_entry(q, &set->tag_list, tag_set_list)
> +		blk_mq_unquiesce_queue(q);
> +	mutex_unlock(&set->tag_list_lock);
> +}
> +EXPORT_SYMBOL_GPL(blk_mq_unquiesce_tagset);
> +
>  static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set,
>  					bool shared)
>  {
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index 23230c1d031e..9ef7fdb809a7 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -9,6 +9,11 @@
>  struct blk_mq_tags;
>  struct blk_flush_queue;
>  
> +struct blk_mq_srcu_struct {
> +	struct srcu_struct srcu;
> +	struct rcu_head    head;
> +};
> +
>  /**
>   * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware
>   * block device
> @@ -175,7 +180,7 @@ struct blk_mq_hw_ctx {
>  	 * blocking (BLK_MQ_F_BLOCKING). Must be the last member - see also
>  	 * blk_mq_hw_ctx_size().
>  	 */
> -	struct srcu_struct	srcu[];
> +	struct blk_mq_srcu_struct	srcu[];
>  };
>  
>  /**
> @@ -254,6 +259,9 @@ struct blk_mq_tag_set {
>  
>  	struct mutex		tag_list_lock;
>  	struct list_head	tag_list;
> +
> +	struct completion	quiesce_completion;
> +	atomic_t		quiesce_count;
>  };
>  
>  /**
> @@ -532,6 +540,8 @@ int blk_mq_map_queues(struct blk_mq_queue_map *qmap);
>  void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);
>  
>  void blk_mq_quiesce_queue_nowait(struct request_queue *q);
> +void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set);
> +void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set);
>  
>  unsigned int blk_mq_rq_cpu(struct request *rq);
>  
> 
> -- 
> Ming
> 

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v5 1/2] blk-mq: add tagset quiesce interface
  2020-07-28 10:58             ` Christoph Hellwig
@ 2020-07-28 16:25               ` Sagi Grimberg
  0 siblings, 0 replies; 40+ messages in thread
From: Sagi Grimberg @ 2020-07-28 16:25 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Ming Lei, Jens Axboe, Paul E. McKenney, linux-nvme, linux-block,
	Chao Leng, Keith Busch, Ming Lin


>>>>>> I like the tagset based interface.  But the idea of doing a per-hctx
>>>>>> allocation and wait doesn't seem very scalable.
>>>>>>
>>>>>> Paul, do you have any good idea for an interface that waits on
>>>>>> multiple srcu heads?  As far as I can tell we could just have a single
>>>>>> global completion and counter, and each call_srcu would just just
>>>>>> decrement it and then the final one would do the wakeup.  It would just
>>>>>> be great to figure out a way to keep the struct rcu_synchronize and
>>>>>> counter on stack to avoid an allocation.
>>>>>>
>>>>>> But if we can't do with an on-stack object I'd much rather just embedd
>>>>>> the rcu_head in the hw_ctx.
>>>>>
>>>>> I think we can do that, please see the following patch which is against Sagi's V5:
>>>>
>>>> I don't think you can send a single rcu_head to multiple call_srcu calls.
>>>
>>> OK, then one variant is to put the rcu_head into blk_mq_hw_ctx, and put
>>> rcu_synchronize into blk_mq_tag_set.
>>
>> I can cook up a spin, but I still hate the fact that I have a queue that
>> ends up quiesced which I didn't want it to...
> 
> Why do we care so much about the connect_q?  Especially if we generalize
> it into a passthru queue that will absolutely need the quiesce hopefully
> soon.

The connect_q cannot be generalized to a passthru_q, exactly because of
the reason it exists in the first place. There is no way to guarantee
that the connect is issued before any pending request (in case of reset
during traffic).

We can use this API, but we will need to explicitly unquiesce the
connect_q which is a bit ugly like:
--
void nvme_stop_queues(struct nvme_ctrl *ctrl)
{
	blk_mq_quiesce_tagset(ctrl->tagset);
	if (ctrl->connect_q)
		blk_mq_unquiesce_queue(ctrl->connect_q);
}
EXPORT_SYMBOL_GPL(nvme_stop_queues);
--

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v5 1/2] blk-mq: add tagset quiesce interface
  2020-07-28 13:54         ` Paul E. McKenney
@ 2020-07-28 23:46           ` Sagi Grimberg
  2020-07-29  0:31             ` Paul E. McKenney
  0 siblings, 1 reply; 40+ messages in thread
From: Sagi Grimberg @ 2020-07-28 23:46 UTC (permalink / raw)
  To: paulmck
  Cc: Ming Lei, Christoph Hellwig, Jens Axboe, linux-nvme, linux-block,
	Chao Leng, Keith Busch, Ming Lin

Hey Paul,

> Indeed you cannot.  And if you build with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y
> it will yell at you when you try.
> 
> You -can- pass on-stack rcu_head structures to call_srcu(), though,
> if that helps.  You of course must have some way of waiting for the
> callback to be invoked before exiting that function.  This should be
> easy for me to package into an API, maybe using one of the existing
> reference-counting APIs.
> 
> So, do you have a separate stack frame for each of the desired call_srcu()
> invocations?  If not, do you know at build time how many rcu_head
> structures you need?  If the answer to both of these is "no", then
> it is likely that there needs to be an rcu_head in each of the relevant
> data structures, as was noted earlier in this thread.
> 
> Yeah, I should go read the code.  But I would need to know where it is
> and it is still early in the morning over here!  ;-)
> 
> I probably should also have read the remainder of the thread before
> replying, as well.  But what is the fun in that?

The use-case is to quiesce submissions to queues. This flow is where we
want to teardown stuff, and we can potentially have 1000's of queues
that we need to quiesce each one.

each queue (hctx) has either rcu or srcu depending if it may sleep
during submission.

The goal is that the overall quiesce should be fast, so we want
to wait for all of these queues elapsed period ~once, in parallel,
instead of synchronizing each serially as done today.

The guys here are resisting to add a rcu_synchronize to each and
every hctx because it will take 32 bytes more or less from 1000's
of hctxs.

Dynamically allocating each one is possible but not very scalable.

The question is if there is some way, we can do this with on-stack
or a single on-heap rcu_head or equivalent that can achieve the same
effect.

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v5 1/2] blk-mq: add tagset quiesce interface
  2020-07-28 23:46           ` Sagi Grimberg
@ 2020-07-29  0:31             ` Paul E. McKenney
  2020-07-29  0:43               ` Sagi Grimberg
  0 siblings, 1 reply; 40+ messages in thread
From: Paul E. McKenney @ 2020-07-29  0:31 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Ming Lei, Christoph Hellwig, Jens Axboe, linux-nvme, linux-block,
	Chao Leng, Keith Busch, Ming Lin

On Tue, Jul 28, 2020 at 04:46:23PM -0700, Sagi Grimberg wrote:
> Hey Paul,
> 
> > Indeed you cannot.  And if you build with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y
> > it will yell at you when you try.
> > 
> > You -can- pass on-stack rcu_head structures to call_srcu(), though,
> > if that helps.  You of course must have some way of waiting for the
> > callback to be invoked before exiting that function.  This should be
> > easy for me to package into an API, maybe using one of the existing
> > reference-counting APIs.
> > 
> > So, do you have a separate stack frame for each of the desired call_srcu()
> > invocations?  If not, do you know at build time how many rcu_head
> > structures you need?  If the answer to both of these is "no", then
> > it is likely that there needs to be an rcu_head in each of the relevant
> > data structures, as was noted earlier in this thread.
> > 
> > Yeah, I should go read the code.  But I would need to know where it is
> > and it is still early in the morning over here!  ;-)
> > 
> > I probably should also have read the remainder of the thread before
> > replying, as well.  But what is the fun in that?
> 
> The use-case is to quiesce submissions to queues. This flow is where we
> want to teardown stuff, and we can potentially have 1000's of queues
> that we need to quiesce each one.
> 
> each queue (hctx) has either rcu or srcu depending if it may sleep
> during submission.
> 
> The goal is that the overall quiesce should be fast, so we want
> to wait for all of these queues elapsed period ~once, in parallel,
> instead of synchronizing each serially as done today.
> 
> The guys here are resisting to add a rcu_synchronize to each and
> every hctx because it will take 32 bytes more or less from 1000's
> of hctxs.
> 
> Dynamically allocating each one is possible but not very scalable.
> 
> The question is if there is some way, we can do this with on-stack
> or a single on-heap rcu_head or equivalent that can achieve the same
> effect.

If the hctx structures are guaranteed to stay put, you could count
them and then do a single allocation of an array of rcu_head structures
(or some larger structure containing an rcu_head structure, if needed).
You could then sequence through this array, consuming one rcu_head per
hctx as you processed it.  Once all the callbacks had been invoked,
it would be safe to free the array.

Sounds too simple, though.  So what am I missing?

							Thanx, Paul

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v5 1/2] blk-mq: add tagset quiesce interface
  2020-07-29  0:31             ` Paul E. McKenney
@ 2020-07-29  0:43               ` Sagi Grimberg
  2020-07-29  0:59                 ` Keith Busch
  2020-07-29  4:10                 ` Paul E. McKenney
  0 siblings, 2 replies; 40+ messages in thread
From: Sagi Grimberg @ 2020-07-29  0:43 UTC (permalink / raw)
  To: paulmck
  Cc: Ming Lei, Christoph Hellwig, Jens Axboe, linux-nvme, linux-block,
	Chao Leng, Keith Busch, Ming Lin


>> Dynamically allocating each one is possible but not very scalable.
>>
>> The question is if there is some way, we can do this with on-stack
>> or a single on-heap rcu_head or equivalent that can achieve the same
>> effect.
> 
> If the hctx structures are guaranteed to stay put, you could count
> them and then do a single allocation of an array of rcu_head structures
> (or some larger structure containing an rcu_head structure, if needed).
> You could then sequence through this array, consuming one rcu_head per
> hctx as you processed it.  Once all the callbacks had been invoked,
> it would be safe to free the array.
> 
> Sounds too simple, though.  So what am I missing?

We don't want higher-order allocations...

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v5 1/2] blk-mq: add tagset quiesce interface
  2020-07-29  0:43               ` Sagi Grimberg
@ 2020-07-29  0:59                 ` Keith Busch
  2020-07-29  4:39                   ` Sagi Grimberg
  2020-07-29  4:10                 ` Paul E. McKenney
  1 sibling, 1 reply; 40+ messages in thread
From: Keith Busch @ 2020-07-29  0:59 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: paulmck, Ming Lei, Christoph Hellwig, Jens Axboe, linux-nvme,
	linux-block, Chao Leng, Ming Lin

On Tue, Jul 28, 2020 at 05:43:02PM -0700, Sagi Grimberg wrote:
> 
> > > Dynamically allocating each one is possible but not very scalable.
> > > 
> > > The question is if there is some way, we can do this with on-stack
> > > or a single on-heap rcu_head or equivalent that can achieve the same
> > > effect.
> > 
> > If the hctx structures are guaranteed to stay put, you could count
> > them and then do a single allocation of an array of rcu_head structures
> > (or some larger structure containing an rcu_head structure, if needed).
> > You could then sequence through this array, consuming one rcu_head per
> > hctx as you processed it.  Once all the callbacks had been invoked,
> > it would be safe to free the array.
> > 
> > Sounds too simple, though.  So what am I missing?
> 
> We don't want higher-order allocations...

So:

  (1) We don't want to embed the struct in the hctx because we allocate
  so many of them that this is non-negligable to add for something we
  typically never use.

  (2) We don't want to allocate dynamically because it's potentially
  huge.

As long as we're using srcu for blocking hctx's, I think it's "pick your
poison".

Alternatively, Ming's percpu_ref patch(*) may be worth a look.

 * https://www.spinics.net/lists/linux-block/msg56976.html1

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v5 1/2] blk-mq: add tagset quiesce interface
  2020-07-29  0:43               ` Sagi Grimberg
  2020-07-29  0:59                 ` Keith Busch
@ 2020-07-29  4:10                 ` Paul E. McKenney
  2020-07-29  4:37                   ` Sagi Grimberg
  1 sibling, 1 reply; 40+ messages in thread
From: Paul E. McKenney @ 2020-07-29  4:10 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Ming Lei, Christoph Hellwig, Jens Axboe, linux-nvme, linux-block,
	Chao Leng, Keith Busch, Ming Lin

On Tue, Jul 28, 2020 at 05:43:02PM -0700, Sagi Grimberg wrote:
> 
> > > Dynamically allocating each one is possible but not very scalable.
> > > 
> > > The question is if there is some way, we can do this with on-stack
> > > or a single on-heap rcu_head or equivalent that can achieve the same
> > > effect.
> > 
> > If the hctx structures are guaranteed to stay put, you could count
> > them and then do a single allocation of an array of rcu_head structures
> > (or some larger structure containing an rcu_head structure, if needed).
> > You could then sequence through this array, consuming one rcu_head per
> > hctx as you processed it.  Once all the callbacks had been invoked,
> > it would be safe to free the array.
> > 
> > Sounds too simple, though.  So what am I missing?
> 
> We don't want higher-order allocations...

OK, I will bite...  Do multiple lower-order allocations (page size is
still lower-order, correct?) and link them together.

Sorry, couldn't resist...

							Thanx, Paul

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v5 1/2] blk-mq: add tagset quiesce interface
  2020-07-29  4:10                 ` Paul E. McKenney
@ 2020-07-29  4:37                   ` Sagi Grimberg
  0 siblings, 0 replies; 40+ messages in thread
From: Sagi Grimberg @ 2020-07-29  4:37 UTC (permalink / raw)
  To: paulmck
  Cc: Ming Lei, Christoph Hellwig, Jens Axboe, linux-nvme, linux-block,
	Chao Leng, Keith Busch, Ming Lin


>>>> Dynamically allocating each one is possible but not very scalable.
>>>>
>>>> The question is if there is some way, we can do this with on-stack
>>>> or a single on-heap rcu_head or equivalent that can achieve the same
>>>> effect.
>>>
>>> If the hctx structures are guaranteed to stay put, you could count
>>> them and then do a single allocation of an array of rcu_head structures
>>> (or some larger structure containing an rcu_head structure, if needed).
>>> You could then sequence through this array, consuming one rcu_head per
>>> hctx as you processed it.  Once all the callbacks had been invoked,
>>> it would be safe to free the array.
>>>
>>> Sounds too simple, though.  So what am I missing?
>>
>> We don't want higher-order allocations...
> 
> OK, I will bite...  Do multiple lower-order allocations (page size is
> still lower-order, correct?) and link them together.
> 
> Sorry, couldn't resist...

Possible, but I didn't want us to resort to all this complexity and
thought we can find a better, simpler solution.

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v5 1/2] blk-mq: add tagset quiesce interface
  2020-07-29  0:59                 ` Keith Busch
@ 2020-07-29  4:39                   ` Sagi Grimberg
  2020-08-07  9:04                     ` Chao Leng
  0 siblings, 1 reply; 40+ messages in thread
From: Sagi Grimberg @ 2020-07-29  4:39 UTC (permalink / raw)
  To: Keith Busch
  Cc: paulmck, Ming Lei, Christoph Hellwig, Jens Axboe, linux-nvme,
	linux-block, Chao Leng, Ming Lin


>>>> Dynamically allocating each one is possible but not very scalable.
>>>>
>>>> The question is if there is some way, we can do this with on-stack
>>>> or a single on-heap rcu_head or equivalent that can achieve the same
>>>> effect.
>>>
>>> If the hctx structures are guaranteed to stay put, you could count
>>> them and then do a single allocation of an array of rcu_head structures
>>> (or some larger structure containing an rcu_head structure, if needed).
>>> You could then sequence through this array, consuming one rcu_head per
>>> hctx as you processed it.  Once all the callbacks had been invoked,
>>> it would be safe to free the array.
>>>
>>> Sounds too simple, though.  So what am I missing?
>>
>> We don't want higher-order allocations...
> 
> So:
> 
>    (1) We don't want to embed the struct in the hctx because we allocate
>    so many of them that this is non-negligable to add for something we
>    typically never use.
> 
>    (2) We don't want to allocate dynamically because it's potentially
>    huge.
> 
> As long as we're using srcu for blocking hctx's, I think it's "pick your
> poison".
> 
> Alternatively, Ming's percpu_ref patch(*) may be worth a look.
> 
>   * https://www.spinics.net/lists/linux-block/msg56976.html1
I'm not opposed to having this. Will require some more testing
as this affects pretty much every driver out there..

If we are going with a lightweight percpu_ref, can we just do
it also for non-blocking hctx and have a single code-path?

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v5 1/2] blk-mq: add tagset quiesce interface
  2020-07-29  4:39                   ` Sagi Grimberg
@ 2020-08-07  9:04                     ` Chao Leng
  2020-08-07  9:24                       ` Ming Lei
  0 siblings, 1 reply; 40+ messages in thread
From: Chao Leng @ 2020-08-07  9:04 UTC (permalink / raw)
  To: Sagi Grimberg, Keith Busch
  Cc: paulmck, Ming Lei, Christoph Hellwig, Jens Axboe, linux-nvme,
	linux-block, Ming Lin



On 2020/7/29 12:39, Sagi Grimberg wrote:
> 
>>>>> Dynamically allocating each one is possible but not very scalable.
>>>>>
>>>>> The question is if there is some way, we can do this with on-stack
>>>>> or a single on-heap rcu_head or equivalent that can achieve the same
>>>>> effect.
>>>>
>>>> If the hctx structures are guaranteed to stay put, you could count
>>>> them and then do a single allocation of an array of rcu_head structures
>>>> (or some larger structure containing an rcu_head structure, if needed).
>>>> You could then sequence through this array, consuming one rcu_head per
>>>> hctx as you processed it.  Once all the callbacks had been invoked,
>>>> it would be safe to free the array.
>>>>
>>>> Sounds too simple, though.  So what am I missing?
>>>
>>> We don't want higher-order allocations...
>>
>> So:
>>
>>    (1) We don't want to embed the struct in the hctx because we allocate
>>    so many of them that this is non-negligable to add for something we
>>    typically never use.
>>
>>    (2) We don't want to allocate dynamically because it's potentially
>>    huge.
>>
>> As long as we're using srcu for blocking hctx's, I think it's "pick your
>> poison".
>>
>> Alternatively, Ming's percpu_ref patch(*) may be worth a look.
>>
>>   * https://www.spinics.net/lists/linux-block/msg56976.html1
> I'm not opposed to having this. Will require some more testing
> as this affects pretty much every driver out there..
> 
> If we are going with a lightweight percpu_ref, can we just do
> it also for non-blocking hctx and have a single code-path?
> .
I tried to optimize the patch,support for non blocking queue and
blocking queue.
See next email.

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v5 1/2] blk-mq: add tagset quiesce interface
  2020-08-07  9:04                     ` Chao Leng
@ 2020-08-07  9:24                       ` Ming Lei
  2020-08-07  9:35                         ` Chao Leng
  0 siblings, 1 reply; 40+ messages in thread
From: Ming Lei @ 2020-08-07  9:24 UTC (permalink / raw)
  To: Chao Leng
  Cc: Sagi Grimberg, Keith Busch, paulmck, Christoph Hellwig,
	Jens Axboe, linux-nvme, linux-block, Ming Lin

On Fri, Aug 07, 2020 at 05:04:38PM +0800, Chao Leng wrote:
> 
> 
> On 2020/7/29 12:39, Sagi Grimberg wrote:
> > 
> > > > > > Dynamically allocating each one is possible but not very scalable.
> > > > > > 
> > > > > > The question is if there is some way, we can do this with on-stack
> > > > > > or a single on-heap rcu_head or equivalent that can achieve the same
> > > > > > effect.
> > > > > 
> > > > > If the hctx structures are guaranteed to stay put, you could count
> > > > > them and then do a single allocation of an array of rcu_head structures
> > > > > (or some larger structure containing an rcu_head structure, if needed).
> > > > > You could then sequence through this array, consuming one rcu_head per
> > > > > hctx as you processed it.  Once all the callbacks had been invoked,
> > > > > it would be safe to free the array.
> > > > > 
> > > > > Sounds too simple, though.  So what am I missing?
> > > > 
> > > > We don't want higher-order allocations...
> > > 
> > > So:
> > > 
> > >    (1) We don't want to embed the struct in the hctx because we allocate
> > >    so many of them that this is non-negligable to add for something we
> > >    typically never use.
> > > 
> > >    (2) We don't want to allocate dynamically because it's potentially
> > >    huge.
> > > 
> > > As long as we're using srcu for blocking hctx's, I think it's "pick your
> > > poison".
> > > 
> > > Alternatively, Ming's percpu_ref patch(*) may be worth a look.
> > > 
> > >   * https://www.spinics.net/lists/linux-block/msg56976.html1
> > I'm not opposed to having this. Will require some more testing
> > as this affects pretty much every driver out there..
> > 
> > If we are going with a lightweight percpu_ref, can we just do
> > it also for non-blocking hctx and have a single code-path?
> > .
> I tried to optimize the patch,support for non blocking queue and
> blocking queue.
> See next email.

Please see the following thread:

https://lore.kernel.org/linux-block/05f75e89-b6f7-de49-eb9f-a08aa4e0ba4f@kernel.dk/

Both Keith and Jens didn't think it is a good idea.



Thanks,
Ming


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v5 1/2] blk-mq: add tagset quiesce interface
  2020-08-07  9:24                       ` Ming Lei
@ 2020-08-07  9:35                         ` Chao Leng
  0 siblings, 0 replies; 40+ messages in thread
From: Chao Leng @ 2020-08-07  9:35 UTC (permalink / raw)
  To: Ming Lei
  Cc: Sagi Grimberg, Keith Busch, paulmck, Christoph Hellwig,
	Jens Axboe, linux-nvme, linux-block, Ming Lin



On 2020/8/7 17:24, Ming Lei wrote:
> On Fri, Aug 07, 2020 at 05:04:38PM +0800, Chao Leng wrote:
>>
>>
>> On 2020/7/29 12:39, Sagi Grimberg wrote:
>>>
>>>>>>> Dynamically allocating each one is possible but not very scalable.
>>>>>>>
>>>>>>> The question is if there is some way, we can do this with on-stack
>>>>>>> or a single on-heap rcu_head or equivalent that can achieve the same
>>>>>>> effect.
>>>>>>
>>>>>> If the hctx structures are guaranteed to stay put, you could count
>>>>>> them and then do a single allocation of an array of rcu_head structures
>>>>>> (or some larger structure containing an rcu_head structure, if needed).
>>>>>> You could then sequence through this array, consuming one rcu_head per
>>>>>> hctx as you processed it.  Once all the callbacks had been invoked,
>>>>>> it would be safe to free the array.
>>>>>>
>>>>>> Sounds too simple, though.  So what am I missing?
>>>>>
>>>>> We don't want higher-order allocations...
>>>>
>>>> So:
>>>>
>>>>     (1) We don't want to embed the struct in the hctx because we allocate
>>>>     so many of them that this is non-negligable to add for something we
>>>>     typically never use.
>>>>
>>>>     (2) We don't want to allocate dynamically because it's potentially
>>>>     huge.
>>>>
>>>> As long as we're using srcu for blocking hctx's, I think it's "pick your
>>>> poison".
>>>>
>>>> Alternatively, Ming's percpu_ref patch(*) may be worth a look.
>>>>
>>>>    * https://www.spinics.net/lists/linux-block/msg56976.html1
>>> I'm not opposed to having this. Will require some more testing
>>> as this affects pretty much every driver out there..
>>>
>>> If we are going with a lightweight percpu_ref, can we just do
>>> it also for non-blocking hctx and have a single code-path?
>>> .
>> I tried to optimize the patch,support for non blocking queue and
>> blocking queue.
>> See next email.
> 
> Please see the following thread:
> 
> https://lore.kernel.org/linux-block/05f75e89-b6f7-de49-eb9f-a08aa4e0ba4f@kernel.dk/
> 
> Both Keith and Jens didn't think it is a good idea.
If we can support nonblocking queue and blocking queue simplely, this may be a good choice.
Please review the patch first.
> 
> 
> 
> Thanks,
> Ming
> 
> .
> 

^ permalink raw reply	[flat|nested] 40+ messages in thread

end of thread, back to index

Thread overview: 40+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-07-27 23:10 [PATCH v5 0/2] improve nvme quiesce time for large amount of namespaces Sagi Grimberg
2020-07-27 23:10 ` [PATCH v5 1/2] blk-mq: add tagset quiesce interface Sagi Grimberg
2020-07-27 23:32   ` Keith Busch
2020-07-28  0:12     ` Sagi Grimberg
2020-07-28  1:40   ` Ming Lei
2020-07-28  1:51     ` Jens Axboe
2020-07-28  2:17       ` Ming Lei
2020-07-28  2:23         ` Jens Axboe
2020-07-28  2:28           ` Ming Lei
2020-07-28  2:32             ` Jens Axboe
2020-07-28  3:29               ` Sagi Grimberg
2020-07-28  3:25     ` Sagi Grimberg
2020-07-28  7:18   ` Christoph Hellwig
2020-07-28  7:48     ` Sagi Grimberg
2020-07-28  9:16     ` Ming Lei
2020-07-28  9:24       ` Sagi Grimberg
2020-07-28  9:33         ` Ming Lei
2020-07-28  9:37           ` Sagi Grimberg
2020-07-28  9:43             ` Sagi Grimberg
2020-07-28 10:10               ` Ming Lei
2020-07-28 10:57                 ` Christoph Hellwig
2020-07-28 14:13                 ` Paul E. McKenney
2020-07-28 10:58             ` Christoph Hellwig
2020-07-28 16:25               ` Sagi Grimberg
2020-07-28 13:54         ` Paul E. McKenney
2020-07-28 23:46           ` Sagi Grimberg
2020-07-29  0:31             ` Paul E. McKenney
2020-07-29  0:43               ` Sagi Grimberg
2020-07-29  0:59                 ` Keith Busch
2020-07-29  4:39                   ` Sagi Grimberg
2020-08-07  9:04                     ` Chao Leng
2020-08-07  9:24                       ` Ming Lei
2020-08-07  9:35                         ` Chao Leng
2020-07-29  4:10                 ` Paul E. McKenney
2020-07-29  4:37                   ` Sagi Grimberg
2020-07-27 23:10 ` [PATCH v5 2/2] nvme: use blk_mq_[un]quiesce_tagset Sagi Grimberg
2020-07-28  0:54   ` Sagi Grimberg
2020-07-28  3:21     ` Chao Leng
2020-07-28  3:34       ` Sagi Grimberg
2020-07-28  3:51         ` Chao Leng

Linux-Block Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-block/0 linux-block/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-block linux-block/ https://lore.kernel.org/linux-block \
		linux-block@vger.kernel.org
	public-inbox-index linux-block

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-block


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git