* [PATCH v2 0/2] improve nvme quiesce time for large amount of namespaces
@ 2022-10-13  9:44 Chao Leng
  2022-10-13  9:44 ` [PATCH v2 1/2] blk-mq: add tagset quiesce interface Chao Leng
                   ` (2 more replies)
  0 siblings, 3 replies; 40+ messages in thread
From: Chao Leng @ 2022-10-13  9:44 UTC (permalink / raw)
  To: linux-nvme, linux-block; +Cc: hch, sagi, kbusch, lengchao, ming.lei, axboe

This set improves the quiesce time when using a large set of
namespaces, which also improves I/O failover time in a multipath environment.

We improve this for both non-blocking and blocking hctxs by introducing
blk_mq_[un]quiesce_tagset, which works on all request queues of a given
tagset in parallel (in nvme all namespace queues share one tagset).

Changes from v1:
- improvement is based on the tagset rather than namespaces

Chao Leng (2):
  blk-mq: add tagset quiesce interface
  nvme: use blk_mq_[un]quiesce_tagset

 block/blk-mq.c           | 75 ++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/nvme/host/core.c | 42 +++++++++------------------
 drivers/nvme/host/nvme.h |  2 +-
 include/linux/blk-mq.h   |  2 ++
 include/linux/blkdev.h   |  2 ++
 5 files changed, 93 insertions(+), 30 deletions(-)

-- 
2.16.4



* [PATCH v2 1/2] blk-mq: add tagset quiesce interface
  2022-10-13  9:44 [PATCH v2 0/2] improve nvme quiesce time for large amount of namespaces Chao Leng
@ 2022-10-13  9:44 ` Chao Leng
  2022-10-13 10:28   ` Sagi Grimberg
  2022-10-17 13:39   ` Christoph Hellwig
  2022-10-13  9:44 ` [PATCH v2 2/2] nvme: use blk_mq_[un]quiesce_tagset Chao Leng
  2022-10-13 14:32 ` [PATCH v2 0/2] improve nvme quiesce time for large amount of namespaces Chaitanya Kulkarni
  2 siblings, 2 replies; 40+ messages in thread
From: Chao Leng @ 2022-10-13  9:44 UTC (permalink / raw)
  To: linux-nvme, linux-block; +Cc: hch, sagi, kbusch, lengchao, ming.lei, axboe

Drivers that have shared tagsets may need to quiesce a potentially
large number of request queues that all share a single tagset (e.g.
nvme). Add an interface to quiesce all the queues on a given tagset.
This interface is useful because it can speed up the quiesce by doing
it in parallel.

For tagsets that have BLK_MQ_F_BLOCKING set, we use call_srcu on all
the queues in parallel such that all of them wait for the same SRCU
grace period, with a heap-allocated rcu_synchronize per queue. For
tagsets that don't have BLK_MQ_F_BLOCKING set, a single
synchronize_rcu is sufficient.

Because some queues never need to be quiesced (e.g. the nvme
connect_q), introduce QUEUE_FLAG_NOQUIESCED so the tagset quiesce
interface skips them.

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Chao Leng <lengchao@huawei.com>
---
 block/blk-mq.c         | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/blk-mq.h |  2 ++
 include/linux/blkdev.h |  2 ++
 3 files changed, 79 insertions(+)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 8070b6c10e8d..ebe25da08156 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -29,6 +29,7 @@
 #include <linux/prefetch.h>
 #include <linux/blk-crypto.h>
 #include <linux/part_stat.h>
+#include <linux/rcupdate_wait.h>
 
 #include <trace/events/block.h>
 
@@ -311,6 +312,80 @@ void blk_mq_unquiesce_queue(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
 
+static void blk_mq_quiesce_blocking_tagset(struct blk_mq_tag_set *set)
+{
+	int i = 0;
+	int count = 0;
+	struct request_queue *q;
+	struct rcu_synchronize *rcu;
+
+	list_for_each_entry(q, &set->tag_list, tag_set_list) {
+		if (blk_queue_noquiesced(q))
+			continue;
+
+		blk_mq_quiesce_queue_nowait(q);
+		count++;
+	}
+
+	rcu = kvmalloc(count * sizeof(*rcu), GFP_KERNEL);
+	if (rcu) {
+		list_for_each_entry(q, &set->tag_list, tag_set_list) {
+			if (blk_queue_noquiesced(q))
+				continue;
+
+			init_rcu_head(&rcu[i].head);
+			init_completion(&rcu[i].completion);
+			call_srcu(q->srcu, &rcu[i].head, wakeme_after_rcu);
+			i++;
+		}
+
+		for (i = 0; i < count; i++) {
+			wait_for_completion(&rcu[i].completion);
+			destroy_rcu_head(&rcu[i].head);
+		}
+		kvfree(rcu);
+	} else {
+		list_for_each_entry(q, &set->tag_list, tag_set_list)
+			synchronize_srcu(q->srcu);
+	}
+}
+
+static void blk_mq_quiesce_nonblocking_tagset(struct blk_mq_tag_set *set)
+{
+	struct request_queue *q;
+
+	list_for_each_entry(q, &set->tag_list, tag_set_list) {
+		if (blk_queue_noquiesced(q))
+			continue;
+
+		blk_mq_quiesce_queue_nowait(q);
+	}
+	synchronize_rcu();
+}
+
+void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set)
+{
+	mutex_lock(&set->tag_list_lock);
+	if (set->flags & BLK_MQ_F_BLOCKING)
+		blk_mq_quiesce_blocking_tagset(set);
+	else
+		blk_mq_quiesce_nonblocking_tagset(set);
+
+	mutex_unlock(&set->tag_list_lock);
+}
+EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset);
+
+void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set)
+{
+	struct request_queue *q;
+
+	mutex_lock(&set->tag_list_lock);
+	list_for_each_entry(q, &set->tag_list, tag_set_list)
+		blk_mq_unquiesce_queue(q);
+	mutex_unlock(&set->tag_list_lock);
+}
+EXPORT_SYMBOL_GPL(blk_mq_unquiesce_tagset);
+
 void blk_mq_wake_waiters(struct request_queue *q)
 {
 	struct blk_mq_hw_ctx *hctx;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index ba18e9bdb799..1df47606d0a7 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -877,6 +877,8 @@ void blk_mq_start_hw_queues(struct request_queue *q);
 void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
 void blk_mq_quiesce_queue(struct request_queue *q);
+void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set);
+void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set);
 void blk_mq_wait_quiesce_done(struct request_queue *q);
 void blk_mq_unquiesce_queue(struct request_queue *q);
 void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 50e358a19d98..f15544299a67 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -579,6 +579,7 @@ struct request_queue {
 #define QUEUE_FLAG_HCTX_ACTIVE	28	/* at least one blk-mq hctx is active */
 #define QUEUE_FLAG_NOWAIT       29	/* device supports NOWAIT */
 #define QUEUE_FLAG_SQ_SCHED     30	/* single queue style io dispatch */
+#define QUEUE_FLAG_NOQUIESCED	31	/* queue is never quiesced */
 
 #define QUEUE_FLAG_MQ_DEFAULT	((1UL << QUEUE_FLAG_IO_STAT) |		\
 				 (1UL << QUEUE_FLAG_SAME_COMP) |	\
@@ -619,6 +620,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
 #define blk_queue_pm_only(q)	atomic_read(&(q)->pm_only)
 #define blk_queue_registered(q)	test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags)
 #define blk_queue_sq_sched(q)	test_bit(QUEUE_FLAG_SQ_SCHED, &(q)->queue_flags)
+#define blk_queue_noquiesced(q)	test_bit(QUEUE_FLAG_NOQUIESCED, &(q)->queue_flags)
 
 extern void blk_set_pm_only(struct request_queue *q);
 extern void blk_clear_pm_only(struct request_queue *q);
-- 
2.16.4



* [PATCH v2 2/2] nvme: use blk_mq_[un]quiesce_tagset
  2022-10-13  9:44 [PATCH v2 0/2] improve nvme quiesce time for large amount of namespaces Chao Leng
  2022-10-13  9:44 ` [PATCH v2 1/2] blk-mq: add tagset quiesce interface Chao Leng
@ 2022-10-13  9:44 ` Chao Leng
  2022-10-13 10:22   ` Sagi Grimberg
  2022-10-13 14:32 ` [PATCH v2 0/2] improve nvme quiesce time for large amount of namespaces Chaitanya Kulkarni
  2 siblings, 1 reply; 40+ messages in thread
From: Chao Leng @ 2022-10-13  9:44 UTC (permalink / raw)
  To: linux-nvme, linux-block; +Cc: hch, sagi, kbusch, lengchao, ming.lei, axboe

All controller namespaces share the same tagset, so we can use this
interface, which does the optimal operation for parallel quiesce based
on the tagset type (blocking or non-blocking).

NVME_NS_STOPPED is currently used to ensure that quiescing and
unquiescing are paired. With blk_mq_[un]quiesce_tagset, NVME_NS_STOPPED
no longer works, so introduce NVME_CTRL_STOPPED to replace it.

The nvme connect_q should never be quiesced, so set
QUEUE_FLAG_NOQUIESCED when initializing connect_q.

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Chao Leng <lengchao@huawei.com>
---
 drivers/nvme/host/core.c | 42 +++++++++++++-----------------------------
 drivers/nvme/host/nvme.h |  2 +-
 2 files changed, 14 insertions(+), 30 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 059737c1a2c1..0d07a07e02ff 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4890,6 +4890,7 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
 			ret = PTR_ERR(ctrl->connect_q);
 			goto out_free_tag_set;
 		}
+		blk_queue_flag_set(QUEUE_FLAG_NOQUIESCED, ctrl->connect_q);
 	}
 
 	ctrl->tagset = set;
@@ -5089,20 +5090,6 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 }
 EXPORT_SYMBOL_GPL(nvme_init_ctrl);
 
-static void nvme_start_ns_queue(struct nvme_ns *ns)
-{
-	if (test_and_clear_bit(NVME_NS_STOPPED, &ns->flags))
-		blk_mq_unquiesce_queue(ns->queue);
-}
-
-static void nvme_stop_ns_queue(struct nvme_ns *ns)
-{
-	if (!test_and_set_bit(NVME_NS_STOPPED, &ns->flags))
-		blk_mq_quiesce_queue(ns->queue);
-	else
-		blk_mq_wait_quiesce_done(ns->queue);
-}
-
 /*
  * Prepare a queue for teardown.
  *
@@ -5111,13 +5098,14 @@ static void nvme_stop_ns_queue(struct nvme_ns *ns)
  * holding bd_butex.  This will end buffered writers dirtying pages that can't
  * be synced.
  */
-static void nvme_set_queue_dying(struct nvme_ns *ns)
+static void nvme_set_queue_dying(struct nvme_ns *ns, bool start_queue)
 {
 	if (test_and_set_bit(NVME_NS_DEAD, &ns->flags))
 		return;
 
 	blk_mark_disk_dead(ns->disk);
-	nvme_start_ns_queue(ns);
+	if (start_queue)
+		blk_mq_unquiesce_queue(ns->queue);
 
 	set_capacity_and_notify(ns->disk, 0);
 }
@@ -5132,6 +5120,7 @@ static void nvme_set_queue_dying(struct nvme_ns *ns)
 void nvme_kill_queues(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	bool start_queue = false;
 
 	down_read(&ctrl->namespaces_rwsem);
 
@@ -5139,8 +5128,11 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl)
 	if (ctrl->admin_q && !blk_queue_dying(ctrl->admin_q))
 		nvme_start_admin_queue(ctrl);
 
+	if (test_and_clear_bit(NVME_CTRL_STOPPED, &ctrl->flags))
+		start_queue = true;
+
 	list_for_each_entry(ns, &ctrl->namespaces, list)
-		nvme_set_queue_dying(ns);
+		nvme_set_queue_dying(ns, start_queue);
 
 	up_read(&ctrl->namespaces_rwsem);
 }
@@ -5196,23 +5188,15 @@ EXPORT_SYMBOL_GPL(nvme_start_freeze);
 
 void nvme_stop_queues(struct nvme_ctrl *ctrl)
 {
-	struct nvme_ns *ns;
-
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
-		nvme_stop_ns_queue(ns);
-	up_read(&ctrl->namespaces_rwsem);
+	if (!test_and_set_bit(NVME_CTRL_STOPPED, &ctrl->flags))
+		blk_mq_quiesce_tagset(ctrl->tagset);
 }
 EXPORT_SYMBOL_GPL(nvme_stop_queues);
 
 void nvme_start_queues(struct nvme_ctrl *ctrl)
 {
-	struct nvme_ns *ns;
-
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
-		nvme_start_ns_queue(ns);
-	up_read(&ctrl->namespaces_rwsem);
+	if (test_and_clear_bit(NVME_CTRL_STOPPED, &ctrl->flags))
+		blk_mq_unquiesce_tagset(ctrl->tagset);
 }
 EXPORT_SYMBOL_GPL(nvme_start_queues);
 
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index a29877217ee6..8e2f13a1e23a 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -237,6 +237,7 @@ enum nvme_ctrl_flags {
 	NVME_CTRL_FAILFAST_EXPIRED	= 0,
 	NVME_CTRL_ADMIN_Q_STOPPED	= 1,
 	NVME_CTRL_STARTED_ONCE		= 2,
+	NVME_CTRL_STOPPED		= 3,
 };
 
 struct nvme_ctrl {
@@ -487,7 +488,6 @@ struct nvme_ns {
 #define NVME_NS_ANA_PENDING	2
 #define NVME_NS_FORCE_RO	3
 #define NVME_NS_READY		4
-#define NVME_NS_STOPPED		5
 
 	struct cdev		cdev;
 	struct device		cdev_device;
-- 
2.16.4



* Re: [PATCH v2 2/2] nvme: use blk_mq_[un]quiesce_tagset
  2022-10-13  9:44 ` [PATCH v2 2/2] nvme: use blk_mq_[un]quiesce_tagset Chao Leng
@ 2022-10-13 10:22   ` Sagi Grimberg
  2022-10-14  2:09     ` Chao Leng
  2022-10-17 13:48     ` Christoph Hellwig
  0 siblings, 2 replies; 40+ messages in thread
From: Sagi Grimberg @ 2022-10-13 10:22 UTC (permalink / raw)
  To: Chao Leng, linux-nvme, linux-block; +Cc: hch, kbusch, ming.lei, axboe



On 10/13/22 12:44, Chao Leng wrote:
> All controller namespaces share the same tagset, so we can use this
> interface, which does the optimal operation for parallel quiesce based
> on the tagset type (blocking or non-blocking).
> 
> NVME_NS_STOPPED is currently used to ensure that quiescing and
> unquiescing are paired. With blk_mq_[un]quiesce_tagset, NVME_NS_STOPPED
> no longer works, so introduce NVME_CTRL_STOPPED to replace it.
> 
> The nvme connect_q should never be quiesced, so set
> QUEUE_FLAG_NOQUIESCED when initializing connect_q.
> 
> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
> Signed-off-by: Chao Leng <lengchao@huawei.com>
> ---
>   drivers/nvme/host/core.c | 42 +++++++++++++-----------------------------
>   drivers/nvme/host/nvme.h |  2 +-
>   2 files changed, 14 insertions(+), 30 deletions(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 059737c1a2c1..0d07a07e02ff 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -4890,6 +4890,7 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
>   			ret = PTR_ERR(ctrl->connect_q);
>   			goto out_free_tag_set;
>   		}
> +		blk_queue_flag_set(QUEUE_FLAG_NOQUIESCED, ctrl->connect_q);
>   	}
>   
>   	ctrl->tagset = set;
> @@ -5089,20 +5090,6 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
>   }
>   EXPORT_SYMBOL_GPL(nvme_init_ctrl);
>   
> -static void nvme_start_ns_queue(struct nvme_ns *ns)
> -{
> -	if (test_and_clear_bit(NVME_NS_STOPPED, &ns->flags))
> -		blk_mq_unquiesce_queue(ns->queue);
> -}
> -
> -static void nvme_stop_ns_queue(struct nvme_ns *ns)
> -{
> -	if (!test_and_set_bit(NVME_NS_STOPPED, &ns->flags))
> -		blk_mq_quiesce_queue(ns->queue);
> -	else
> -		blk_mq_wait_quiesce_done(ns->queue);
> -}
> -
>   /*
>    * Prepare a queue for teardown.
>    *
> @@ -5111,13 +5098,14 @@ static void nvme_stop_ns_queue(struct nvme_ns *ns)
>    * holding bd_butex.  This will end buffered writers dirtying pages that can't
>    * be synced.
>    */
> -static void nvme_set_queue_dying(struct nvme_ns *ns)
> +static void nvme_set_queue_dying(struct nvme_ns *ns, bool start_queue)
>   {
>   	if (test_and_set_bit(NVME_NS_DEAD, &ns->flags))
>   		return;
>   
>   	blk_mark_disk_dead(ns->disk);
> -	nvme_start_ns_queue(ns);
> +	if (start_queue)
> +		blk_mq_unquiesce_queue(ns->queue);
>   
>   	set_capacity_and_notify(ns->disk, 0);
>   }
> @@ -5132,6 +5120,7 @@ static void nvme_set_queue_dying(struct nvme_ns *ns)
>   void nvme_kill_queues(struct nvme_ctrl *ctrl)
>   {
>   	struct nvme_ns *ns;
> +	bool start_queue = false;
>   
>   	down_read(&ctrl->namespaces_rwsem);
>   
> @@ -5139,8 +5128,11 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl)
>   	if (ctrl->admin_q && !blk_queue_dying(ctrl->admin_q))
>   		nvme_start_admin_queue(ctrl);
>   
> +	if (test_and_clear_bit(NVME_CTRL_STOPPED, &ctrl->flags))
> +		start_queue = true;

Why can't we just start the queues here? Just call nvme_start_queues()?
Why does it need to happen only after we mark the disk dead?
The documentation only says that we need to set the capacity to 0
after we unquiesce, but it doesn't say that we can't unquiesce earlier.

Something like this:
--
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 7e840549a940..27dc393eed97 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -5127,8 +5127,6 @@ static void nvme_set_queue_dying(struct nvme_ns *ns)
                 return;

         blk_mark_disk_dead(ns->disk);
-       nvme_start_ns_queue(ns);
-
         set_capacity_and_notify(ns->disk, 0);
  }

@@ -5149,6 +5147,9 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl)
         if (ctrl->admin_q && !blk_queue_dying(ctrl->admin_q))
                 nvme_start_admin_queue(ctrl);

+       if (test_and_clear_bit(NVME_CTRL_STOPPED, &ctrl->flags))
+               nvme_start_queues(ctrl);
+
         list_for_each_entry(ns, &ctrl->namespaces, list)
                 nvme_set_queue_dying(ns);
--


* Re: [PATCH v2 1/2] blk-mq: add tagset quiesce interface
  2022-10-13  9:44 ` [PATCH v2 1/2] blk-mq: add tagset quiesce interface Chao Leng
@ 2022-10-13 10:28   ` Sagi Grimberg
  2022-10-14  2:09     ` Chao Leng
  2022-10-17 13:43     ` Christoph Hellwig
  2022-10-17 13:39   ` Christoph Hellwig
  1 sibling, 2 replies; 40+ messages in thread
From: Sagi Grimberg @ 2022-10-13 10:28 UTC (permalink / raw)
  To: Chao Leng, linux-nvme, linux-block; +Cc: hch, kbusch, ming.lei, axboe



On 10/13/22 12:44, Chao Leng wrote:
> Drivers that have shared tagsets may need to quiesce a potentially
> large number of request queues that all share a single tagset (e.g.
> nvme). Add an interface to quiesce all the queues on a given tagset.
> This interface is useful because it can speed up the quiesce by doing
> it in parallel.
> 
> For tagsets that have BLK_MQ_F_BLOCKING set, we use call_srcu on all
> the queues in parallel such that all of them wait for the same SRCU
> grace period, with a heap-allocated rcu_synchronize per queue. For
> tagsets that don't have BLK_MQ_F_BLOCKING set, a single
> synchronize_rcu is sufficient.
> 
> Because some queues never need to be quiesced (e.g. the nvme
> connect_q), introduce QUEUE_FLAG_NOQUIESCED so the tagset quiesce
> interface skips them.

I wouldn't say it never quiesces nor ever will; we just don't happen to
quiesce it today...

> 
> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
> Signed-off-by: Chao Leng <lengchao@huawei.com>
> ---
>   block/blk-mq.c         | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++
>   include/linux/blk-mq.h |  2 ++
>   include/linux/blkdev.h |  2 ++
>   3 files changed, 79 insertions(+)
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 8070b6c10e8d..ebe25da08156 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -29,6 +29,7 @@
>   #include <linux/prefetch.h>
>   #include <linux/blk-crypto.h>
>   #include <linux/part_stat.h>
> +#include <linux/rcupdate_wait.h>
>   
>   #include <trace/events/block.h>
>   
> @@ -311,6 +312,80 @@ void blk_mq_unquiesce_queue(struct request_queue *q)
>   }
>   EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
>   
> +static void blk_mq_quiesce_blocking_tagset(struct blk_mq_tag_set *set)
> +{
> +	int i = 0;
> +	int count = 0;
> +	struct request_queue *q;
> +	struct rcu_synchronize *rcu;
> +
> +	list_for_each_entry(q, &set->tag_list, tag_set_list) {
> +		if (blk_queue_noquiesced(q))
> +			continue;
> +
> +		blk_mq_quiesce_queue_nowait(q);
> +		count++;
> +	}
> +
> +	rcu = kvmalloc(count * sizeof(*rcu), GFP_KERNEL);
> +	if (rcu) {
> +		list_for_each_entry(q, &set->tag_list, tag_set_list) {
> +			if (blk_queue_noquiesced(q))
> +				continue;
> +
> +			init_rcu_head(&rcu[i].head);
> +			init_completion(&rcu[i].completion);
> +			call_srcu(q->srcu, &rcu[i].head, wakeme_after_rcu);
> +			i++;
> +		}
> +
> +		for (i = 0; i < count; i++) {
> +			wait_for_completion(&rcu[i].completion);
> +			destroy_rcu_head(&rcu[i].head);
> +		}
> +		kvfree(rcu);
> +	} else {
> +		list_for_each_entry(q, &set->tag_list, tag_set_list)
> +			synchronize_srcu(q->srcu);
> +	}
> +}
> +
> +static void blk_mq_quiesce_nonblocking_tagset(struct blk_mq_tag_set *set)
> +{
> +	struct request_queue *q;
> +
> +	list_for_each_entry(q, &set->tag_list, tag_set_list) {
> +		if (blk_queue_noquiesced(q))
> +			continue;
> +
> +		blk_mq_quiesce_queue_nowait(q);
> +	}
> +	synchronize_rcu();
> +}
> +
> +void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set)
> +{
> +	mutex_lock(&set->tag_list_lock);
> +	if (set->flags & BLK_MQ_F_BLOCKING)
> +		blk_mq_quiesce_blocking_tagset(set);
> +	else
> +		blk_mq_quiesce_nonblocking_tagset(set);
> +
> +	mutex_unlock(&set->tag_list_lock);
> +}
> +EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset);
> +
> +void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set)
> +{
> +	struct request_queue *q;
> +
> +	mutex_lock(&set->tag_list_lock);
> +	list_for_each_entry(q, &set->tag_list, tag_set_list)
> +		blk_mq_unquiesce_queue(q);
> +	mutex_unlock(&set->tag_list_lock);
> +}
> +EXPORT_SYMBOL_GPL(blk_mq_unquiesce_tagset);
> +
>   void blk_mq_wake_waiters(struct request_queue *q)
>   {
>   	struct blk_mq_hw_ctx *hctx;
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index ba18e9bdb799..1df47606d0a7 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -877,6 +877,8 @@ void blk_mq_start_hw_queues(struct request_queue *q);
>   void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
>   void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
>   void blk_mq_quiesce_queue(struct request_queue *q);
> +void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set);
> +void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set);
>   void blk_mq_wait_quiesce_done(struct request_queue *q);
>   void blk_mq_unquiesce_queue(struct request_queue *q);
>   void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 50e358a19d98..f15544299a67 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -579,6 +579,7 @@ struct request_queue {
>   #define QUEUE_FLAG_HCTX_ACTIVE	28	/* at least one blk-mq hctx is active */
>   #define QUEUE_FLAG_NOWAIT       29	/* device supports NOWAIT */
>   #define QUEUE_FLAG_SQ_SCHED     30	/* single queue style io dispatch */
> +#define QUEUE_FLAG_NOQUIESCED	31	/* queue is never quiesced */

the comment is misleading. If this truly is a queue that is never
quiescing then blk_mq_quiesce_queue() and friends need to skip it.

I'd call it self_quiesce or something that would reflect that it is
black-listed from tagset-wide quiesce.
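
Something like this, for illustration only (hypothetical naming, not in
the patch):

	#define QUEUE_FLAG_SKIP_TAGSET_QUIESCE	31	/* quiesced by its owner, skipped by tagset-wide quiesce */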


* Re: [PATCH v2 0/2] improve nvme quiesce time for large amount of namespaces
  2022-10-13  9:44 [PATCH v2 0/2] improve nvme quiesce time for large amount of namespaces Chao Leng
  2022-10-13  9:44 ` [PATCH v2 1/2] blk-mq: add tagset quiesce interface Chao Leng
  2022-10-13  9:44 ` [PATCH v2 2/2] nvme: use blk_mq_[un]quiesce_tagset Chao Leng
@ 2022-10-13 14:32 ` Chaitanya Kulkarni
  2022-10-14  2:12   ` Chao Leng
  2 siblings, 1 reply; 40+ messages in thread
From: Chaitanya Kulkarni @ 2022-10-13 14:32 UTC (permalink / raw)
  To: Chao Leng, linux-nvme, linux-block; +Cc: hch, sagi, kbusch, ming.lei, axboe

On 10/13/22 02:44, Chao Leng wrote:
> This set improves the quiesce time when using a large set of
> namespaces, which also improves I/O failover time in a multipath environment.
> 

By how much, and what problems exist with the current timing, needs to
be documented, i.e. add quantitative data if you are posting patches for
time/performance improvement.

-ck



* Re: [PATCH v2 2/2] nvme: use blk_mq_[un]quiesce_tagset
  2022-10-13 10:22   ` Sagi Grimberg
@ 2022-10-14  2:09     ` Chao Leng
  2022-10-17 13:48     ` Christoph Hellwig
  1 sibling, 0 replies; 40+ messages in thread
From: Chao Leng @ 2022-10-14  2:09 UTC (permalink / raw)
  To: Sagi Grimberg, linux-nvme, linux-block; +Cc: hch, kbusch, ming.lei, axboe



On 2022/10/13 18:22, Sagi Grimberg wrote:
> 
> 
> On 10/13/22 12:44, Chao Leng wrote:
>> All controller namespaces share the same tagset, so we can use this
>> interface, which does the optimal operation for parallel quiesce based
>> on the tagset type (blocking or non-blocking).
>>
>> NVME_NS_STOPPED is currently used to ensure that quiescing and
>> unquiescing are paired. With blk_mq_[un]quiesce_tagset, NVME_NS_STOPPED
>> no longer works, so introduce NVME_CTRL_STOPPED to replace it.
>>
>> The nvme connect_q should never be quiesced, so set
>> QUEUE_FLAG_NOQUIESCED when initializing connect_q.
>>
>> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
>> Signed-off-by: Chao Leng <lengchao@huawei.com>
>> ---
>>   drivers/nvme/host/core.c | 42 +++++++++++++-----------------------------
>>   drivers/nvme/host/nvme.h |  2 +-
>>   2 files changed, 14 insertions(+), 30 deletions(-)
>>
>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>> index 059737c1a2c1..0d07a07e02ff 100644
>> --- a/drivers/nvme/host/core.c
>> +++ b/drivers/nvme/host/core.c
>> @@ -4890,6 +4890,7 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
>>               ret = PTR_ERR(ctrl->connect_q);
>>               goto out_free_tag_set;
>>           }
>> +        blk_queue_flag_set(QUEUE_FLAG_NOQUIESCED, ctrl->connect_q);
>>       }
>>       ctrl->tagset = set;
>> @@ -5089,20 +5090,6 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
>>   }
>>   EXPORT_SYMBOL_GPL(nvme_init_ctrl);
>> -static void nvme_start_ns_queue(struct nvme_ns *ns)
>> -{
>> -    if (test_and_clear_bit(NVME_NS_STOPPED, &ns->flags))
>> -        blk_mq_unquiesce_queue(ns->queue);
>> -}
>> -
>> -static void nvme_stop_ns_queue(struct nvme_ns *ns)
>> -{
>> -    if (!test_and_set_bit(NVME_NS_STOPPED, &ns->flags))
>> -        blk_mq_quiesce_queue(ns->queue);
>> -    else
>> -        blk_mq_wait_quiesce_done(ns->queue);
>> -}
>> -
>>   /*
>>    * Prepare a queue for teardown.
>>    *
>> @@ -5111,13 +5098,14 @@ static void nvme_stop_ns_queue(struct nvme_ns *ns)
>>    * holding bd_butex.  This will end buffered writers dirtying pages that can't
>>    * be synced.
>>    */
>> -static void nvme_set_queue_dying(struct nvme_ns *ns)
>> +static void nvme_set_queue_dying(struct nvme_ns *ns, bool start_queue)
>>   {
>>       if (test_and_set_bit(NVME_NS_DEAD, &ns->flags))
>>           return;
>>       blk_mark_disk_dead(ns->disk);
>> -    nvme_start_ns_queue(ns);
>> +    if (start_queue)
>> +        blk_mq_unquiesce_queue(ns->queue);
>>       set_capacity_and_notify(ns->disk, 0);
>>   }
>> @@ -5132,6 +5120,7 @@ static void nvme_set_queue_dying(struct nvme_ns *ns)
>>   void nvme_kill_queues(struct nvme_ctrl *ctrl)
>>   {
>>       struct nvme_ns *ns;
>> +    bool start_queue = false;
>>       down_read(&ctrl->namespaces_rwsem);
>> @@ -5139,8 +5128,11 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl)
>>       if (ctrl->admin_q && !blk_queue_dying(ctrl->admin_q))
>>           nvme_start_admin_queue(ctrl);
>> +    if (test_and_clear_bit(NVME_CTRL_STOPPED, &ctrl->flags))
>> +        start_queue = true;
> 
> Why can't we just start the queues here? Just call nvme_start_queues()?
If we move the unquiesce out of nvme_set_queue_dying, this may destroy
the functional atomicity of nvme_set_queue_dying.
Of course, from the current code it can work well.
If others don't object, I'll modify it in patch v3.
> Why does it need to happen only after we mark the disk dead?
> The documentation only says that we need to set the capacity to 0
> after we unquiesce, but it doesn't say that we can't unquiesce earlier.
> 
> Something like this:
> -- 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 7e840549a940..27dc393eed97 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -5127,8 +5127,6 @@ static void nvme_set_queue_dying(struct nvme_ns *ns)
>                  return;
> 
>          blk_mark_disk_dead(ns->disk);
> -       nvme_start_ns_queue(ns);
> -
>          set_capacity_and_notify(ns->disk, 0);
>   }
> 
> @@ -5149,6 +5147,9 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl)
>          if (ctrl->admin_q && !blk_queue_dying(ctrl->admin_q))
>                  nvme_start_admin_queue(ctrl);
> 
> +       if (test_and_clear_bit(NVME_CTRL_STOPPED, &ctrl->flags))
> +               nvme_start_queues(ctrl);
> +
>          list_for_each_entry(ns, &ctrl->namespaces, list)
>                  nvme_set_queue_dying(ns);
> -- 
> 
> .


* Re: [PATCH v2 1/2] blk-mq: add tagset quiesce interface
  2022-10-13 10:28   ` Sagi Grimberg
@ 2022-10-14  2:09     ` Chao Leng
  2022-10-17 13:43     ` Christoph Hellwig
  1 sibling, 0 replies; 40+ messages in thread
From: Chao Leng @ 2022-10-14  2:09 UTC (permalink / raw)
  To: Sagi Grimberg, linux-nvme, linux-block; +Cc: hch, kbusch, ming.lei, axboe



On 2022/10/13 18:28, Sagi Grimberg wrote:
> 
> 
> On 10/13/22 12:44, Chao Leng wrote:
>> Drivers that have shared tagsets may need to quiesce a potentially
>> large number of request queues that all share a single tagset (e.g.
>> nvme). Add an interface to quiesce all the queues on a given tagset.
>> This interface is useful because it can speed up the quiesce by doing
>> it in parallel.
>>
>> For tagsets that have BLK_MQ_F_BLOCKING set, we use call_srcu on all
>> the queues in parallel such that all of them wait for the same SRCU
>> grace period, with a heap-allocated rcu_synchronize per queue. For
>> tagsets that don't have BLK_MQ_F_BLOCKING set, a single
>> synchronize_rcu is sufficient.
>>
>> Because some queues never need to be quiesced (e.g. the nvme
>> connect_q), introduce QUEUE_FLAG_NOQUIESCED so the tagset quiesce
>> interface skips them.
> 
> I wouldn't say it never quiesces nor ever will; we just don't happen to
> quiesce it today...
> 
>>
>> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
>> Signed-off-by: Chao Leng <lengchao@huawei.com>
>> ---
>>   block/blk-mq.c         | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>   include/linux/blk-mq.h |  2 ++
>>   include/linux/blkdev.h |  2 ++
>>   3 files changed, 79 insertions(+)
>>
>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>> index 8070b6c10e8d..ebe25da08156 100644
>> --- a/block/blk-mq.c
>> +++ b/block/blk-mq.c
>> @@ -29,6 +29,7 @@
>>   #include <linux/prefetch.h>
>>   #include <linux/blk-crypto.h>
>>   #include <linux/part_stat.h>
>> +#include <linux/rcupdate_wait.h>
>>   #include <trace/events/block.h>
>> @@ -311,6 +312,80 @@ void blk_mq_unquiesce_queue(struct request_queue *q)
>>   }
>>   EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
>> +static void blk_mq_quiesce_blocking_tagset(struct blk_mq_tag_set *set)
>> +{
>> +    int i = 0;
>> +    int count = 0;
>> +    struct request_queue *q;
>> +    struct rcu_synchronize *rcu;
>> +
>> +    list_for_each_entry(q, &set->tag_list, tag_set_list) {
>> +        if (blk_queue_noquiesced(q))
>> +            continue;
>> +
>> +        blk_mq_quiesce_queue_nowait(q);
>> +        count++;
>> +    }
>> +
>> +    rcu = kvmalloc(count * sizeof(*rcu), GFP_KERNEL);
>> +    if (rcu) {
>> +        list_for_each_entry(q, &set->tag_list, tag_set_list) {
>> +            if (blk_queue_noquiesced(q))
>> +                continue;
>> +
>> +            init_rcu_head(&rcu[i].head);
>> +            init_completion(&rcu[i].completion);
>> +            call_srcu(q->srcu, &rcu[i].head, wakeme_after_rcu);
>> +            i++;
>> +        }
>> +
>> +        for (i = 0; i < count; i++) {
>> +            wait_for_completion(&rcu[i].completion);
>> +            destroy_rcu_head(&rcu[i].head);
>> +        }
>> +        kvfree(rcu);
>> +    } else {
>> +        list_for_each_entry(q, &set->tag_list, tag_set_list)
>> +            synchronize_srcu(q->srcu);
>> +    }
>> +}
>> +
>> +static void blk_mq_quiesce_nonblocking_tagset(struct blk_mq_tag_set *set)
>> +{
>> +    struct request_queue *q;
>> +
>> +    list_for_each_entry(q, &set->tag_list, tag_set_list) {
>> +        if (blk_queue_noquiesced(q))
>> +            continue;
>> +
>> +        blk_mq_quiesce_queue_nowait(q);
>> +    }
>> +    synchronize_rcu();
>> +}
>> +
>> +void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set)
>> +{
>> +    mutex_lock(&set->tag_list_lock);
>> +    if (set->flags & BLK_MQ_F_BLOCKING)
>> +        blk_mq_quiesce_blocking_tagset(set);
>> +    else
>> +        blk_mq_quiesce_nonblocking_tagset(set);
>> +
>> +    mutex_unlock(&set->tag_list_lock);
>> +}
>> +EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset);
>> +
>> +void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set)
>> +{
>> +    struct request_queue *q;
>> +
>> +    mutex_lock(&set->tag_list_lock);
>> +    list_for_each_entry(q, &set->tag_list, tag_set_list)
>> +        blk_mq_unquiesce_queue(q);
>> +    mutex_unlock(&set->tag_list_lock);
>> +}
>> +EXPORT_SYMBOL_GPL(blk_mq_unquiesce_tagset);
>> +
>>   void blk_mq_wake_waiters(struct request_queue *q)
>>   {
>>       struct blk_mq_hw_ctx *hctx;
>> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
>> index ba18e9bdb799..1df47606d0a7 100644
>> --- a/include/linux/blk-mq.h
>> +++ b/include/linux/blk-mq.h
>> @@ -877,6 +877,8 @@ void blk_mq_start_hw_queues(struct request_queue *q);
>>   void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
>>   void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
>>   void blk_mq_quiesce_queue(struct request_queue *q);
>> +void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set);
>> +void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set);
>>   void blk_mq_wait_quiesce_done(struct request_queue *q);
>>   void blk_mq_unquiesce_queue(struct request_queue *q);
>>   void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
>> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
>> index 50e358a19d98..f15544299a67 100644
>> --- a/include/linux/blkdev.h
>> +++ b/include/linux/blkdev.h
>> @@ -579,6 +579,7 @@ struct request_queue {
>>   #define QUEUE_FLAG_HCTX_ACTIVE    28    /* at least one blk-mq hctx is active */
>>   #define QUEUE_FLAG_NOWAIT       29    /* device supports NOWAIT */
>>   #define QUEUE_FLAG_SQ_SCHED     30    /* single queue style io dispatch */
>> +#define QUEUE_FLAG_NOQUIESCED    31    /* queue is never quiesced */
> 
> the comment is misleading. If this truly is a queue that is never
> quiescing then blk_mq_quiesce_queue() and friends need to skip it.
> 
> I'd call it self_quiesce or something that would reflect that it is
> black-listed from tagset-wide quiesce.
Yes, you are right. I will modify the comment in patch V3.
> .


* Re: [PATCH v2 0/2] improve nvme quiesce time for large amount of namespaces
  2022-10-13 14:32 ` [PATCH v2 0/2] improve nvme quiesce time for large amount of namespaces Chaitanya Kulkarni
@ 2022-10-14  2:12   ` Chao Leng
  2022-10-15  0:30     ` Chaitanya Kulkarni
  0 siblings, 1 reply; 40+ messages in thread
From: Chao Leng @ 2022-10-14  2:12 UTC (permalink / raw)
  To: Chaitanya Kulkarni, linux-nvme, linux-block
  Cc: hch, sagi, kbusch, ming.lei, axboe



On 2022/10/13 22:32, Chaitanya Kulkarni wrote:
> On 10/13/22 02:44, Chao Leng wrote:
>> This set improves the quiesce time when using a large set of
>> namespaces, which also improves I/O failover time in a multipath environment.
>>
> 
> By how much, and what problems exist with the current timing, needs to
> be documented, i.e. add quantitative data if you are posting patches for
> time/performance improvement.
Theoretically, every synchronize_rcu/synchronize_srcu needs to wait more
than 10ms. Right now nvme quiesces the queue of each namespace one by
one; the more namespaces, the longer the waiting time.

The test result:
Test scenario: nvme over roce with multipathing, 256 namespaces.
When sending I/Os to all namespaces and simulating a path fault, the
failover waiting time is more than 20 seconds.
After analysis, it was found that the total quiesce waiting time for the
256 namespaces is more than 20 seconds.
After optimization, in the same scenario the failover waiting time is
less than 1 second.
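
(For scale: 20 s / 256 namespaces is roughly 78 ms per serialized SRCU
grace period, consistent with the "more than 10ms" floor above; the
parallel version waits for about one grace period in total, regardless
of the namespace count.)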
> 
> -ck
> 


* Re: [PATCH v2 0/2] improve nvme quiesce time for large amount of namespaces
  2022-10-14  2:12   ` Chao Leng
@ 2022-10-15  0:30     ` Chaitanya Kulkarni
  0 siblings, 0 replies; 40+ messages in thread
From: Chaitanya Kulkarni @ 2022-10-15  0:30 UTC (permalink / raw)
  To: Chao Leng; +Cc: hch, linux-nvme, linux-block, sagi, kbusch, ming.lei, axboe


> The test result:
> Test scenario: nvme over roce with multipathing, 256 namespaces.
> When sending I/Os to all namespaces and simulating a path fault, the
> failover waiting time is more than 20 seconds.
> After analysis, it was found that the total quiesce waiting time for the
> 256 namespaces is more than 20 seconds.
> After optimization, in the same scenario the failover waiting time is
> less than 1 second.

I'm not questioning what you are doing; please add these to the
cover letter or patches for reference.

-ck



* Re: [PATCH v2 1/2] blk-mq: add tagset quiesce interface
  2022-10-13  9:44 ` [PATCH v2 1/2] blk-mq: add tagset quiesce interface Chao Leng
  2022-10-13 10:28   ` Sagi Grimberg
@ 2022-10-17 13:39   ` Christoph Hellwig
  2022-10-17 13:42     ` Christoph Hellwig
                       ` (2 more replies)
  1 sibling, 3 replies; 40+ messages in thread
From: Christoph Hellwig @ 2022-10-17 13:39 UTC (permalink / raw)
  To: Chao Leng
  Cc: linux-nvme, linux-block, hch, sagi, kbusch, ming.lei, axboe,
	Paul E. McKenney

On Thu, Oct 13, 2022 at 05:44:49PM +0800, Chao Leng wrote:
> +	rcu = kvmalloc(count * sizeof(*rcu), GFP_KERNEL);
> +	if (rcu) {
> +		list_for_each_entry(q, &set->tag_list, tag_set_list) {
> +			if (blk_queue_noquiesced(q))
> +				continue;
> +
> +			init_rcu_head(&rcu[i].head);
> +			init_completion(&rcu[i].completion);
> +			call_srcu(q->srcu, &rcu[i].head, wakeme_after_rcu);
> +			i++;
> +		}
> +
> +		for (i = 0; i < count; i++) {
> +			wait_for_completion(&rcu[i].completion);
> +			destroy_rcu_head(&rcu[i].head);
> +		}
> +		kvfree(rcu);
> +	} else {
> +		list_for_each_entry(q, &set->tag_list, tag_set_list)
> +			synchronize_srcu(q->srcu);
> +	}

Having to allocate a struct rcu_synchronize for each of the potentially
many queues here is a bit sad.

Paul just explained the start_poll_synchronize_rcu interfaces at ALPSS
last week, so I wonder if something like that would also be feasible
for SRCU, as that would come in really handy here.


* Re: [PATCH v2 1/2] blk-mq: add tagset quiesce interface
  2022-10-17 13:39   ` Christoph Hellwig
@ 2022-10-17 13:42     ` Christoph Hellwig
  2022-10-18  8:39       ` Sagi Grimberg
  2022-10-18  9:52       ` Chao Leng
  2022-10-17 15:21     ` Paul E. McKenney
  2022-10-18  9:52     ` Chao Leng
  2 siblings, 2 replies; 40+ messages in thread
From: Christoph Hellwig @ 2022-10-17 13:42 UTC (permalink / raw)
  To: Chao Leng
  Cc: linux-nvme, linux-block, hch, sagi, kbusch, ming.lei, axboe,
	Paul E. McKenney

On Mon, Oct 17, 2022 at 03:39:06PM +0200, Christoph Hellwig wrote:
> Having to allocate a struct rcu_synchronize for each of the potentially
> many queues here is a bit sad.
> 
> Paul just explained the start_poll_synchronize_rcu interfaces at ALPSS
> last week, so I wonder if something like that would also be feasible
> for SRCU, as that would come in really handy here.

Alternatively I wonder if we could simply use a single srcu_struct
in the tag_set instead of one per-queue.  That would potentially
increase the number of read side critical sections
blk_mq_wait_quiesce_done would have to wait for in tagsets with
multiple queues, but I wonder how much overhead that would be in
practice.
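
A rough sketch of the blocking case under that idea (assuming a
hypothetical srcu_struct embedded in struct blk_mq_tag_set, which does
not exist at this point):

	static void blk_mq_quiesce_blocking_tagset(struct blk_mq_tag_set *set)
	{
		struct request_queue *q;

		list_for_each_entry(q, &set->tag_list, tag_set_list) {
			if (blk_queue_noquiesced(q))
				continue;
			blk_mq_quiesce_queue_nowait(q);
		}
		/* one SRCU grace period covers every queue in the set */
		synchronize_srcu(&set->srcu);
	}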


* Re: [PATCH v2 1/2] blk-mq: add tagset quiesce interface
  2022-10-13 10:28   ` Sagi Grimberg
  2022-10-14  2:09     ` Chao Leng
@ 2022-10-17 13:43     ` Christoph Hellwig
  2022-10-18  9:51       ` Chao Leng
  1 sibling, 1 reply; 40+ messages in thread
From: Christoph Hellwig @ 2022-10-17 13:43 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Chao Leng, linux-nvme, linux-block, hch, kbusch, ming.lei, axboe

On Thu, Oct 13, 2022 at 01:28:37PM +0300, Sagi Grimberg wrote:
>> Because some queues never need to be quiesced (e.g. the nvme
>> connect_q), introduce QUEUE_FLAG_NOQUIESCED so the tagset quiesce
>> interface skips them.
>
> I wouldn't say it never quiesces nor ever will; we just don't happen to
> quiesce it today...

Yes.  It really is a QUEUE_FLAG_SKIP_TAGSET_QUIESCE.



* Re: [PATCH v2 2/2] nvme: use blk_mq_[un]quiesce_tagset
  2022-10-13 10:22   ` Sagi Grimberg
  2022-10-14  2:09     ` Chao Leng
@ 2022-10-17 13:48     ` Christoph Hellwig
  1 sibling, 0 replies; 40+ messages in thread
From: Christoph Hellwig @ 2022-10-17 13:48 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Chao Leng, linux-nvme, linux-block, hch, kbusch, ming.lei, axboe

> Something like this:

That definitely seems like a sensible prep patch.

In the long run NVME_CTRL_STOPPED probably needs to be integrated
with the controller state machine, and moving it from the ns to
the ctrl is a good step towards that.


* Re: [PATCH v2 1/2] blk-mq: add tagset quiesce interface
  2022-10-17 13:39   ` Christoph Hellwig
  2022-10-17 13:42     ` Christoph Hellwig
@ 2022-10-17 15:21     ` Paul E. McKenney
  2022-10-17 15:31       ` Christoph Hellwig
  2022-10-18  9:52       ` Chao Leng
  2022-10-18  9:52     ` Chao Leng
  2 siblings, 2 replies; 40+ messages in thread
From: Paul E. McKenney @ 2022-10-17 15:21 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Chao Leng, linux-nvme, linux-block, sagi, kbusch, ming.lei, axboe

On Mon, Oct 17, 2022 at 03:39:06PM +0200, Christoph Hellwig wrote:
> On Thu, Oct 13, 2022 at 05:44:49PM +0800, Chao Leng wrote:
> > +	rcu = kvmalloc(count * sizeof(*rcu), GFP_KERNEL);
> > +	if (rcu) {
> > +		list_for_each_entry(q, &set->tag_list, tag_set_list) {
> > +			if (blk_queue_noquiesced(q))
> > +				continue;
> > +
> > +			init_rcu_head(&rcu[i].head);
> > +			init_completion(&rcu[i].completion);
> > +			call_srcu(q->srcu, &rcu[i].head, wakeme_after_rcu);
> > +			i++;
> > +		}
> > +
> > +		for (i = 0; i < count; i++) {
> > +			wait_for_completion(&rcu[i].completion);
> > +			destroy_rcu_head(&rcu[i].head);
> > +		}
> > +		kvfree(rcu);
> > +	} else {
> > +		list_for_each_entry(q, &set->tag_list, tag_set_list)
> > +			synchronize_srcu(q->srcu);
> > +	}
> 
> Having to allocate a struct rcu_synchronize for each of the potentially
> many queues here is a bit sad.
> 
> Paul just explained the start_poll_synchronize_rcu interfaces at ALPSS
> last week, so I wonder if something like that would also be feasible
> for SRCU, as that would come in really handy here.

There is start_poll_synchronize_srcu() and poll_state_synchronize_srcu(),
but there would need to be an unsigned long for each srcu_struct from
which an SRCU grace period was required.  This would be half the size
of the "rcu" array above, but still maybe larger than you would like.

The resulting code might look something like this, with "rcu" now being
a pointer to unsigned long:

	rcu = kvmalloc(count * sizeof(*rcu), GFP_KERNEL);
	if (rcu) {
		list_for_each_entry(q, &set->tag_list, tag_set_list) {
			if (blk_queue_noquiesced(q))
				continue;
			rcu[i] = start_poll_synchronize_srcu(q->srcu);
			i++;
		}

		for (i = 0; i < count; i++)
			if (!poll_state_synchronize_srcu(q->srcu))
				synchronize_srcu(q->srcu);
		kvfree(rcu);
	} else {
		list_for_each_entry(q, &set->tag_list, tag_set_list)
			synchronize_srcu(q->srcu);
	}

Or as Christoph suggested, just have a single srcu_struct for the
whole group.

The main reason for having multiple srcu_struct structures is to
prevent the readers from one from holding up the updaters from another.
Except that by waiting for the multiple grace periods, you are losing
that property anyway, correct?  Or is this code waiting on only a small
fraction of the srcu_struct structures associated with blk_queue?

							Thanx, Paul


* Re: [PATCH v2 1/2] blk-mq: add tagset quiesce interface
  2022-10-17 15:21     ` Paul E. McKenney
@ 2022-10-17 15:31       ` Christoph Hellwig
  2022-10-17 22:41         ` Paul E. McKenney
  2022-10-18  9:52       ` Chao Leng
  1 sibling, 1 reply; 40+ messages in thread
From: Christoph Hellwig @ 2022-10-17 15:31 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Christoph Hellwig, Chao Leng, linux-nvme, linux-block, sagi,
	kbusch, ming.lei, axboe

On Mon, Oct 17, 2022 at 08:21:36AM -0700, Paul E. McKenney wrote:
> The main reason for having multiple srcu_struct structures is to
> prevent the readers from one from holding up the updaters from another.
> Except that by waiting for the multiple grace periods, you are losing
> that property anyway, correct?  Or is this code waiting on only a small
> fraction of the srcu_struct structures associated with blk_queue?

There are a few places that do this.  SCSI and MMC could probably switch
to this scheme or at least an open-coded version of it (if we move
to a per-tagset srcu_struct, open coding it might be a better idea
anyway).

del_gendisk is one where we have to go one queue at a time, and
that might actually matter for a device removal.  elevator_switch
is another one; there is a variant of it that works on the whole
tagset, but also ones that don't.


* Re: [PATCH v2 1/2] blk-mq: add tagset quiesce interface
  2022-10-17 15:31       ` Christoph Hellwig
@ 2022-10-17 22:41         ` Paul E. McKenney
  2022-10-18  5:19           ` Christoph Hellwig
  0 siblings, 1 reply; 40+ messages in thread
From: Paul E. McKenney @ 2022-10-17 22:41 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Chao Leng, linux-nvme, linux-block, sagi, kbusch, ming.lei, axboe

On Mon, Oct 17, 2022 at 05:31:05PM +0200, Christoph Hellwig wrote:
> On Mon, Oct 17, 2022 at 08:21:36AM -0700, Paul E. McKenney wrote:
> > The main reason for having multiple srcu_struct structures is to
> > prevent the readers from one from holding up the updaters from another.
> > Except that by waiting for the multiple grace periods, you are losing
> > that property anyway, correct?  Or is this code waiting on only a small
> > fraction of the srcu_struct structures associated with blk_queue?
> 
> There are a few places that do this.  SCSI and MMC could probably switch
> to this scheme or at least an open-coded version of it (if we move
> to a per-tagset srcu_struct, open coding it might be a better idea
> anyway).
> 
> del_gendisk is one where we have to go one queue at a time, and
> that might actually matter for a device removal.  elevator_switch
> is another one; there is a variant of it that works on the whole
> tagset, but also ones that don't.

OK, thank you for the info!

Then the big question is "how long do the SRCU readers run?"

If all of the readers ran for exactly the same duration, there would be
little point in having more than one srcu_struct.

If the kernel knew up front how long the SRCU readers for a given entry
would run, it could provide an srcu_struct structure for each duration.
For a (fanciful) example, you could have one srcu_struct structure for
SSDs, another for rotating rust, a third for LAN-attached storage, and
a fourth for WAN-attached storage.  Maybe a fifth for lunar-based storage.

If it is impossible to know up front which entry has what SRCU read-side
latency characteristics, then the current approach of just giving each
entry its own srcu_struct structure is not at all a bad plan.

Does that help, or am I off in the weeds here?

							Thanx, Paul


* Re: [PATCH v2 1/2] blk-mq: add tagset quiesce interface
  2022-10-17 22:41         ` Paul E. McKenney
@ 2022-10-18  5:19           ` Christoph Hellwig
  2022-10-19  0:35             ` Ming Lei
  0 siblings, 1 reply; 40+ messages in thread
From: Christoph Hellwig @ 2022-10-18  5:19 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Christoph Hellwig, Chao Leng, linux-nvme, linux-block, sagi,
	kbusch, ming.lei, axboe

On Mon, Oct 17, 2022 at 03:41:15PM -0700, Paul E. McKenney wrote:
> Then the big question is "how long do the SRCU readers run?"
> 
> If all of the readers ran for exactly the same duration, there would be
> little point in having more than one srcu_struct.

The SRCU readers are the I/O dispatch, which will have quite similar
runtimes for the different queues.

> If the kernel knew up front how long the SRCU readers for a given entry
> would run, it could provide an srcu_struct structure for each duration.
> For a (fanciful) example, you could have one srcu_struct structure for
> SSDs, another for rotating rust, a third for LAN-attached storage, and
> a fourth for WAN-attached storage.  Maybe a fifth for lunar-based storage.

All the different request_queues in a tag_set are for the same device.
There might be some corner cases like storage arrays where they have
different latencies.  But we're not even waiting for the I/O completion
here, this just protects the submission.

> Does that help, or am I off in the weeds here?

I think this was very helpful, and at least moving the srcu_struct
to the tag_set sounds like a good idea to explore.

Ming, anything I might have missed?


* Re: [PATCH v2 1/2] blk-mq: add tagset quiesce interface
  2022-10-17 13:42     ` Christoph Hellwig
@ 2022-10-18  8:39       ` Sagi Grimberg
  2022-10-18  8:55         ` Christoph Hellwig
  2022-10-18  9:52       ` Chao Leng
  1 sibling, 1 reply; 40+ messages in thread
From: Sagi Grimberg @ 2022-10-18  8:39 UTC (permalink / raw)
  To: Christoph Hellwig, Chao Leng
  Cc: linux-nvme, linux-block, kbusch, ming.lei, axboe, Paul E. McKenney


>> Having to allocate a struct rcu_synchronize for each of the potentially
>> many queues here is a bit sad.
>>
>> Paul just explained the start_poll_synchronize_rcu interfaces at ALPSS
>> last week, so I wonder if something like that would also be feasible
>> for SRCU, as that would come in really handy here.
> 
> Alternatively I wonder if we could simply use a single srcu_struct
> in the tag_set instead of one per-queue.  That would potentially
> increase the number of read side critical sections
> blk_mq_wait_quiesce_done would have to wait for in tagsets with
> multiple queues, but I wonder how much overhead that would be in
> practice.

It used to be on the hctx actually. It was moved to the request_queue
in an attempt at parallel quiesce that never eventually materialized,
afaict.

For nvme, we never really quiesce a single namespace, so it will be a
net positive IMO. I also think that quiesce is really more a tagset
operation, so I would think that it's a more natural location.

The only question in my mind is regarding patch 2/2 with the subtle
ordering of nvme_set_queue_dying...


* Re: [PATCH v2 1/2] blk-mq: add tagset quiesce interface
  2022-10-18  8:39       ` Sagi Grimberg
@ 2022-10-18  8:55         ` Christoph Hellwig
  2022-10-18  9:06           ` Sagi Grimberg
  0 siblings, 1 reply; 40+ messages in thread
From: Christoph Hellwig @ 2022-10-18  8:55 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Christoph Hellwig, Chao Leng, linux-nvme, linux-block, kbusch,
	ming.lei, axboe

On Tue, Oct 18, 2022 at 11:39:30AM +0300, Sagi Grimberg wrote:
> The only question in my mind is regarding patch 2/2 with the subtle
> ordering of nvme_set_queue_dying...

I think we need to sort that out as a pre-patch.  There is quite some
history there; there was a fair amount of deadlock potential earlier, but
I think I sorted quite a lot of it out, including a new lock for setting
the gendisk size, which is what the comment refers to.

Something like this:

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 059737c1a2c19..cb7623b5d2c8b 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -5105,21 +5105,21 @@ static void nvme_stop_ns_queue(struct nvme_ns *ns)
 
 /*
  * Prepare a queue for teardown.
- *
- * This must forcibly unquiesce queues to avoid blocking dispatch, and only set
- * the capacity to 0 after that to avoid blocking dispatchers that may be
- * holding bd_butex.  This will end buffered writers dirtying pages that can't
- * be synced.
  */
 static void nvme_set_queue_dying(struct nvme_ns *ns)
 {
 	if (test_and_set_bit(NVME_NS_DEAD, &ns->flags))
 		return;
 
+	/*
+	 * Mark the disk dead to prevent new opens, and set the capacity to 0
+	 * to end buffered writers dirtying pages that can't be synced.
+	 */
 	blk_mark_disk_dead(ns->disk);
-	nvme_start_ns_queue(ns);
-
 	set_capacity_and_notify(ns->disk, 0);
+
+	/* forcibly unquiesce queues to avoid blocking dispatch */
+	nvme_start_ns_queue(ns);
 }
 
 /**


* Re: [PATCH v2 1/2] blk-mq: add tagset quiesce interface
  2022-10-18  8:55         ` Christoph Hellwig
@ 2022-10-18  9:06           ` Sagi Grimberg
  2022-10-18 11:05             ` Christoph Hellwig
  0 siblings, 1 reply; 40+ messages in thread
From: Sagi Grimberg @ 2022-10-18  9:06 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Chao Leng, linux-nvme, linux-block, kbusch, ming.lei, axboe


>> The only question in my mind is regarding patch 2/2 with the subtle
>> ordering of nvme_set_queue_dying...
> 
> I think we need to sort that out as a pre-patch.  There is quite some
> history there; there was a fair amount of deadlock potential earlier, but
> I think I sorted quite a lot of it out, including a new lock for setting
> the gendisk size, which is what the comment refers to.
> 
> Something like this:
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 059737c1a2c19..cb7623b5d2c8b 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -5105,21 +5105,21 @@ static void nvme_stop_ns_queue(struct nvme_ns *ns)
>   
>   /*
>    * Prepare a queue for teardown.
> - *
> - * This must forcibly unquiesce queues to avoid blocking dispatch, and only set
> - * the capacity to 0 after that to avoid blocking dispatchers that may be
> - * holding bd_butex.  This will end buffered writers dirtying pages that can't
> - * be synced.
>    */
>   static void nvme_set_queue_dying(struct nvme_ns *ns)
>   {
>   	if (test_and_set_bit(NVME_NS_DEAD, &ns->flags))
>   		return;
>   
> +	/*
> +	 * Mark the disk dead to prevent new opens, and set the capacity to 0
> +	 * to end buffered writers dirtying pages that can't be synced.
> +	 */
>   	blk_mark_disk_dead(ns->disk);
> -	nvme_start_ns_queue(ns);
> -
>   	set_capacity_and_notify(ns->disk, 0);
> +
> +	/* forcibly unquiesce queues to avoid blocking dispatch */
> +	nvme_start_ns_queue(ns);
>   }

If we no longer have this ordering requirement, then I don't see why
the unquiesce cannot move before or after nvme_set_queue_dying, so we
can apply a tagset-wide quiesce/unquiesce...
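
A minimal sketch of that shape, combining the pre-patch above with the
earlier suggestion (not a definitive implementation):

	void nvme_kill_queues(struct nvme_ctrl *ctrl)
	{
		struct nvme_ns *ns;

		down_read(&ctrl->namespaces_rwsem);

		/* Forcibly unquiesce queues to avoid blocking dispatch */
		if (ctrl->admin_q && !blk_queue_dying(ctrl->admin_q))
			nvme_start_admin_queue(ctrl);

		/* tagset-wide unquiesce replaces the per-namespace unquiesce */
		nvme_start_queues(ctrl);

		list_for_each_entry(ns, &ctrl->namespaces, list)
			nvme_set_queue_dying(ns);

		up_read(&ctrl->namespaces_rwsem);
	}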


* Re: [PATCH v2 1/2] blk-mq: add tagset quiesce interface
  2022-10-17 13:43     ` Christoph Hellwig
@ 2022-10-18  9:51       ` Chao Leng
  0 siblings, 0 replies; 40+ messages in thread
From: Chao Leng @ 2022-10-18  9:51 UTC (permalink / raw)
  To: Christoph Hellwig, Sagi Grimberg
  Cc: linux-nvme, linux-block, kbusch, ming.lei, axboe



On 2022/10/17 21:43, Christoph Hellwig wrote:
> On Thu, Oct 13, 2022 at 01:28:37PM +0300, Sagi Grimberg wrote:
>>> Because some queues never need to be quiesced (e.g. the nvme
>>> connect_q), introduce QUEUE_FLAG_NOQUIESCED so the tagset quiesce
>>> interface skips them.
>>
>> I wouldn't say it never quiesces nor ever will; we just don't happen to
>> quiesce it today...
> 
> Yes.  It really is a QUEUE_FLAG_SKIP_TAGSET_QUIESCE.
I will modify it in patch V3.
> 
> 
> .
> 


* Re: [PATCH v2 1/2] blk-mq: add tagset quiesce interface
  2022-10-17 15:21     ` Paul E. McKenney
  2022-10-17 15:31       ` Christoph Hellwig
@ 2022-10-18  9:52       ` Chao Leng
  2022-10-18 15:04         ` Paul E. McKenney
  1 sibling, 1 reply; 40+ messages in thread
From: Chao Leng @ 2022-10-18  9:52 UTC (permalink / raw)
  To: paulmck, Christoph Hellwig
  Cc: linux-nvme, linux-block, sagi, kbusch, ming.lei, axboe



On 2022/10/17 23:21, Paul E. McKenney wrote:
> On Mon, Oct 17, 2022 at 03:39:06PM +0200, Christoph Hellwig wrote:
>> On Thu, Oct 13, 2022 at 05:44:49PM +0800, Chao Leng wrote:
>>> +	rcu = kvmalloc(count * sizeof(*rcu), GFP_KERNEL);
>>> +	if (rcu) {
>>> +		list_for_each_entry(q, &set->tag_list, tag_set_list) {
>>> +			if (blk_queue_noquiesced(q))
>>> +				continue;
>>> +
>>> +			init_rcu_head(&rcu[i].head);
>>> +			init_completion(&rcu[i].completion);
>>> +			call_srcu(q->srcu, &rcu[i].head, wakeme_after_rcu);
>>> +			i++;
>>> +		}
>>> +
>>> +		for (i = 0; i < count; i++) {
>>> +			wait_for_completion(&rcu[i].completion);
>>> +			destroy_rcu_head(&rcu[i].head);
>>> +		}
>>> +		kvfree(rcu);
>>> +	} else {
>>> +		list_for_each_entry(q, &set->tag_list, tag_set_list)
>>> +			synchronize_srcu(q->srcu);
>>> +	}
>>
>> Having to allocate a struct rcu_synchronize for each of the potentially
>> many queues here is a bit sad.
>>
> Paul just explained the start_poll_synchronize_rcu interfaces at ALPSS
>> last week, so I wonder if something like that would also be feasible
>> for SRCU, as that would come in really handy here.
> 
> There is start_poll_synchronize_srcu() and poll_state_synchronize_srcu(),
> but there would need to be an unsigned long for each srcu_struct from
> which an SRCU grace period was required.  This would be half the size
> of the "rcu" array above, but still maybe larger than you would like.
> 
> The resulting code might look something like this, with "rcu" now being
> a pointer to unsigned long:
> 
> 	rcu = kvmalloc(count * sizeof(*rcu), GFP_KERNEL);
> 	if (rcu) {
> 		list_for_each_entry(q, &set->tag_list, tag_set_list) {
> 			if (blk_queue_noquiesced(q))
> 				continue;
> 			rcu[i] = start_poll_synchronize_srcu(q->srcu);
> 			i++;
> 		}
> 
> 		for (i = 0; i < count; i++)
> 			if (!poll_state_synchronize_srcu(q->srcu, rcu[i]))
> 				synchronize_srcu(q->srcu);
synchronize_srcu would start a new grace period.
Maybe it would be better like this:
			while (!poll_state_synchronize_srcu(q->srcu, rcu[i]))
				schedule_timeout_uninterruptible(1);
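
Putting the two ideas together, an end-to-end sketch might look like the
following (a sketch only, not posted code; blk_queue_noquiesced() is the
v2 helper, and the second walk re-traverses tag_list so that each cookie
is polled against the srcu_struct it came from, since reusing a stale "q"
as above would check the wrong queue):

	static void blk_mq_quiesce_blocking_tagset(struct blk_mq_tag_set *set)
	{
		struct request_queue *q;
		unsigned long *cookies;
		int count = 0, i = 0;

		list_for_each_entry(q, &set->tag_list, tag_set_list)
			if (!blk_queue_noquiesced(q))
				count++;

		cookies = kvmalloc_array(count, sizeof(*cookies), GFP_KERNEL);
		if (!cookies) {
			/* fallback: wait out each queue serially */
			list_for_each_entry(q, &set->tag_list, tag_set_list)
				if (!blk_queue_noquiesced(q))
					synchronize_srcu(q->srcu);
			return;
		}

		/* kick off all grace periods without waiting */
		list_for_each_entry(q, &set->tag_list, tag_set_list)
			if (!blk_queue_noquiesced(q))
				cookies[i++] = start_poll_synchronize_srcu(q->srcu);

		/* poll each cookie against its own srcu_struct */
		i = 0;
		list_for_each_entry(q, &set->tag_list, tag_set_list) {
			if (blk_queue_noquiesced(q))
				continue;
			while (!poll_state_synchronize_srcu(q->srcu, cookies[i]))
				schedule_timeout_uninterruptible(1);
			i++;
		}
		kvfree(cookies);
	}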
> 		kvfree(rcu);
> 	} else {
> 		list_for_each_entry(q, &set->tag_list, tag_set_list)
> 			synchronize_srcu(q->srcu);
> 	}
> 
> Or as Christoph suggested, just have a single srcu_struct for the
> whole group.
> 
> The main reason for having multiple srcu_struct structures is to
> prevent the readers from one from holding up the updaters from another.
> Except that by waiting for the multiple grace periods, you are losing
> that property anyway, correct?  Or is this code waiting on only a small
> fraction of the srcu_struct structures associated with blk_queue?
> 
> 							Thanx, Paul
> .
> 

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v2 1/2] blk-mq: add tagset quiesce interface
  2022-10-17 13:39   ` Christoph Hellwig
  2022-10-17 13:42     ` Christoph Hellwig
  2022-10-17 15:21     ` Paul E. McKenney
@ 2022-10-18  9:52     ` Chao Leng
  2 siblings, 0 replies; 40+ messages in thread
From: Chao Leng @ 2022-10-18  9:52 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-nvme, linux-block, sagi, kbusch, ming.lei, axboe, Paul E. McKenney



On 2022/10/17 21:39, Christoph Hellwig wrote:
> On Thu, Oct 13, 2022 at 05:44:49PM +0800, Chao Leng wrote:
>> +	rcu = kvmalloc(count * sizeof(*rcu), GFP_KERNEL);
>> +	if (rcu) {
>> +		list_for_each_entry(q, &set->tag_list, tag_set_list) {
>> +			if (blk_queue_noquiesced(q))
>> +				continue;
>> +
>> +			init_rcu_head(&rcu[i].head);
>> +			init_completion(&rcu[i].completion);
>> +			call_srcu(q->srcu, &rcu[i].head, wakeme_after_rcu);
>> +			i++;
>> +		}
>> +
>> +		for (i = 0; i < count; i++) {
>> +			wait_for_completion(&rcu[i].completion);
>> +			destroy_rcu_head(&rcu[i].head);
>> +		}
>> +		kvfree(rcu);
>> +	} else {
>> +		list_for_each_entry(q, &set->tag_list, tag_set_list)
>> +			synchronize_srcu(q->srcu);
>> +	}
> 
> Having to allocate a struct rcu_synchronize for each of the potentially
> many queues here is a bit sad.
> 
> Paul just explained the start_poll_synchronize_rcu interfaces at ALPSS
> last week, so I wonder if something like that would also be feasible
> for SRCU, as that would come in really handy here.
Using start_poll_synchronize_srcu would reduce some memory usage.
This is a good idea.
> 
> .
> 

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v2 1/2] blk-mq: add tagset quiesce interface
  2022-10-17 13:42     ` Christoph Hellwig
  2022-10-18  8:39       ` Sagi Grimberg
@ 2022-10-18  9:52       ` Chao Leng
  1 sibling, 0 replies; 40+ messages in thread
From: Chao Leng @ 2022-10-18  9:52 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-nvme, linux-block, sagi, kbusch, ming.lei, axboe, Paul E. McKenney



On 2022/10/17 21:42, Christoph Hellwig wrote:
> On Mon, Oct 17, 2022 at 03:39:06PM +0200, Christoph Hellwig wrote:
>> Having to allocate a struct rcu_synchronize for each of the potentially
>> many queues here is a bit sad.
>>
>> Paul just explained the start_poll_synchronize_rcu interfaces at ALPSS
>> last week, so I wonder if something like that would also be feasible
>> for SRCU, as that would come in really handy here.
> 
> Alternatively I wonder if we could simply use a single srcu_struct
> in the tag_set instead of one per queue.  That would potentially
> increase the number of read-side critical sections
> blk_mq_wait_quiesce_done would have to wait for in tagsets with
> multiple queues, but I wonder how much overhead that would be in
> practice.
Request submission is currently per-queue, so it may be difficult to use
a single srcu_struct in the tag_set instead of one per queue.
Using start_poll_synchronize_srcu would still need an unsigned long array.
> 
> .
> 

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v2 1/2] blk-mq: add tagset quiesce interface
  2022-10-18  9:06           ` Sagi Grimberg
@ 2022-10-18 11:05             ` Christoph Hellwig
  0 siblings, 0 replies; 40+ messages in thread
From: Christoph Hellwig @ 2022-10-18 11:05 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Christoph Hellwig, Chao Leng, linux-nvme, linux-block, kbusch,
	ming.lei, axboe

On Tue, Oct 18, 2022 at 12:06:41PM +0300, Sagi Grimberg wrote:
>>   +	/*
>> +	 * Mark the disk dead to prevent new opens, and set the capacity to 0
>> +	 * to end buffered writers dirtying pages that can't be synced.
>> +	 */
>>   	blk_mark_disk_dead(ns->disk);
>> -	nvme_start_ns_queue(ns);
>> -
>>   	set_capacity_and_notify(ns->disk, 0);
>> +
>> +	/* forcibly unquiesce queues to avoid blocking dispatch */
>> +	nvme_start_ns_queue(ns);
>>   }
>
> If we no longer have this ordering requirement, then I don't see why
> the unquiesce cannot move before or after nvme_set_queue_dying so that
> we can apply a tagset-wide quiesce/unquiesce...

Yes.  After this patch we can simply do the tagset-wide unquiesce
after the loop calling blk_mark_disk_dead and set_capacity_and_notify.

We can then also move the NS_DEAD flag to the controller..
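
For illustration, the teardown path could then end up roughly as below
(a sketch assuming the blk_mq_unquiesce_tagset interface from patch 1;
admin queue handling is omitted and this is not the posted code):

	static void nvme_set_queue_dying(struct nvme_ns *ns)
	{
		if (test_and_set_bit(NVME_NS_DEAD, &ns->flags))
			return;
		blk_mark_disk_dead(ns->disk);
		set_capacity_and_notify(ns->disk, 0);
	}

	void nvme_kill_queues(struct nvme_ctrl *ctrl)
	{
		struct nvme_ns *ns;

		down_read(&ctrl->namespaces_rwsem);
		list_for_each_entry(ns, &ctrl->namespaces, list)
			nvme_set_queue_dying(ns);
		up_read(&ctrl->namespaces_rwsem);

		/* one forcible unquiesce covers all I/O queues */
		blk_mq_unquiesce_tagset(ctrl->tagset);
	}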

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v2 1/2] blk-mq: add tagset quiesce interface
  2022-10-18  9:52       ` Chao Leng
@ 2022-10-18 15:04         ` Paul E. McKenney
  2022-10-19  2:39           ` Chao Leng
  0 siblings, 1 reply; 40+ messages in thread
From: Paul E. McKenney @ 2022-10-18 15:04 UTC (permalink / raw)
  To: Chao Leng
  Cc: Christoph Hellwig, linux-nvme, linux-block, sagi, kbusch,
	ming.lei, axboe

On Tue, Oct 18, 2022 at 05:52:06PM +0800, Chao Leng wrote:
> On 2022/10/17 23:21, Paul E. McKenney wrote:
> > On Mon, Oct 17, 2022 at 03:39:06PM +0200, Christoph Hellwig wrote:
> > > On Thu, Oct 13, 2022 at 05:44:49PM +0800, Chao Leng wrote:
> > > > +	rcu = kvmalloc(count * sizeof(*rcu), GFP_KERNEL);
> > > > +	if (rcu) {
> > > > +		list_for_each_entry(q, &set->tag_list, tag_set_list) {
> > > > +			if (blk_queue_noquiesced(q))
> > > > +				continue;
> > > > +
> > > > +			init_rcu_head(&rcu[i].head);
> > > > +			init_completion(&rcu[i].completion);
> > > > +			call_srcu(q->srcu, &rcu[i].head, wakeme_after_rcu);
> > > > +			i++;
> > > > +		}
> > > > +
> > > > +		for (i = 0; i < count; i++) {
> > > > +			wait_for_completion(&rcu[i].completion);
> > > > +			destroy_rcu_head(&rcu[i].head);
> > > > +		}
> > > > +		kvfree(rcu);
> > > > +	} else {
> > > > +		list_for_each_entry(q, &set->tag_list, tag_set_list)
> > > > +			synchronize_srcu(q->srcu);
> > > > +	}
> > > 
> > > Having to allocate a struct rcu_synchronize for each of the potentially
> > > many queues here is a bit sad.
> > > 
> > > Paul just explained the start_poll_synchronize_rcu interfaces at ALPSS
> > > last week, so I wonder if something like that would also be feasible
> > > for SRCU, as that would come in really handy here.
> > 
> > There is start_poll_synchronize_srcu() and poll_state_synchronize_srcu(),
> > but there would need to be an unsigned long for each srcu_struct from
> > which an SRCU grace period was required.  This would be half the size
> > of the "rcu" array above, but still maybe larger than you would like.
> > 
> > The resulting code might look something like this, with "rcu" now being
> > a pointer to unsigned long:
> > 
> > 	rcu = kvmalloc(count * sizeof(*rcu), GFP_KERNEL);
> > 	if (rcu) {
> > 		list_for_each_entry(q, &set->tag_list, tag_set_list) {
> > 			if (blk_queue_noquiesced(q))
> > 				continue;
> > 			rcu[i] = start_poll_synchronize_srcu(q->srcu);
> > 			i++;
> > 		}
> > 
> > 		for (i = 0; i < count; i++)
> > 			if (!poll_state_synchronize_srcu(q->srcu, rcu[i]))
> > 				synchronize_srcu(q->srcu);
> synchronize_srcu would start a new grace period.

True, but SRCU grace periods normally complete reasonably quickly, so
the synchronize_srcu() might well be faster than the loop, depending on
what the corresponding SRCU readers are doing.

> Maybe it would be better like this:
> 			while (!poll_state_synchronize_srcu(q->srcu, rcu[i]))
> 				schedule_timeout_uninterruptible(1);

Why not try it both ways and see what happens?  Assuming, that is, that
the differences matter in this code.

							Thanx, Paul

> > 		kvfree(rcu);
> > 	} else {
> > 		list_for_each_entry(q, &set->tag_list, tag_set_list)
> > 			synchronize_srcu(q->srcu);
> > 	}
> > 
> > Or as Christoph suggested, just have a single srcu_struct for the
> > whole group.
> > 
> > The main reason for having multiple srcu_struct structures is to
> > prevent the readers from one from holding up the updaters from another.
> > Except that by waiting for the multiple grace periods, you are losing
> > that property anyway, correct?  Or is this code waiting on only a small
> > fraction of the srcu_struct structures associated with blk_queue?
> > 
> > 							Thanx, Paul
> > .
> > 

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v2 1/2] blk-mq: add tagset quiesce interface
  2022-10-18  5:19           ` Christoph Hellwig
@ 2022-10-19  0:35             ` Ming Lei
  2022-10-19  7:15               ` Sagi Grimberg
  0 siblings, 1 reply; 40+ messages in thread
From: Ming Lei @ 2022-10-19  0:35 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Paul E. McKenney, Chao Leng, linux-nvme, linux-block, sagi,
	kbusch, axboe

On Tue, Oct 18, 2022 at 07:19:56AM +0200, Christoph Hellwig wrote:
> On Mon, Oct 17, 2022 at 03:41:15PM -0700, Paul E. McKenney wrote:
> > Then the big question is "how long do the SRCU readers run?"
> > 
> > If all of the readers ran for exactly the same duration, there would be
> > little point in having more than one srcu_struct.
> 
> The SRCU readers are the I/O dispatch, which will have quite similar
> runtimes for the different queues.
> 
> > If the kernel knew up front how long the SRCU readers for a given entry
> > would run, it could provide an srcu_struct structure for each duration.
> > For a (fanciful) example, you could have one srcu_struct structure for
> > SSDs, another for rotating rust, a third for LAN-attached storage, and
> > a fourth for WAN-attached storage.  Maybe a fifth for lunar-based storage.
> 
> All the different request_queues in a tag_set are for the same device.
> There might be some corner cases like storage arrays where they have
> different latencies.  But we're not even waiting for I/O completion
> here; this just protects submission.
> 
> > Does that help, or am I off in the weeds here?
> 
> I think this was very helpful, and at least moving the srcu_struct
> to the tag_set sounds like a good idea to explore.
> 
> Ming, anything I might have missed?

I think it is fine to move it to the tag_set; this could simplify things
a lot.

The effect could be that blk_mq_quiesce_queue() becomes a little
slower, but it is always in the slow path, and synchronize_srcu() won't
wait for new read-side critical sections.

Just one corner case: with BLK_MQ_F_BLOCKING, is there any driver that
may block too long in ->queue_rq(), such as waiting for dozens of
seconds?
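
To make the tag_set option concrete, a schematic could look like this
(set->srcu as a field name, the helper names, and the much-simplified
dispatch path are all assumptions, not posted code):

	/* read side: dispatch on a BLK_MQ_F_BLOCKING hctx */
	static void blk_mq_dispatch_blocking(struct blk_mq_hw_ctx *hctx)
	{
		struct srcu_struct *srcu = hctx->queue->tag_set->srcu;
		int idx;

		idx = srcu_read_lock(srcu);
		blk_mq_sched_dispatch_requests(hctx);
		srcu_read_unlock(srcu, idx);
	}

	/* write side: one grace period covers every queue in the set */
	void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set)
	{
		struct request_queue *q;

		mutex_lock(&set->tag_list_lock);
		list_for_each_entry(q, &set->tag_list, tag_set_list)
			blk_mq_quiesce_queue_nowait(q);
		mutex_unlock(&set->tag_list_lock);

		synchronize_srcu(set->srcu);
	}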


thanks,
Ming


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v2 1/2] blk-mq: add tagset quiesce interface
  2022-10-18 15:04         ` Paul E. McKenney
@ 2022-10-19  2:39           ` Chao Leng
  0 siblings, 0 replies; 40+ messages in thread
From: Chao Leng @ 2022-10-19  2:39 UTC (permalink / raw)
  To: paulmck
  Cc: Christoph Hellwig, linux-nvme, linux-block, sagi, kbusch,
	ming.lei, axboe



On 2022/10/18 23:04, Paul E. McKenney wrote:
> On Tue, Oct 18, 2022 at 05:52:06PM +0800, Chao Leng wrote:
>> On 2022/10/17 23:21, Paul E. McKenney wrote:
>>> On Mon, Oct 17, 2022 at 03:39:06PM +0200, Christoph Hellwig wrote:
>>>> On Thu, Oct 13, 2022 at 05:44:49PM +0800, Chao Leng wrote:
>>>>> +	rcu = kvmalloc(count * sizeof(*rcu), GFP_KERNEL);
>>>>> +	if (rcu) {
>>>>> +		list_for_each_entry(q, &set->tag_list, tag_set_list) {
>>>>> +			if (blk_queue_noquiesced(q))
>>>>> +				continue;
>>>>> +
>>>>> +			init_rcu_head(&rcu[i].head);
>>>>> +			init_completion(&rcu[i].completion);
>>>>> +			call_srcu(q->srcu, &rcu[i].head, wakeme_after_rcu);
>>>>> +			i++;
>>>>> +		}
>>>>> +
>>>>> +		for (i = 0; i < count; i++) {
>>>>> +			wait_for_completion(&rcu[i].completion);
>>>>> +			destroy_rcu_head(&rcu[i].head);
>>>>> +		}
>>>>> +		kvfree(rcu);
>>>>> +	} else {
>>>>> +		list_for_each_entry(q, &set->tag_list, tag_set_list)
>>>>> +			synchronize_srcu(q->srcu);
>>>>> +	}
>>>>
>>>> Having to allocate a struct rcu_synchronize for each of the potentially
>>>> many queues here is a bit sad.
>>>>
>>>> Paul just explained the start_poll_synchronize_rcu interfaces at ALPSS
>>>> last week, so I wonder if something like that would also be feasible
>>>> for SRCU, as that would come in really handy here.
>>>
>>> There is start_poll_synchronize_srcu() and poll_state_synchronize_srcu(),
>>> but there would need to be an unsigned long for each srcu_struct from
>>> which an SRCU grace period was required.  This would be half the size
>>> of the "rcu" array above, but still maybe larger than you would like.
>>>
>>> The resulting code might look something like this, with "rcu" now being
>>> a pointer to unsigned long:
>>>
>>> 	rcu = kvmalloc(count * sizeof(*rcu), GFP_KERNEL);
>>> 	if (rcu) {
>>> 		list_for_each_entry(q, &set->tag_list, tag_set_list) {
>>> 			if (blk_queue_noquiesced(q))
>>> 				continue;
>>> 			rcu[i] = start_poll_synchronize_srcu(q->srcu);
>>> 			i++;
>>> 		}
>>>
>>> 		for (i = 0; i < count; i++)
>>> 			if (!poll_state_synchronize_srcu(q->srcu, rcu[i]))
>>> 				synchronize_srcu(q->srcu);
>> synchronize_srcu would start a new grace period.
> 
> True, but SRCU grace periods normally complete reasonably quickly, so
> the synchronize_srcu() might well be faster than the loop, depending on
> what the corresponding SRCU readers are doing.
Yes. Different runtimes have different wait times, and it's hard to say
which is better or worse.
> 
>> Maybe it would be better like this:
>> 			while (!poll_state_synchronize_srcu(q->srcu, rcu[i]))
>> 				schedule_timeout_uninterruptible(1);
> 
> Why not try it both ways and see what happens?  Assuming that is, that
> the differences matter in this code.
> 
> 							Thanx, Paul
> 
>>> 		kvfree(rcu);
>>> 	} else {
>>> 		list_for_each_entry(q, &set->tag_list, tag_set_list)
>>> 			synchronize_srcu(q->srcu);
>>> 	}
>>>
>>> Or as Christoph suggested, just have a single srcu_struct for the
>>> whole group.
>>>
>>> The main reason for having multiple srcu_struct structures is to
>>> prevent the readers from one from holding up the updaters from another.
>>> Except that by waiting for the multiple grace periods, you are losing
>>> that property anyway, correct?  Or is this code waiting on only a small
>>> fraction of the srcu_struct structures associated with blk_queue?
>>>
>>> 							Thanx, Paul
>>> .
>>>
> .
> 

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v2 1/2] blk-mq: add tagset quiesce interface
  2022-10-19  0:35             ` Ming Lei
@ 2022-10-19  7:15               ` Sagi Grimberg
  2022-10-19  7:25                 ` Christoph Hellwig
  0 siblings, 1 reply; 40+ messages in thread
From: Sagi Grimberg @ 2022-10-19  7:15 UTC (permalink / raw)
  To: Ming Lei, Christoph Hellwig
  Cc: Paul E. McKenney, Chao Leng, linux-nvme, linux-block, kbusch, axboe


>>> Then the big question is "how long do the SRCU readers run?"
>>>
>>> If all of the readers ran for exactly the same duration, there would be
>>> little point in having more than one srcu_struct.
>>
>> The SRCU readers are the I/O dispatch, which will have quite similar
>> runtimes for the different queues.
>>
>>> If the kernel knew up front how long the SRCU readers for a given entry
>>> would run, it could provide an srcu_struct structure for each duration.
>>> For a (fanciful) example, you could have one srcu_struct structure for
>>> SSDs, another for rotating rust, a third for LAN-attached storage, and
>>> a fourth for WAN-attached storage.  Maybe a fifth for lunar-based storage.
>>
>> All the different request_queues in a tag_set are for the same device.
>> There might be some corner cases like storage arrays where they have
>> different latencies.  But we're not even waiting for I/O completion
>> here; this just protects submission.
>>
>>> Does that help, or am I off in the weeds here?
>>
>> I think this was very helpful, and at least moving the srcu_struct
>> to the tag_set sounds like a good idea to explore.
>>
>> Ming, anything I might have missed?
> 
> I think it is fine to move it to the tag_set; this could simplify things
> a lot.
> 
> The effect could be that blk_mq_quiesce_queue() becomes a little
> slower, but it is always in the slow path, and synchronize_srcu() won't
> wait for new read-side critical sections.
> 
> Just one corner case: with BLK_MQ_F_BLOCKING, is there any driver that
> may block too long in ->queue_rq(), such as waiting for dozens of
> seconds?

nvme-tcp will opportunistically try a network send directly from
.queue_rq(), but always with MSG_DONTWAIT, so that is not a problem.

nbd, though, can block in .queue_rq() with blocking network sends; however,
afaict nbd allocates a tagset per nbd_device, and also a request_queue
per device, so it's effectively the same whether the srcu is in the tagset
or in the request_queue.
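
For context, the nvme-tcp pattern is roughly the following (a sketch of
the idea only; nvme_tcp_try_send_inline is a made-up name, not the actual
nvme-tcp code):

	static void nvme_tcp_try_send_inline(struct nvme_tcp_queue *queue,
			struct kvec *vec, size_t len)
	{
		struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
		int ret;

		/* never block in ->queue_rq(): bail out if the socket is full */
		ret = kernel_sendmsg(queue->sock, &msg, vec, 1, len);
		if (ret == -EAGAIN)
			queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
	}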

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v2 1/2] blk-mq: add tagset quiesce interface
  2022-10-19  7:15               ` Sagi Grimberg
@ 2022-10-19  7:25                 ` Christoph Hellwig
  2022-10-19  7:27                   ` Christoph Hellwig
  2022-10-19  7:30                   ` Sagi Grimberg
  0 siblings, 2 replies; 40+ messages in thread
From: Christoph Hellwig @ 2022-10-19  7:25 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Ming Lei, Christoph Hellwig, Paul E. McKenney, Chao Leng,
	linux-nvme, linux-block, kbusch, axboe

On Wed, Oct 19, 2022 at 10:15:26AM +0300, Sagi Grimberg wrote:
> nvme-tcp will opportunistically try a network send directly from
> .queue_rq(), but always with MSG_DONTWAIT, so that is not a problem.
>
> nbd, though, can block in .queue_rq() with blocking network sends; however,
> afaict nbd allocates a tagset per nbd_device, and also a request_queue
> per device, so it's effectively the same whether the srcu is in the tagset
> or in the request_queue.

Yes, the only multiple request_queue per tag_set cases right now
are nvme-tcp and mmc.  There have been patches from iSCSI, but they
seem to be stuck.

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v2 1/2] blk-mq: add tagset quiesce interface
  2022-10-19  7:25                 ` Christoph Hellwig
@ 2022-10-19  7:27                   ` Christoph Hellwig
  2022-10-19  7:30                   ` Sagi Grimberg
  1 sibling, 0 replies; 40+ messages in thread
From: Christoph Hellwig @ 2022-10-19  7:27 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Ming Lei, Christoph Hellwig, Paul E. McKenney, Chao Leng,
	linux-nvme, linux-block, kbusch, axboe

On Wed, Oct 19, 2022 at 09:25:35AM +0200, Christoph Hellwig wrote:
> Yes, the only multiple request_queue per tag_set cases right now
> are nvme-tcp and mmc.  There have been patches from iSCSI, but they
> seem to be stuck.

... and looking at the only blk_mq_quiesce_queue caller in mmc,
it would benefit from a tagset-wide quiesce after a bit of refactoring
as well.

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v2 1/2] blk-mq: add tagset quiesce interface
  2022-10-19  7:25                 ` Christoph Hellwig
  2022-10-19  7:27                   ` Christoph Hellwig
@ 2022-10-19  7:30                   ` Sagi Grimberg
  2022-10-19  7:32                     ` Christoph Hellwig
  1 sibling, 1 reply; 40+ messages in thread
From: Sagi Grimberg @ 2022-10-19  7:30 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Ming Lei, Paul E. McKenney, Chao Leng, linux-nvme, linux-block,
	kbusch, axboe


>> nvme-tcp will opportunistically try a network send directly from
>> .queue_rq(), but always with MSG_DONTWAIT, so that is not a problem.
>>
>> nbd, though, can block in .queue_rq() with blocking network sends; however,
>> afaict nbd allocates a tagset per nbd_device, and also a request_queue
>> per device, so it's effectively the same whether the srcu is in the tagset
>> or in the request_queue.
> 
> Yes, the only multiple request_queue per tag_set cases right now
> are nvme-tcp and mmc.  There have been patches from iSCSI, but they
> seem to be stuck.

iscsi defers network sends to a workqueue anyways, and no scsi lld sets
BLK_MQ_F_BLOCKING.

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v2 1/2] blk-mq: add tagset quiesce interface
  2022-10-19  7:30                   ` Sagi Grimberg
@ 2022-10-19  7:32                     ` Christoph Hellwig
  2022-10-19  7:57                       ` Sagi Grimberg
  0 siblings, 1 reply; 40+ messages in thread
From: Christoph Hellwig @ 2022-10-19  7:32 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Christoph Hellwig, Ming Lei, Paul E. McKenney, Chao Leng,
	linux-nvme, linux-block, kbusch, axboe

On Wed, Oct 19, 2022 at 10:30:44AM +0300, Sagi Grimberg wrote:
> iscsi defers network sends to a workqueue anyways, and no scsi lld sets
> BLK_MQ_F_BLOCKING.

Yes, but Mike had a series a while ago to implement a fast path similar
to nvme-tcp's that included supporting BLK_MQ_F_BLOCKING in the scsi
midlayer.  We'll also need it to convert the paride drivers to libata,
but those are single request_queue per tag_set as far as I know.

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v2 1/2] blk-mq: add tagset quiesce interface
  2022-10-19  7:32                     ` Christoph Hellwig
@ 2022-10-19  7:57                       ` Sagi Grimberg
  2022-10-19  8:17                         ` Christoph Hellwig
  0 siblings, 1 reply; 40+ messages in thread
From: Sagi Grimberg @ 2022-10-19  7:57 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Ming Lei, Paul E. McKenney, Chao Leng, linux-nvme, linux-block,
	kbusch, axboe



On 10/19/22 10:32, Christoph Hellwig wrote:
> On Wed, Oct 19, 2022 at 10:30:44AM +0300, Sagi Grimberg wrote:
>> iscsi defers network sends to a workqueue anyways, and no scsi lld sets
>> BLK_MQ_F_BLOCKING.
> 
> Yes, but Mike had a series a while ago to implement a fast path similar
> to nvme-tcp's that included supporting BLK_MQ_F_BLOCKING in the scsi
> midlayer.

Hmmm, didn't see that, pointer?

>  We'll also need it to convert the paride drivers to libata,
> but those are single request_queue per tag_set as far as I know.

didn't see paride even attempting to quiesce...

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v2 1/2] blk-mq: add tagset quiesce interface
  2022-10-19  7:57                       ` Sagi Grimberg
@ 2022-10-19  8:17                         ` Christoph Hellwig
  2022-10-19  8:29                           ` Sagi Grimberg
  0 siblings, 1 reply; 40+ messages in thread
From: Christoph Hellwig @ 2022-10-19  8:17 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Christoph Hellwig, Ming Lei, Paul E. McKenney, Chao Leng,
	linux-nvme, linux-block, kbusch, axboe

On Wed, Oct 19, 2022 at 10:57:11AM +0300, Sagi Grimberg wrote:
>
>
> On 10/19/22 10:32, Christoph Hellwig wrote:
>> On Wed, Oct 19, 2022 at 10:30:44AM +0300, Sagi Grimberg wrote:
>>> iscsi defers network sends to a workqueue anyways, and no scsi lld sets
>>> BLK_MQ_F_BLOCKING.
>>
>> Yes, but Mike had a series a while ago to implement a fast path similar
>> to nvme-tcp's that included supporting BLK_MQ_F_BLOCKING in the scsi
>> midlayer.
>
> Hmmm, didn't see that, pointer?

https://www.spinics.net/lists/linux-scsi/msg170849.html

>>  We'll also need it to convert the paride drivers to libata,
>> but those are single request_queue per tag_set as far as I know.
>
> didn't see paride even attempting to quiesce...

The block layer might.  But it really does not matter for our
considerations here..

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v2 1/2] blk-mq: add tagset quiesce interface
  2022-10-19  8:17                         ` Christoph Hellwig
@ 2022-10-19  8:29                           ` Sagi Grimberg
  0 siblings, 0 replies; 40+ messages in thread
From: Sagi Grimberg @ 2022-10-19  8:29 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Ming Lei, Paul E. McKenney, Chao Leng, linux-nvme, linux-block,
	kbusch, axboe


>>>> iscsi defers network sends to a workqueue anyways, and no scsi lld sets
>>>> BLK_MQ_F_BLOCKING.
>>>
>>> Yes, but Mike had a series a while ago to implement a fast path similar
>>> to nvme-tcp's that included supporting BLK_MQ_F_BLOCKING in the scsi
>>> midlayer.
>>
>> Hmmm, didn't see that, pointer?
> 
> https://www.spinics.net/lists/linux-scsi/msg170849.html

Nice.

>>>   We'll also need it to convert the paride drivers to libata,
>>> but those are single request_queue per tag_set as far as I know.
>>
>> didn't see paride even attempting to quiesce...
> 
> The block layer might.  But it really does not matter for our
> considerations here..

Right.

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v2 0/2] improve nvme quiesce time for large amount of namespaces
  2022-10-20  6:34 ` Christoph Hellwig
@ 2022-10-20  9:36   ` Chao Leng
  0 siblings, 0 replies; 40+ messages in thread
From: Chao Leng @ 2022-10-20  9:36 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-nvme, linux-block, sagi, kbusch, axboe, ming.lei, paulmck



On 2022/10/20 14:34, Christoph Hellwig wrote:
> I thought the conclusion from last round was to move the srcu_struct to
> the tagset?
If we move the srcu_struct to the tagset, we may lose flexibility.
I am not sure that it is good for other modules currently using blk_mq_quiesce_queue.
Also, I am not sure whether there will be a future scenario where blk_mq_quiesce_queue
has to be used, and whether the change would suit such a scenario.
It is an acceptable cost to allocate a temporary array for SRCU: the maximum
memory size is actually a few hundred KB, and most of the time it's less than
a few KB.
So I did not move the srcu_struct to the tagset in patch V3.

Exploring a move of the srcu_struct to the tag_set sounds like a good idea,
but we may need more time to analyze it.
I suggest doing that optimization in a separate patch set.
> 
> .
> 

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH v2 0/2] improve nvme quiesce time for large amount of namespaces
  2022-10-20  3:53 Chao Leng
@ 2022-10-20  6:34 ` Christoph Hellwig
  2022-10-20  9:36   ` Chao Leng
  0 siblings, 1 reply; 40+ messages in thread
From: Christoph Hellwig @ 2022-10-20  6:34 UTC (permalink / raw)
  To: Chao Leng
  Cc: linux-nvme, linux-block, hch, sagi, kbusch, axboe, ming.lei, paulmck

I thought the conclusion from last round was to move the srcu_struct to
the tagset?

^ permalink raw reply	[flat|nested] 40+ messages in thread

* [PATCH v2 0/2] improve nvme quiesce time for large amount of namespaces
@ 2022-10-20  3:53 Chao Leng
  2022-10-20  6:34 ` Christoph Hellwig
  0 siblings, 1 reply; 40+ messages in thread
From: Chao Leng @ 2022-10-20  3:53 UTC (permalink / raw)
  To: linux-nvme, linux-block
  Cc: hch, sagi, kbusch, lengchao, axboe, ming.lei, paulmck

Currently nvme_stop_queues quiesces all queues one by one, and every queue
must wait out a grace period (rcu or srcu). If the controller has a large
number of namespaces, the total waiting time is very long.
Test result: the total waiting time is more than 20 seconds when the
controller has 256 namespaces.

This set improves the quiesce time when using a large set of namespaces,
which also improves I/O failover time in a multipath environment.

We improve both the non-blocking and the blocking tagset case by introducing
blk_mq_[un]quiesce_tagset, which works on all request queues of a given
tagset in parallel (nvme being such a case, with many queues sharing one
tagset).

Changes from v2:
- replace call_srcu with start_poll_synchronize_srcu
- rename the flag to make it accurate
- move the queue unquiesce out of nvme_set_queue_dying
- add a mutex to ensure that all queues are quiesced after NVME_CTRL_STOPPED
  is set (a rough sketch of this follows the changelog)

Changes from v1:
- the improvement is based on the tagset rather than namespaces
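
A rough sketch of the mutex/flag change mentioned above (NVME_CTRL_STOPPED,
ctrl->stop_lock and the function shapes are assumptions based on this
changelog, not the posted code):

	void nvme_stop_queues(struct nvme_ctrl *ctrl)
	{
		mutex_lock(&ctrl->stop_lock);
		if (!test_and_set_bit(NVME_CTRL_STOPPED, &ctrl->flags))
			blk_mq_quiesce_tagset(ctrl->tagset);
		mutex_unlock(&ctrl->stop_lock);
	}

	void nvme_start_queues(struct nvme_ctrl *ctrl)
	{
		mutex_lock(&ctrl->stop_lock);
		if (test_and_clear_bit(NVME_CTRL_STOPPED, &ctrl->flags))
			blk_mq_unquiesce_tagset(ctrl->tagset);
		mutex_unlock(&ctrl->stop_lock);
	}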

Chao Leng (2):
  blk-mq: add tagset quiesce interface
  nvme: use blk_mq_[un]quiesce_tagset

 block/blk-mq.c           | 76 ++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/nvme/host/core.c | 57 +++++++++++++++---------------------
 drivers/nvme/host/nvme.h |  3 +-
 include/linux/blk-mq.h   |  2 ++
 include/linux/blkdev.h   |  3 ++
 5 files changed, 106 insertions(+), 35 deletions(-)

-- 
2.16.4


^ permalink raw reply	[flat|nested] 40+ messages in thread

end of thread, other threads:[~2022-10-20  9:36 UTC | newest]

Thread overview: 40+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-10-13  9:44 [PATCH v2 0/2] improve nvme quiesce time for large amount of namespaces Chao Leng
2022-10-13  9:44 ` [PATCH v2 1/2] blk-mq: add tagset quiesce interface Chao Leng
2022-10-13 10:28   ` Sagi Grimberg
2022-10-14  2:09     ` Chao Leng
2022-10-17 13:43     ` Christoph Hellwig
2022-10-18  9:51       ` Chao Leng
2022-10-17 13:39   ` Christoph Hellwig
2022-10-17 13:42     ` Christoph Hellwig
2022-10-18  8:39       ` Sagi Grimberg
2022-10-18  8:55         ` Christoph Hellwig
2022-10-18  9:06           ` Sagi Grimberg
2022-10-18 11:05             ` Christoph Hellwig
2022-10-18  9:52       ` Chao Leng
2022-10-17 15:21     ` Paul E. McKenney
2022-10-17 15:31       ` Christoph Hellwig
2022-10-17 22:41         ` Paul E. McKenney
2022-10-18  5:19           ` Christoph Hellwig
2022-10-19  0:35             ` Ming Lei
2022-10-19  7:15               ` Sagi Grimberg
2022-10-19  7:25                 ` Christoph Hellwig
2022-10-19  7:27                   ` Christoph Hellwig
2022-10-19  7:30                   ` Sagi Grimberg
2022-10-19  7:32                     ` Christoph Hellwig
2022-10-19  7:57                       ` Sagi Grimberg
2022-10-19  8:17                         ` Christoph Hellwig
2022-10-19  8:29                           ` Sagi Grimberg
2022-10-18  9:52       ` Chao Leng
2022-10-18 15:04         ` Paul E. McKenney
2022-10-19  2:39           ` Chao Leng
2022-10-18  9:52     ` Chao Leng
2022-10-13  9:44 ` [PATCH v2 2/2] nvme: use blk_mq_[un]quiesce_tagset Chao Leng
2022-10-13 10:22   ` Sagi Grimberg
2022-10-14  2:09     ` Chao Leng
2022-10-17 13:48     ` Christoph Hellwig
2022-10-13 14:32 ` [PATCH v2 0/2] improve nvme quiesce time for large amount of namespaces Chaitanya Kulkarni
2022-10-14  2:12   ` Chao Leng
2022-10-15  0:30     ` Chaitanya Kulkarni
2022-10-20  3:53 Chao Leng
2022-10-20  6:34 ` Christoph Hellwig
2022-10-20  9:36   ` Chao Leng
