[PATCH v4 0/2] improve quiesce time for large amount of namespaces

linux-nvme.lists.infradead.org archive mirror
 help / color / mirror / Atom feed

* [PATCH v4 0/2] improve quiesce time for large amount of namespaces
@ 2020-07-27 22:07 Sagi Grimberg
  2020-07-27 22:07 ` [PATCH v4 1/2] blk-mq: add async quiesce interface Sagi Grimberg
  2020-07-27 22:07 ` [PATCH v4 2/2] nvme: improve quiesce time for large amount of namespaces Sagi Grimberg
  0 siblings, 2 replies; 5+ messages in thread
From: Sagi Grimberg @ 2020-07-27 22:07 UTC (permalink / raw)
  To: linux-nvme, Christoph Hellwig, Keith Busch, Jens Axboe
  Cc: linux-block, Ming Lin, Chao Leng

This set improves the quiesce time when using a large set of
namespaces, which also improves I/O failover time in a multipath environment.

We improve this for both non-blocking hctxs (e.g. pci, fc, rdma nvme transports)
and blocking hctxs (e.g. tcp nvme transport) by splitting queue quiesce into
blk_mq_quiesce_queue_async, which issues call_(s)rcu, and
blk_mq_quiesce_queue_async_wait, which waits for it to complete. These calls
are meant to be issued in parallel for different queues.

Changes from v3:
- make hctx->rcu_sync dynamically allocated from the heap instead
  of a static member

Changes from v2:
- made blk_mq_quiesce_queue_async operate on both blocking and
  non-blocking hctxs.
- removed separation between blocking vs. non-blocking queues
- dropped patch from Chao
- dropped nvme-rdma test patch

Changes from v1:
- trivial typo fixes

Sagi Grimberg (2):
  blk-mq: add async quiesce interface
  nvme: improve quiesce time for large amount of namespaces

 block/blk-mq.c           | 46 ++++++++++++++++++++++++++++++++++++++++
 drivers/nvme/host/core.c |  4 +++-
 include/linux/blk-mq.h   |  4 ++++
 3 files changed, 53 insertions(+), 1 deletion(-)

-- 
2.25.1


_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH v4 1/2] blk-mq: add async quiesce interface
  2020-07-27 22:07 [PATCH v4 0/2] improve quiesce time for large amount of namespaces Sagi Grimberg
@ 2020-07-27 22:07 ` Sagi Grimberg
  2020-07-27 22:13   ` Jens Axboe
  2020-07-27 22:07 ` [PATCH v4 2/2] nvme: improve quiesce time for large amount of namespaces Sagi Grimberg
  1 sibling, 1 reply; 5+ messages in thread
From: Sagi Grimberg @ 2020-07-27 22:07 UTC (permalink / raw)
  To: linux-nvme, Christoph Hellwig, Keith Busch, Jens Axboe
  Cc: linux-block, Ming Lin, Chao Leng

Drivers may have to quiesce a large number of request queues at once
(e.g. on controller or adapter reset). These drivers would benefit from an
async quiesce interface such that they can trigger quiesce asynchronously
and wait for all in parallel.

This leaves the synchronization responsibility to the driver, but adds
a convenient interface to quiesce async and wait in a single pass.

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 block/blk-mq.c         | 46 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/blk-mq.h |  4 ++++
 2 files changed, 50 insertions(+)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index abcf590f6238..d913924117d2 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -209,6 +209,52 @@ void blk_mq_quiesce_queue_nowait(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
 
+void blk_mq_quiesce_queue_async(struct request_queue *q)
+{
+	struct blk_mq_hw_ctx *hctx;
+	unsigned int i;
+	int rcu = false;
+
+	blk_mq_quiesce_queue_nowait(q);
+
+	queue_for_each_hw_ctx(q, hctx, i) {
+		hctx->rcu_sync = kmalloc(sizeof(*hctx->rcu_sync), GFP_KERNEL);
+		if (!hctx->rcu_sync) {
+			/* fallback to serial rcu sync */
+			if (hctx->flags & BLK_MQ_F_BLOCKING)
+				synchronize_srcu(hctx->srcu);
+			else
+				rcu = true;
+		} else {
+			init_completion(&hctx->rcu_sync->completion);
+			init_rcu_head(&hctx->rcu_sync->head);
+			if (hctx->flags & BLK_MQ_F_BLOCKING)
+				call_srcu(hctx->srcu, &hctx->rcu_sync->head,
+					wakeme_after_rcu);
+			else
+				call_rcu(&hctx->rcu_sync->head,
+					wakeme_after_rcu);
+		}
+	}
+	if (rcu)
+		synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_async);
+
+void blk_mq_quiesce_queue_async_wait(struct request_queue *q)
+{
+	struct blk_mq_hw_ctx *hctx;
+	unsigned int i;
+
+	queue_for_each_hw_ctx(q, hctx, i) {
+		if (!hctx->rcu_sync)
+			continue;
+		wait_for_completion(&hctx->rcu_sync->completion);
+		destroy_rcu_head(&hctx->rcu_sync->head);
+	}
+}
+EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_async_wait);
+
 /**
  * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
  * @q: request queue.
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 23230c1d031e..7213ce56bb31 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -5,6 +5,7 @@
 #include <linux/blkdev.h>
 #include <linux/sbitmap.h>
 #include <linux/srcu.h>
+#include <linux/rcupdate_wait.h>
 
 struct blk_mq_tags;
 struct blk_flush_queue;
@@ -170,6 +171,7 @@ struct blk_mq_hw_ctx {
 	 */
 	struct list_head	hctx_list;
 
+	struct rcu_synchronize	*rcu_sync;
 	/**
 	 * @srcu: Sleepable RCU. Use as lock when type of the hardware queue is
 	 * blocking (BLK_MQ_F_BLOCKING). Must be the last member - see also
@@ -532,6 +534,8 @@ int blk_mq_map_queues(struct blk_mq_queue_map *qmap);
 void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);
 
 void blk_mq_quiesce_queue_nowait(struct request_queue *q);
+void blk_mq_quiesce_queue_async(struct request_queue *q);
+void blk_mq_quiesce_queue_async_wait(struct request_queue *q);
 
 unsigned int blk_mq_rq_cpu(struct request *rq);
 
-- 
2.25.1


_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH v4 2/2] nvme: improve quiesce time for large amount of namespaces
  2020-07-27 22:07 [PATCH v4 0/2] improve quiesce time for large amount of namespaces Sagi Grimberg
  2020-07-27 22:07 ` [PATCH v4 1/2] blk-mq: add async quiesce interface Sagi Grimberg
@ 2020-07-27 22:07 ` Sagi Grimberg
  1 sibling, 0 replies; 5+ messages in thread
From: Sagi Grimberg @ 2020-07-27 22:07 UTC (permalink / raw)
  To: linux-nvme, Christoph Hellwig, Keith Busch, Jens Axboe
  Cc: linux-block, Ming Lin, Chao Leng

nvme currently synchronizes queue quiesce for each namespace one at a time.
This can slow down failover time (which first quiesces all ns queues) if we
have a large number of namespaces. Instead, we want to use an async interface
and do the namespaces quiesce in parallel rather than serially.

Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/nvme/host/core.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 05aa568a60af..e8cc728dee46 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4561,7 +4561,9 @@ void nvme_stop_queues(struct nvme_ctrl *ctrl)
 
 	down_read(&ctrl->namespaces_rwsem);
 	list_for_each_entry(ns, &ctrl->namespaces, list)
-		blk_mq_quiesce_queue(ns->queue);
+		blk_mq_quiesce_queue_async(ns->queue);
+	list_for_each_entry(ns, &ctrl->namespaces, list)
+		blk_mq_quiesce_queue_async_wait(ns->queue);
 	up_read(&ctrl->namespaces_rwsem);
 }
 EXPORT_SYMBOL_GPL(nvme_stop_queues);
-- 
2.25.1


_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH v4 1/2] blk-mq: add async quiesce interface
  2020-07-27 22:07 ` [PATCH v4 1/2] blk-mq: add async quiesce interface Sagi Grimberg
@ 2020-07-27 22:13   ` Jens Axboe
  2020-07-27 22:37     ` Sagi Grimberg
  0 siblings, 1 reply; 5+ messages in thread
From: Jens Axboe @ 2020-07-27 22:13 UTC (permalink / raw)
  To: Sagi Grimberg, linux-nvme, Christoph Hellwig, Keith Busch
  Cc: linux-block, Ming Lin, Chao Leng

On 7/27/20 4:07 PM, Sagi Grimberg wrote:
> Drivers that may have to quiesce a large amount of request queues at once
> (e.g. controller or adapter reset). These drivers would benefit from an
> async quiesce interface such that the can trigger quiesce asynchronously
> and wait for all in parallel.
> 
> This leaves the synchronization responsibility to the driver, but adds
> a convenient interface to quiesce async and wait in a single pass.
> 
> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
> ---
>  block/blk-mq.c         | 46 ++++++++++++++++++++++++++++++++++++++++++
>  include/linux/blk-mq.h |  4 ++++
>  2 files changed, 50 insertions(+)
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index abcf590f6238..d913924117d2 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -209,6 +209,52 @@ void blk_mq_quiesce_queue_nowait(struct request_queue *q)
>  }
>  EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
>  
> +void blk_mq_quiesce_queue_async(struct request_queue *q)
> +{
> +	struct blk_mq_hw_ctx *hctx;
> +	unsigned int i;
> +	int rcu = false;
> +
> +	blk_mq_quiesce_queue_nowait(q);
> +
> +	queue_for_each_hw_ctx(q, hctx, i) {
> +		hctx->rcu_sync = kmalloc(sizeof(*hctx->rcu_sync), GFP_KERNEL);
> +		if (!hctx->rcu_sync) {
> +			/* fallback to serial rcu sync */
> +			if (hctx->flags & BLK_MQ_F_BLOCKING)
> +				synchronize_srcu(hctx->srcu);
> +			else
> +				rcu = true;
> +		} else {
> +			init_completion(&hctx->rcu_sync->completion);
> +			init_rcu_head(&hctx->rcu_sync->head);
> +			if (hctx->flags & BLK_MQ_F_BLOCKING)
> +				call_srcu(hctx->srcu, &hctx->rcu_sync->head,
> +					wakeme_after_rcu);
> +			else
> +				call_rcu(&hctx->rcu_sync->head,
> +					wakeme_after_rcu);
> +		}
> +	}
> +	if (rcu)
> +		synchronize_rcu();
> +}
> +EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_async);

This won't always be async, and that might matter to some users. I think
it'd be better to put the fallback path into the _wait() part instead,
since the caller should expect that to be blocking/waiting as the name
implies.

Nit picking, but...

-- 
Jens Axboe


_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v4 1/2] blk-mq: add async quiesce interface
  2020-07-27 22:13   ` Jens Axboe
@ 2020-07-27 22:37     ` Sagi Grimberg
  0 siblings, 0 replies; 5+ messages in thread
From: Sagi Grimberg @ 2020-07-27 22:37 UTC (permalink / raw)
  To: Jens Axboe, linux-nvme, Christoph Hellwig, Keith Busch
  Cc: linux-block, Ming Lin, Chao Leng


>> +void blk_mq_quiesce_queue_async(struct request_queue *q)
>> +{
>> +	struct blk_mq_hw_ctx *hctx;
>> +	unsigned int i;
>> +	int rcu = false;
>> +
>> +	blk_mq_quiesce_queue_nowait(q);
>> +
>> +	queue_for_each_hw_ctx(q, hctx, i) {
>> +		hctx->rcu_sync = kmalloc(sizeof(*hctx->rcu_sync), GFP_KERNEL);
>> +		if (!hctx->rcu_sync) {
>> +			/* fallback to serial rcu sync */
>> +			if (hctx->flags & BLK_MQ_F_BLOCKING)
>> +				synchronize_srcu(hctx->srcu);
>> +			else
>> +				rcu = true;
>> +		} else {
>> +			init_completion(&hctx->rcu_sync->completion);
>> +			init_rcu_head(&hctx->rcu_sync->head);
>> +			if (hctx->flags & BLK_MQ_F_BLOCKING)
>> +				call_srcu(hctx->srcu, &hctx->rcu_sync->head,
>> +					wakeme_after_rcu);
>> +			else
>> +				call_rcu(&hctx->rcu_sync->head,
>> +					wakeme_after_rcu);
>> +		}
>> +	}
>> +	if (rcu)
>> +		synchronize_rcu();
>> +}
>> +EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_async);
> 
> This won't always be async, and that might matter to some users. I think
> it'd be better to put the fallback path into the _wait() part instead,
> since the caller should expect that to be blocking/waiting as the name
> implies.
> 
> Nit picking, but...

Makes sense..

I thought more about Keith's suggestion for an interface that accepts a
tagset. It allows us to decide what we do based on the tagset itself
which is now passed in the interface.

What do you think about:
--
diff --git a/block/blk-mq.c b/block/blk-mq.c
index abcf590f6238..d4b24aa1a766 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -209,6 +209,43 @@ void blk_mq_quiesce_queue_nowait(struct 
request_queue *q)
  }
  EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);

+static void blk_mq_quiesce_queue_async(struct request_queue *q)
+{
+       struct blk_mq_hw_ctx *hctx;
+       unsigned int i;
+
+       blk_mq_quiesce_queue_nowait(q);
+
+       queue_for_each_hw_ctx(q, hctx, i) {
+               if (!(hctx->flags & BLK_MQ_F_BLOCKING))
+                       continue;
+
+               hctx->rcu_sync = kmalloc(sizeof(*hctx->rcu_sync), 
GFP_KERNEL);
+               if (!hctx->rcu_sync)
+                       continue;
+
+               init_completion(&hctx->rcu_sync->completion);
+               init_rcu_head(&hctx->rcu_sync->head);
+               call_srcu(hctx->srcu, &hctx->rcu_sync->head,
+                               wakeme_after_rcu);
+       }
+}
+
+static void blk_mq_quiesce_queue_async_wait(struct request_queue *q)
+{
+       struct blk_mq_hw_ctx *hctx;
+       unsigned int i;
+
+       queue_for_each_hw_ctx(q, hctx, i) {
+               if (!hctx->rcu_sync) {
+                       synchronize_srcu(hctx->srcu);
+                       continue;
+               }
+               wait_for_completion(&hctx->rcu_sync->completion);
+               destroy_rcu_head(&hctx->rcu_sync->head);
+       }
+}
+
  /**
   * blk_mq_quiesce_queue() - wait until all ongoing dispatches have 
finished
   * @q: request queue.
@@ -2884,6 +2921,39 @@ static void queue_set_hctx_shared(struct 
request_queue *q, bool shared)
         }
  }

+void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set)
+{
+       struct request_queue *q;
+
+       mutex_lock(&set->tag_list_lock);
+       list_for_each_entry(q, &set->tag_list, tag_set_list) {
+               if (!(set->flags & BLK_MQ_F_BLOCKING))
+                       blk_mq_quiesce_queue_nowait(q);
+               else
+                       blk_mq_quiesce_queue_async(q);
+       }
+
+       if (!(set->flags & BLK_MQ_F_BLOCKING)) {
+               synchronize_rcu();
+       } else {
+               list_for_each_entry(q, &set->tag_list, tag_set_list)
+                       blk_mq_quiesce_queue_async_wait(q);
+       }
+       mutex_unlock(&set->tag_list_lock);
+}
+EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset);
+
+void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set)
+{
+       struct request_queue *q;
+
+       mutex_lock(&set->tag_list_lock);
+       list_for_each_entry(q, &set->tag_list, tag_set_list)
+               blk_mq_unquiesce_queue(q);
+       mutex_unlock(&set->tag_list_lock);
+}
+EXPORT_SYMBOL_GPL(blk_mq_unquiesce_tagset);
+
  static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set,
                                         bool shared)
  {
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 23230c1d031e..a85f2dedc947 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -5,6 +5,7 @@
  #include <linux/blkdev.h>
  #include <linux/sbitmap.h>
  #include <linux/srcu.h>
+#include <linux/rcupdate_wait.h>

  struct blk_mq_tags;
  struct blk_flush_queue;
@@ -170,6 +171,7 @@ struct blk_mq_hw_ctx {
          */
         struct list_head        hctx_list;

+       struct rcu_synchronize  *rcu_sync;
         /**
          * @srcu: Sleepable RCU. Use as lock when type of the hardware 
queue is
          * blocking (BLK_MQ_F_BLOCKING). Must be the last member - see also
@@ -532,6 +534,8 @@ int blk_mq_map_queues(struct blk_mq_queue_map *qmap);
  void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int 
nr_hw_queues);

  void blk_mq_quiesce_queue_nowait(struct request_queue *q);
+void blk_mq_quiesce_tagset(struct request_queue *q);
+void blk_mq_unquiesce_tagset(struct request_queue *q);

  unsigned int blk_mq_rq_cpu(struct request *rq);
--


And then nvme will use it:
--
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 05aa568a60af..c41df20996d7 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4557,23 +4557,13 @@ EXPORT_SYMBOL_GPL(nvme_start_freeze);

  void nvme_stop_queues(struct nvme_ctrl *ctrl)
  {
-       struct nvme_ns *ns;
-
-       down_read(&ctrl->namespaces_rwsem);
-       list_for_each_entry(ns, &ctrl->namespaces, list)
-               blk_mq_quiesce_queue(ns->queue);
-       up_read(&ctrl->namespaces_rwsem);
+       blk_mq_quiesce_tagset(ctrl->tagset);
  }
  EXPORT_SYMBOL_GPL(nvme_stop_queues);

  void nvme_start_queues(struct nvme_ctrl *ctrl)
  {
-       struct nvme_ns *ns;
-
-       down_read(&ctrl->namespaces_rwsem);
-       list_for_each_entry(ns, &ctrl->namespaces, list)
-               blk_mq_unquiesce_queue(ns->queue);
-       up_read(&ctrl->namespaces_rwsem);
+       blk_mq_unquiesce_tagset(ctrl->tagset);
  }
  EXPORT_SYMBOL_GPL(nvme_start_queues);
--

Thoughts?

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme

^ permalink raw reply related	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2020-07-27 22:37 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-07-27 22:07 [PATCH v4 0/2] improve quiesce time for large amount of namespaces Sagi Grimberg
2020-07-27 22:07 ` [PATCH v4 1/2] blk-mq: add async quiesce interface Sagi Grimberg
2020-07-27 22:13   ` Jens Axboe
2020-07-27 22:37     ` Sagi Grimberg
2020-07-27 22:07 ` [PATCH v4 2/2] nvme: improve quiesce time for large amount of namespaces Sagi Grimberg

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).