linux-scsi.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/5] blk-mq: quiesce improvement
@ 2021-11-19  2:18 Ming Lei
  2021-11-19  2:18 ` [PATCH 1/5] blk-mq: move srcu from blk_mq_hw_ctx to request_queue Ming Lei
                   ` (4 more replies)
  0 siblings, 5 replies; 28+ messages in thread
From: Ming Lei @ 2021-11-19  2:18 UTC (permalink / raw)
  To: Christoph Hellwig, Jens Axboe, Martin K . Petersen
  Cc: Sagi Grimberg, linux-block, linux-nvme, linux-scsi, Keith Busch,
	Ming Lei

Hi,

The first two patches move srcu from blk_mq_hw_ctx to request_queue.

The other patches add one new helper for supporting quiesce in parallel.


Ming Lei (5):
  blk-mq: move srcu from blk_mq_hw_ctx to request_queue
  blk-mq: rename hctx_lock & hctx_unlock
  blk-mq: add helper of blk_mq_global_quiesce_wait()
  nvme: quiesce namespace queue in parallel
  scsi: use blk-mq quiesce APIs to implement scsi_host_block

 block/blk-core.c         | 23 +++++++++---
 block/blk-mq-sysfs.c     |  2 --
 block/blk-mq.c           | 78 +++++++++++++++++-----------------------
 block/blk-sysfs.c        |  3 +-
 block/blk.h              | 10 +++++-
 block/genhd.c            |  2 +-
 drivers/nvme/host/core.c |  9 +++--
 drivers/scsi/scsi_lib.c  | 16 ++++-----
 include/linux/blk-mq.h   | 21 ++++++-----
 include/linux/blkdev.h   |  8 +++++
 10 files changed, 98 insertions(+), 74 deletions(-)

-- 
2.31.1


^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH 1/5] blk-mq: move srcu from blk_mq_hw_ctx to request_queue
  2021-11-19  2:18 [PATCH 0/5] blk-mq: quiesce improvement Ming Lei
@ 2021-11-19  2:18 ` Ming Lei
  2021-11-19  4:30   ` Bart Van Assche
  2021-11-19  2:18 ` [PATCH 2/5] blk-mq: rename hctx_lock & hctx_unlock Ming Lei
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 28+ messages in thread
From: Ming Lei @ 2021-11-19  2:18 UTC (permalink / raw)
  To: Christoph Hellwig, Jens Axboe, Martin K . Petersen
  Cc: Sagi Grimberg, linux-block, linux-nvme, linux-scsi, Keith Busch,
	Ming Lei

In case of BLK_MQ_F_BLOCKING, per-hctx srcu is used to protect the
dispatch critical area. However, this srcu instance stays at the end of
hctx, and it often occupies a standalone cacheline, which is often cold.

Inside srcu_read_lock() and srcu_read_unlock(), the WRITE is always done
on an indirect percpu variable which is allocated from the heap instead
of being embedded, and srcu->srcu_idx is only read in srcu_read_lock().
So it doesn't matter whether the srcu structure stays in the hctx or in
the request queue.

So switch to per-request-queue srcu for protecting dispatch; this
simplifies quiesce a lot, not to mention that quiesce is always done at
request-queue granularity anyway.

io_uring randread IO test shows that IOPS is basically not affected by this
change on null_blk(g_blocking, MQ).

Cc: Keith Busch <kbusch@kernel.org>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-core.c       | 23 +++++++++++++++++----
 block/blk-mq-sysfs.c   |  2 --
 block/blk-mq.c         | 46 ++++++++++++------------------------------
 block/blk-sysfs.c      |  3 ++-
 block/blk.h            | 10 ++++++++-
 block/genhd.c          |  2 +-
 include/linux/blk-mq.h |  8 --------
 include/linux/blkdev.h |  8 ++++++++
 8 files changed, 52 insertions(+), 50 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index ee54b34d5e99..aed14485a932 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -64,6 +64,7 @@ DEFINE_IDA(blk_queue_ida);
  * For queue allocation
  */
 struct kmem_cache *blk_requestq_cachep;
+struct kmem_cache *blk_requestq_srcu_cachep;
 
 /*
  * Controlling structure to kblockd
@@ -433,21 +434,25 @@ static void blk_timeout_work(struct work_struct *work)
 {
 }
 
-struct request_queue *blk_alloc_queue(int node_id)
+struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
 {
 	struct request_queue *q;
 	int ret;
 
-	q = kmem_cache_alloc_node(blk_requestq_cachep,
-				GFP_KERNEL | __GFP_ZERO, node_id);
+	q = kmem_cache_alloc_node(blk_get_queue_kmem_cache(alloc_srcu),
+			GFP_KERNEL | __GFP_ZERO, node_id);
 	if (!q)
 		return NULL;
 
+	q->alloc_srcu = alloc_srcu;
+	if (alloc_srcu && init_srcu_struct(q->srcu) != 0)
+		goto fail_q;
+
 	q->last_merge = NULL;
 
 	q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL);
 	if (q->id < 0)
-		goto fail_q;
+		goto fail_srcu;
 
 	ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, 0);
 	if (ret)
@@ -504,6 +509,9 @@ struct request_queue *blk_alloc_queue(int node_id)
 	bioset_exit(&q->bio_split);
 fail_id:
 	ida_simple_remove(&blk_queue_ida, q->id);
+fail_srcu:
+	if (q->alloc_srcu)
+		cleanup_srcu_struct(q->srcu);
 fail_q:
 	kmem_cache_free(blk_requestq_cachep, q);
 	return NULL;
@@ -1305,6 +1313,9 @@ int __init blk_dev_init(void)
 			sizeof_field(struct request, cmd_flags));
 	BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
 			sizeof_field(struct bio, bi_opf));
+	BUILD_BUG_ON(ALIGN(offsetof(struct request_queue, srcu),
+			   __alignof__(struct request_queue)) !=
+		     sizeof(struct request_queue));
 
 	/* used for unplugging and affects IO latency/throughput - HIGHPRI */
 	kblockd_workqueue = alloc_workqueue("kblockd",
@@ -1315,6 +1326,10 @@ int __init blk_dev_init(void)
 	blk_requestq_cachep = kmem_cache_create("request_queue",
 			sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
 
+	blk_requestq_srcu_cachep = kmem_cache_create("request_queue_srcu",
+			sizeof(struct request_queue) +
+			sizeof(struct srcu_struct), 0, SLAB_PANIC, NULL);
+
 	blk_debugfs_root = debugfs_create_dir("block", NULL);
 
 	return 0;
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 253c857cba47..674786574075 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -36,8 +36,6 @@ static void blk_mq_hw_sysfs_release(struct kobject *kobj)
 	struct blk_mq_hw_ctx *hctx = container_of(kobj, struct blk_mq_hw_ctx,
 						  kobj);
 
-	if (hctx->flags & BLK_MQ_F_BLOCKING)
-		cleanup_srcu_struct(hctx->srcu);
 	blk_free_flush_queue(hctx->fq);
 	sbitmap_free(&hctx->ctx_map);
 	free_cpumask_var(hctx->cpumask);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 1feb9ab65f28..9728a571b009 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -260,17 +260,11 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
  */
 void blk_mq_wait_quiesce_done(struct request_queue *q)
 {
-	struct blk_mq_hw_ctx *hctx;
-	unsigned int i;
-	bool rcu = false;
-
-	queue_for_each_hw_ctx(q, hctx, i) {
-		if (hctx->flags & BLK_MQ_F_BLOCKING)
-			synchronize_srcu(hctx->srcu);
-		else
-			rcu = true;
-	}
-	if (rcu)
+	WARN_ON_ONCE(q->alloc_srcu != !!(q->tag_set->flags &
+				BLK_MQ_F_BLOCKING));
+	if (q->alloc_srcu)
+		synchronize_srcu(q->srcu);
+	else
 		synchronize_rcu();
 }
 EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done);
@@ -1082,16 +1076,16 @@ void blk_mq_complete_request(struct request *rq)
 }
 EXPORT_SYMBOL(blk_mq_complete_request);
 
-static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
+static inline void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
 	__releases(hctx->srcu)
 {
 	if (!(hctx->flags & BLK_MQ_F_BLOCKING))
 		rcu_read_unlock();
 	else
-		srcu_read_unlock(hctx->srcu, srcu_idx);
+		srcu_read_unlock(hctx->queue->srcu, srcu_idx);
 }
 
-static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
+static inline void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
 	__acquires(hctx->srcu)
 {
 	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
@@ -1099,7 +1093,7 @@ static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
 		*srcu_idx = 0;
 		rcu_read_lock();
 	} else
-		*srcu_idx = srcu_read_lock(hctx->srcu);
+		*srcu_idx = srcu_read_lock(hctx->queue->srcu);
 }
 
 /**
@@ -3515,20 +3509,6 @@ static void blk_mq_exit_hw_queues(struct request_queue *q,
 	}
 }
 
-static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
-{
-	int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
-
-	BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
-			   __alignof__(struct blk_mq_hw_ctx)) !=
-		     sizeof(struct blk_mq_hw_ctx));
-
-	if (tag_set->flags & BLK_MQ_F_BLOCKING)
-		hw_ctx_size += sizeof(struct srcu_struct);
-
-	return hw_ctx_size;
-}
-
 static int blk_mq_init_hctx(struct request_queue *q,
 		struct blk_mq_tag_set *set,
 		struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
@@ -3566,7 +3546,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
 	struct blk_mq_hw_ctx *hctx;
 	gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
 
-	hctx = kzalloc_node(blk_mq_hw_ctx_size(set), gfp, node);
+	hctx = kzalloc_node(sizeof(struct blk_mq_hw_ctx), gfp, node);
 	if (!hctx)
 		goto fail_alloc_hctx;
 
@@ -3608,8 +3588,6 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
 	if (!hctx->fq)
 		goto free_bitmap;
 
-	if (hctx->flags & BLK_MQ_F_BLOCKING)
-		init_srcu_struct(hctx->srcu);
 	blk_mq_hctx_kobj_init(hctx);
 
 	return hctx;
@@ -3945,7 +3923,7 @@ static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
 	struct request_queue *q;
 	int ret;
 
-	q = blk_alloc_queue(set->numa_node);
+	q = blk_alloc_queue(set->numa_node, set->flags & BLK_MQ_F_BLOCKING);
 	if (!q)
 		return ERR_PTR(-ENOMEM);
 	q->queuedata = queuedata;
@@ -4094,6 +4072,8 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
 int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
 		struct request_queue *q)
 {
+	WARN_ON_ONCE(q->alloc_srcu != !!(set->flags & BLK_MQ_F_BLOCKING));
+
 	/* mark the queue as mq asap */
 	q->mq_ops = set->ops;
 
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index cef1f713370b..2b79845a581d 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -734,7 +734,8 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head)
 {
 	struct request_queue *q = container_of(rcu_head, struct request_queue,
 					       rcu_head);
-	kmem_cache_free(blk_requestq_cachep, q);
+
+	kmem_cache_free(blk_get_queue_kmem_cache(q->alloc_srcu), q);
 }
 
 /* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */
diff --git a/block/blk.h b/block/blk.h
index 296e3010f8d6..c14bca80aba9 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -32,6 +32,7 @@ struct blk_flush_queue {
 };
 
 extern struct kmem_cache *blk_requestq_cachep;
+extern struct kmem_cache *blk_requestq_srcu_cachep;
 extern struct kobj_type blk_queue_ktype;
 extern struct ida blk_queue_ida;
 
@@ -448,7 +449,14 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
 		struct page *page, unsigned int len, unsigned int offset,
 		unsigned int max_sectors, bool *same_page);
 
-struct request_queue *blk_alloc_queue(int node_id);
+static inline struct kmem_cache *blk_get_queue_kmem_cache(bool srcu)
+{
+	if (srcu)
+		return blk_requestq_srcu_cachep;
+	return blk_requestq_cachep;
+}
+
+struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu);
 
 int disk_alloc_events(struct gendisk *disk);
 void disk_add_events(struct gendisk *disk);
diff --git a/block/genhd.c b/block/genhd.c
index c5392cc24d37..e624fe9371f2 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1341,7 +1341,7 @@ struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass)
 	struct request_queue *q;
 	struct gendisk *disk;
 
-	q = blk_alloc_queue(node);
+	q = blk_alloc_queue(node, false);
 	if (!q)
 		return NULL;
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 308edc2a4925..5cc7fc1ea863 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -4,7 +4,6 @@
 
 #include <linux/blkdev.h>
 #include <linux/sbitmap.h>
-#include <linux/srcu.h>
 #include <linux/lockdep.h>
 #include <linux/scatterlist.h>
 #include <linux/prefetch.h>
@@ -376,13 +375,6 @@ struct blk_mq_hw_ctx {
 	 * q->unused_hctx_list.
 	 */
 	struct list_head	hctx_list;
-
-	/**
-	 * @srcu: Sleepable RCU. Use as lock when type of the hardware queue is
-	 * blocking (BLK_MQ_F_BLOCKING). Must be the last member - see also
-	 * blk_mq_hw_ctx_size().
-	 */
-	struct srcu_struct	srcu[];
 };
 
 /**
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index bd4370baccca..5741b46bca6c 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -16,6 +16,7 @@
 #include <linux/percpu-refcount.h>
 #include <linux/blkzoned.h>
 #include <linux/sbitmap.h>
+#include <linux/srcu.h>
 
 struct module;
 struct request_queue;
@@ -364,6 +365,7 @@ struct request_queue {
 #endif
 
 	bool			mq_sysfs_init_done;
+	bool			alloc_srcu;
 
 #define BLK_MAX_WRITE_HINTS	5
 	u64			write_hints[BLK_MAX_WRITE_HINTS];
@@ -373,6 +375,12 @@ struct request_queue {
 	 * devices that do not have multiple independent access ranges.
 	 */
 	struct blk_independent_access_ranges *ia_ranges;
+
+	/**
+	 * @srcu: Sleepable RCU. Use as lock when type of the request queue
+	 * is blocking (BLK_MQ_F_BLOCKING). Must be the last member
+	 */
+	struct srcu_struct	srcu[];
 };
 
 /* Keep blk_queue_flag_name[] in sync with the definitions below */
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 2/5] blk-mq: rename hctx_lock & hctx_unlock
  2021-11-19  2:18 [PATCH 0/5] blk-mq: quiesce improvement Ming Lei
  2021-11-19  2:18 ` [PATCH 1/5] blk-mq: move srcu from blk_mq_hw_ctx to request_queue Ming Lei
@ 2021-11-19  2:18 ` Ming Lei
  2021-11-22  7:53   ` Sagi Grimberg
  2021-11-22  9:00   ` Christoph Hellwig
  2021-11-19  2:18 ` [PATCH 3/5] blk-mq: add helper of blk_mq_global_quiesce_wait() Ming Lei
                   ` (2 subsequent siblings)
  4 siblings, 2 replies; 28+ messages in thread
From: Ming Lei @ 2021-11-19  2:18 UTC (permalink / raw)
  To: Christoph Hellwig, Jens Axboe, Martin K . Petersen
  Cc: Sagi Grimberg, linux-block, linux-nvme, linux-scsi, Keith Busch,
	Ming Lei

We have moved srcu from 'struct blk_mq_hw_ctx' into 'struct request_queue',
so both hctx_lock() and hctx_unlock() now operate at the request queue
level; rename them to queue_lock() and queue_unlock().

And it could be used for supporting Jens's ->queue_rqs(), as suggested
by Keith.

Also it could be extended for driver uses in future.

Cc: Keith Busch <kbusch@kernel.org>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-mq.c | 40 +++++++++++++++++++++++-----------------
 1 file changed, 23 insertions(+), 17 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9728a571b009..ba0d0e411b65 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1076,24 +1076,26 @@ void blk_mq_complete_request(struct request *rq)
 }
 EXPORT_SYMBOL(blk_mq_complete_request);
 
-static inline void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
-	__releases(hctx->srcu)
+static inline void queue_unlock(struct request_queue *q, bool blocking,
+		int srcu_idx)
+	__releases(q->srcu)
 {
-	if (!(hctx->flags & BLK_MQ_F_BLOCKING))
+	if (!blocking)
 		rcu_read_unlock();
 	else
-		srcu_read_unlock(hctx->queue->srcu, srcu_idx);
+		srcu_read_unlock(q->srcu, srcu_idx);
 }
 
-static inline void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
+static inline void queue_lock(struct request_queue *q, bool blocking,
+		int *srcu_idx)
 	__acquires(hctx->srcu)
 {
-	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
+	if (!blocking) {
 		/* shut up gcc false positive */
 		*srcu_idx = 0;
 		rcu_read_lock();
 	} else
-		*srcu_idx = srcu_read_lock(hctx->queue->srcu);
+		*srcu_idx = srcu_read_lock(q->srcu);
 }
 
 /**
@@ -1958,6 +1960,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 {
 	int srcu_idx;
+	bool blocking = hctx->flags & BLK_MQ_F_BLOCKING;
 
 	/*
 	 * We can't run the queue inline with ints disabled. Ensure that
@@ -1965,11 +1968,11 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
 	 */
 	WARN_ON_ONCE(in_interrupt());
 
-	might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
+	might_sleep_if(blocking);
 
-	hctx_lock(hctx, &srcu_idx);
+	queue_lock(hctx->queue, blocking, &srcu_idx);
 	blk_mq_sched_dispatch_requests(hctx);
-	hctx_unlock(hctx, srcu_idx);
+	queue_unlock(hctx->queue, blocking, srcu_idx);
 }
 
 static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
@@ -2083,6 +2086,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 {
 	int srcu_idx;
 	bool need_run;
+	bool blocking = hctx->flags & BLK_MQ_F_BLOCKING;
 
 	/*
 	 * When queue is quiesced, we may be switching io scheduler, or
@@ -2092,10 +2096,10 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
 	 * And queue will be rerun in blk_mq_unquiesce_queue() if it is
 	 * quiesced.
 	 */
-	hctx_lock(hctx, &srcu_idx);
+	queue_lock(hctx->queue, blocking, &srcu_idx);
 	need_run = !blk_queue_quiesced(hctx->queue) &&
 		blk_mq_hctx_has_pending(hctx);
-	hctx_unlock(hctx, srcu_idx);
+	queue_unlock(hctx->queue, blocking, srcu_idx);
 
 	if (need_run)
 		__blk_mq_delay_run_hw_queue(hctx, async, 0);
@@ -2500,10 +2504,11 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 {
 	blk_status_t ret;
 	int srcu_idx;
+	bool blocking = hctx->flags & BLK_MQ_F_BLOCKING;
 
-	might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
+	might_sleep_if(blocking);
 
-	hctx_lock(hctx, &srcu_idx);
+	queue_lock(hctx->queue, blocking, &srcu_idx);
 
 	ret = __blk_mq_try_issue_directly(hctx, rq, false, true);
 	if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
@@ -2511,7 +2516,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 	else if (ret != BLK_STS_OK)
 		blk_mq_end_request(rq, ret);
 
-	hctx_unlock(hctx, srcu_idx);
+	queue_unlock(hctx->queue, blocking, srcu_idx);
 }
 
 static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
@@ -2519,10 +2524,11 @@ static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
 	blk_status_t ret;
 	int srcu_idx;
 	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
+	bool blocking = hctx->flags & BLK_MQ_F_BLOCKING;
 
-	hctx_lock(hctx, &srcu_idx);
+	queue_lock(hctx->queue, blocking, &srcu_idx);
 	ret = __blk_mq_try_issue_directly(hctx, rq, true, last);
-	hctx_unlock(hctx, srcu_idx);
+	queue_unlock(hctx->queue, blocking, srcu_idx);
 
 	return ret;
 }
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 3/5] blk-mq: add helper of blk_mq_global_quiesce_wait()
  2021-11-19  2:18 [PATCH 0/5] blk-mq: quiesce improvement Ming Lei
  2021-11-19  2:18 ` [PATCH 1/5] blk-mq: move srcu from blk_mq_hw_ctx to request_queue Ming Lei
  2021-11-19  2:18 ` [PATCH 2/5] blk-mq: rename hctx_lock & hctx_unlock Ming Lei
@ 2021-11-19  2:18 ` Ming Lei
  2021-11-22  7:56   ` Sagi Grimberg
  2021-11-19  2:18 ` [PATCH 4/5] nvme: quiesce namespace queue in parallel Ming Lei
  2021-11-19  2:18 ` [PATCH 5/5] scsi: use blk-mq quiesce APIs to implement scsi_host_block Ming Lei
  4 siblings, 1 reply; 28+ messages in thread
From: Ming Lei @ 2021-11-19  2:18 UTC (permalink / raw)
  To: Christoph Hellwig, Jens Axboe, Martin K . Petersen
  Cc: Sagi Grimberg, linux-block, linux-nvme, linux-scsi, Keith Busch,
	Ming Lei

Add the helper blk_mq_global_quiesce_wait() to support quiescing queues
in parallel; then we only need to wait once if the global quiesce wait
is allowed.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 include/linux/blk-mq.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 5cc7fc1ea863..a9fecda2507e 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -777,6 +777,19 @@ static inline bool blk_mq_add_to_batch(struct request *req,
 	return true;
 }
 
+/*
+ * If the queue has allocated & used srcu to quiesce queue, quiesce wait is
+ * done via the synchronize_srcu(q->rcu), otherwise it is done via global
+ * synchronize_rcu().
+ *
+ * This helper can help us to support quiescing queue in parallel, so just
+ * one quiesce wait is enough if global quiesce wait is allowed.
+ */
+static inline bool blk_mq_global_quiesce_wait(struct request_queue *q)
+{
+	return !q->alloc_srcu;
+}
+
 void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list);
 void blk_mq_kick_requeue_list(struct request_queue *q);
 void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 4/5] nvme: quiesce namespace queue in parallel
  2021-11-19  2:18 [PATCH 0/5] blk-mq: quiesce improvement Ming Lei
                   ` (2 preceding siblings ...)
  2021-11-19  2:18 ` [PATCH 3/5] blk-mq: add helper of blk_mq_global_quiesce_wait() Ming Lei
@ 2021-11-19  2:18 ` Ming Lei
  2021-11-22  8:07   ` Sagi Grimberg
  2021-11-19  2:18 ` [PATCH 5/5] scsi: use blk-mq quiesce APIs to implement scsi_host_block Ming Lei
  4 siblings, 1 reply; 28+ messages in thread
From: Ming Lei @ 2021-11-19  2:18 UTC (permalink / raw)
  To: Christoph Hellwig, Jens Axboe, Martin K . Petersen
  Cc: Sagi Grimberg, linux-block, linux-nvme, linux-scsi, Keith Busch,
	Ming Lei, Chao Leng

Chao Leng reported that in case of lots of namespaces, it may take quite a
while for nvme_stop_queues() to quiesce all namespace queues.

Improve nvme_stop_queues() by running the quiesce in parallel, and
waiting just once if the global quiesce wait is allowed.

Link: https://lore.kernel.org/linux-block/cc732195-c053-9ce4-e1a7-e7f6dcf762ac@huawei.com/
Reported-by: Chao Leng <lengchao@huawei.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 drivers/nvme/host/core.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 4b5de8f5435a..06741d3ed72b 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4517,9 +4517,7 @@ static void nvme_start_ns_queue(struct nvme_ns *ns)
 static void nvme_stop_ns_queue(struct nvme_ns *ns)
 {
 	if (!test_and_set_bit(NVME_NS_STOPPED, &ns->flags))
-		blk_mq_quiesce_queue(ns->queue);
-	else
-		blk_mq_wait_quiesce_done(ns->queue);
+		blk_mq_quiesce_queue_nowait(ns->queue);
 }
 
 /*
@@ -4620,6 +4618,11 @@ void nvme_stop_queues(struct nvme_ctrl *ctrl)
 	down_read(&ctrl->namespaces_rwsem);
 	list_for_each_entry(ns, &ctrl->namespaces, list)
 		nvme_stop_ns_queue(ns);
+	list_for_each_entry(ns, &ctrl->namespaces, list) {
+		blk_mq_wait_quiesce_done(ns->queue);
+		if (blk_mq_global_quiesce_wait(ns->queue))
+			break;
+	}
 	up_read(&ctrl->namespaces_rwsem);
 }
 EXPORT_SYMBOL_GPL(nvme_stop_queues);
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 5/5] scsi: use blk-mq quiesce APIs to implement scsi_host_block
  2021-11-19  2:18 [PATCH 0/5] blk-mq: quiesce improvement Ming Lei
                   ` (3 preceding siblings ...)
  2021-11-19  2:18 ` [PATCH 4/5] nvme: quiesce namespace queue in parallel Ming Lei
@ 2021-11-19  2:18 ` Ming Lei
  4 siblings, 0 replies; 28+ messages in thread
From: Ming Lei @ 2021-11-19  2:18 UTC (permalink / raw)
  To: Christoph Hellwig, Jens Axboe, Martin K . Petersen
  Cc: Sagi Grimberg, linux-block, linux-nvme, linux-scsi, Keith Busch,
	Ming Lei

scsi_host_block() calls synchronize_rcu() directly to wait for the
quiesce to complete; this is ugly since it exposes blk-mq quiesce's
implementation details.

Instead apply blk_mq_wait_quiesce_done() and
blk_mq_global_quiesce_wait() for scsi_host_block().

Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 drivers/scsi/scsi_lib.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 5e8b5ecb3245..b0da6a4a1784 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -2952,15 +2952,15 @@ scsi_host_block(struct Scsi_Host *shost)
 		}
 	}
 
-	/*
-	 * SCSI never enables blk-mq's BLK_MQ_F_BLOCKING flag so
-	 * calling synchronize_rcu() once is enough.
-	 */
-	WARN_ON_ONCE(shost->tag_set.flags & BLK_MQ_F_BLOCKING);
-
-	if (!ret)
-		synchronize_rcu();
+	if (!ret) {
+		shost_for_each_device(sdev, shost) {
+			struct request_queue *q = sdev->request_queue;
 
+			blk_mq_wait_quiesce_done(q);
+			if (blk_mq_global_quiesce_wait(q))
+				break;
+		}
+	}
 	return ret;
 }
 EXPORT_SYMBOL_GPL(scsi_host_block);
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/5] blk-mq: move srcu from blk_mq_hw_ctx to request_queue
  2021-11-19  2:18 ` [PATCH 1/5] blk-mq: move srcu from blk_mq_hw_ctx to request_queue Ming Lei
@ 2021-11-19  4:30   ` Bart Van Assche
  2021-11-19  8:10     ` Ming Lei
  0 siblings, 1 reply; 28+ messages in thread
From: Bart Van Assche @ 2021-11-19  4:30 UTC (permalink / raw)
  To: Ming Lei, Christoph Hellwig, Jens Axboe, Martin K . Petersen
  Cc: Sagi Grimberg, linux-block, linux-nvme, linux-scsi, Keith Busch

On 11/18/21 18:18, Ming Lei wrote:
> +	bool			alloc_srcu;

I found the following statement multiple times in this patch:

WARN_ON_ONCE(q->alloc_srcu != !!(q->tag_set->flags & BLK_MQ_F_BLOCKING));

Does this mean that the new q->alloc_srcu member variable can be left out
and that it can be replaced with the following test?

q->tag_set->flags & BLK_MQ_F_BLOCKING

Please note that I'm not concerned about the memory occupied by this
variable but only about avoiding redundancy.

If this variable is retained it probably should be renamed, e.g. "has_srcu"
instead of "alloc_srcu".

Thanks,

Bart.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/5] blk-mq: move srcu from blk_mq_hw_ctx to request_queue
  2021-11-19  4:30   ` Bart Van Assche
@ 2021-11-19  8:10     ` Ming Lei
  2021-11-22  7:48       ` Sagi Grimberg
  0 siblings, 1 reply; 28+ messages in thread
From: Ming Lei @ 2021-11-19  8:10 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: Christoph Hellwig, Jens Axboe, Martin K . Petersen,
	Sagi Grimberg, linux-block, linux-nvme, linux-scsi, Keith Busch

On Thu, Nov 18, 2021 at 08:30:49PM -0800, Bart Van Assche wrote:
> On 11/18/21 18:18, Ming Lei wrote:
> > +	bool			alloc_srcu;
> 
> I found the following statement multiple times in this patch:
> 
> WARN_ON_ONCE(q->alloc_srcu != !!(q->tag_set->flags & BLK_MQ_F_BLOCKING));
> 
> Does this mean that the new q->alloc_srcu member variable can be left out
> and that it can be replaced with the following test?
> 
> q->tag_set->flags & BLK_MQ_F_BLOCKING

q->tag_set can't be used anymore after blk_cleanup_queue() returns,
and we need the flag for freeing request_queue instance.

> 
> Please note that I'm not concerned about the memory occupied by this
> variable but only about avoiding redundancy.
> 
> If this variable is retained it probably should be renamed, e.g. "has_srcu"
> instead of "alloc_srcu".

Fine.


Thanks,
Ming


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/5] blk-mq: move srcu from blk_mq_hw_ctx to request_queue
  2021-11-19  8:10     ` Ming Lei
@ 2021-11-22  7:48       ` Sagi Grimberg
  2021-11-22 13:08         ` Ming Lei
  0 siblings, 1 reply; 28+ messages in thread
From: Sagi Grimberg @ 2021-11-22  7:48 UTC (permalink / raw)
  To: Ming Lei, Bart Van Assche
  Cc: Christoph Hellwig, Jens Axboe, Martin K . Petersen, linux-block,
	linux-nvme, linux-scsi, Keith Busch


>>> +	bool			alloc_srcu;
>>
>> I found the following statement multiple times in this patch:
>>
>> WARN_ON_ONCE(q->alloc_srcu != !!(q->tag_set->flags & BLK_MQ_F_BLOCKING));
>>
>> Does this mean that the new q->alloc_srcu member variable can be left out
>> and that it can be replaced with the following test?
>>
>> q->tag_set->flags & BLK_MQ_F_BLOCKING
> 
> q->tag_set can't be used anymore after blk_cleanup_queue() returns,
> and we need the flag for freeing request_queue instance.

Why not just look at the queue->srcu pointer? it is allocated only
for BLK_MQ_F_BLOCKING no?

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/5] blk-mq: rename hctx_lock & hctx_unlock
  2021-11-19  2:18 ` [PATCH 2/5] blk-mq: rename hctx_lock & hctx_unlock Ming Lei
@ 2021-11-22  7:53   ` Sagi Grimberg
  2021-11-22 13:20     ` Ming Lei
  2021-11-22  9:00   ` Christoph Hellwig
  1 sibling, 1 reply; 28+ messages in thread
From: Sagi Grimberg @ 2021-11-22  7:53 UTC (permalink / raw)
  To: Ming Lei, Christoph Hellwig, Jens Axboe, Martin K . Petersen
  Cc: linux-block, linux-nvme, linux-scsi, Keith Busch


> -static inline void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
> -	__releases(hctx->srcu)
> +static inline void queue_unlock(struct request_queue *q, bool blocking,
> +		int srcu_idx)
> +	__releases(q->srcu)
>   {
> -	if (!(hctx->flags & BLK_MQ_F_BLOCKING))
> +	if (!blocking)
>   		rcu_read_unlock();
>   	else
> -		srcu_read_unlock(hctx->queue->srcu, srcu_idx);
> +		srcu_read_unlock(q->srcu, srcu_idx);

Maybe instead of passing blocking bool just look at srcu_idx?

	if (srcu_idx < 0)
		rcu_read_unlock();
	else
		srcu_read_unlock(q->srcu, srcu_idx);

Or look if the queue has srcu allocated?

	if (!q->srcu)
		rcu_read_unlock();
	else
		srcu_read_unlock(q->srcu, srcu_idx);

>   }
>   
> -static inline void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
> +static inline void queue_lock(struct request_queue *q, bool blocking,
> +		int *srcu_idx)
>   	__acquires(hctx->srcu)
>   {
> -	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
> +	if (!blocking) {
>   		/* shut up gcc false positive */
>   		*srcu_idx = 0;
>   		rcu_read_lock();
>   	} else
> -		*srcu_idx = srcu_read_lock(hctx->queue->srcu);
> +		*srcu_idx = srcu_read_lock(q->srcu);

Same here:
	
	if (!q->srcu)
		rcu_read_lock();
	else
		srcu_idx = srcu_read_lock(q->srcu);

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 3/5] blk-mq: add helper of blk_mq_global_quiesce_wait()
  2021-11-19  2:18 ` [PATCH 3/5] blk-mq: add helper of blk_mq_global_quiesce_wait() Ming Lei
@ 2021-11-22  7:56   ` Sagi Grimberg
  2021-11-22  8:00     ` Sagi Grimberg
  2021-11-22 13:26     ` Ming Lei
  0 siblings, 2 replies; 28+ messages in thread
From: Sagi Grimberg @ 2021-11-22  7:56 UTC (permalink / raw)
  To: Ming Lei, Christoph Hellwig, Jens Axboe, Martin K . Petersen
  Cc: linux-block, linux-nvme, linux-scsi, Keith Busch


> Add helper of blk_mq_global_quiesce_wait() for supporting to quiesce
> queues in parallel, then we can just wait once if global quiesce wait
> is allowed.

blk_mq_global_quiesce_wait() is a poor name... global is scope-less and
obviously it has a scope.


> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
>   include/linux/blk-mq.h | 13 +++++++++++++
>   1 file changed, 13 insertions(+)
> 
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index 5cc7fc1ea863..a9fecda2507e 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -777,6 +777,19 @@ static inline bool blk_mq_add_to_batch(struct request *req,
>   	return true;
>   }
>   
> +/*
> + * If the queue has allocated & used srcu to quiesce queue, quiesce wait is
> + * done via the synchronize_srcu(q->rcu), otherwise it is done via global
> + * synchronize_rcu().
> + *
> + * This helper can help us to support quiescing queue in parallel, so just
> + * one quiesce wait is enough if global quiesce wait is allowed.
> + */
> +static inline bool blk_mq_global_quiesce_wait(struct request_queue *q)
> +{
> +	return !q->alloc_srcu;
> +}
> +
>   void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list);
>   void blk_mq_kick_requeue_list(struct request_queue *q);
>   void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
> 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 3/5] blk-mq: add helper of blk_mq_global_quiesce_wait()
  2021-11-22  7:56   ` Sagi Grimberg
@ 2021-11-22  8:00     ` Sagi Grimberg
  2021-11-22 13:26     ` Ming Lei
  1 sibling, 0 replies; 28+ messages in thread
From: Sagi Grimberg @ 2021-11-22  8:00 UTC (permalink / raw)
  To: Ming Lei, Christoph Hellwig, Jens Axboe, Martin K . Petersen
  Cc: linux-block, linux-nvme, linux-scsi, Keith Busch



On 11/22/21 9:56 AM, Sagi Grimberg wrote:
> 
>> Add helper of blk_mq_global_quiesce_wait() for supporting to quiesce
>> queues in parallel, then we can just wait once if global quiesce wait
>> is allowed.
> 
> blk_mq_global_quiesce_wait() is a poor name... global is scope-less and
> obviously it has a scope.
> 
> 
>> Signed-off-by: Ming Lei <ming.lei@redhat.com>
>> ---
>>   include/linux/blk-mq.h | 13 +++++++++++++
>>   1 file changed, 13 insertions(+)
>>
>> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
>> index 5cc7fc1ea863..a9fecda2507e 100644
>> --- a/include/linux/blk-mq.h
>> +++ b/include/linux/blk-mq.h
>> @@ -777,6 +777,19 @@ static inline bool blk_mq_add_to_batch(struct 
>> request *req,
>>       return true;
>>   }
>> +/*
>> + * If the queue has allocated & used srcu to quiesce queue, quiesce 
>> wait is
>> + * done via the synchronize_srcu(q->rcu), otherwise it is done via 
>> global
>> + * synchronize_rcu().
>> + *
>> + * This helper can help us to support quiescing queue in parallel, so 
>> just
>> + * one quiesce wait is enough if global quiesce wait is allowed.
>> + */
>> +static inline bool blk_mq_global_quiesce_wait(struct request_queue *q)
>> +{
>> +    return !q->alloc_srcu;

	return !q->srcu ?

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 4/5] nvme: quiesce namespace queue in parallel
  2021-11-19  2:18 ` [PATCH 4/5] nvme: quiesce namespace queue in parallel Ming Lei
@ 2021-11-22  8:07   ` Sagi Grimberg
  2021-11-23  0:13     ` Ming Lei
  0 siblings, 1 reply; 28+ messages in thread
From: Sagi Grimberg @ 2021-11-22  8:07 UTC (permalink / raw)
  To: Ming Lei, Christoph Hellwig, Jens Axboe, Martin K . Petersen
  Cc: linux-block, linux-nvme, linux-scsi, Keith Busch, Chao Leng



On 11/19/21 4:18 AM, Ming Lei wrote:
> Chao Leng reported that in case of lots of namespaces, it may take quite a
> while for nvme_stop_queues() to quiesce all namespace queues.
> 
> Improve nvme_stop_queues() by running quiesce in parallel, and just wait
> once if global quiesce wait is allowed.
> 
> Link: https://lore.kernel.org/linux-block/cc732195-c053-9ce4-e1a7-e7f6dcf762ac@huawei.com/
> Reported-by: Chao Leng <lengchao@huawei.com>
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
>   drivers/nvme/host/core.c | 9 ++++++---
>   1 file changed, 6 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 4b5de8f5435a..06741d3ed72b 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -4517,9 +4517,7 @@ static void nvme_start_ns_queue(struct nvme_ns *ns)
>   static void nvme_stop_ns_queue(struct nvme_ns *ns)
>   {
>   	if (!test_and_set_bit(NVME_NS_STOPPED, &ns->flags))
> -		blk_mq_quiesce_queue(ns->queue);
> -	else
> -		blk_mq_wait_quiesce_done(ns->queue);
> +		blk_mq_quiesce_queue_nowait(ns->queue);
>   }
>   
>   /*
> @@ -4620,6 +4618,11 @@ void nvme_stop_queues(struct nvme_ctrl *ctrl)
>   	down_read(&ctrl->namespaces_rwsem);
>   	list_for_each_entry(ns, &ctrl->namespaces, list)
>   		nvme_stop_ns_queue(ns);
> +	list_for_each_entry(ns, &ctrl->namespaces, list) {
> +		blk_mq_wait_quiesce_done(ns->queue);
> +		if (blk_mq_global_quiesce_wait(ns->queue))
> +			break;
> +	}
>   	up_read(&ctrl->namespaces_rwsem);


Can you quantify how much of a difference it is to do rcu_read_unlock()
for every queue? The big improvement here is that it is done in parallel
instead of serially. Just wondering if it is worth the less than elegant
interface...

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/5] blk-mq: rename hctx_lock & hctx_unlock
  2021-11-19  2:18 ` [PATCH 2/5] blk-mq: rename hctx_lock & hctx_unlock Ming Lei
  2021-11-22  7:53   ` Sagi Grimberg
@ 2021-11-22  9:00   ` Christoph Hellwig
  1 sibling, 0 replies; 28+ messages in thread
From: Christoph Hellwig @ 2021-11-22  9:00 UTC (permalink / raw)
  To: Ming Lei
  Cc: Christoph Hellwig, Jens Axboe, Martin K . Petersen,
	Sagi Grimberg, linux-block, linux-nvme, linux-scsi, Keith Busch

On Fri, Nov 19, 2021 at 10:18:46AM +0800, Ming Lei wrote:
> +static inline void queue_unlock(struct request_queue *q, bool blocking,
> +		int srcu_idx)

I don't think this is a good name, as it can be easily confused with
q->queue_lock.

> +	__releases(q->srcu)
>  {
> -	if (!(hctx->flags & BLK_MQ_F_BLOCKING))
> +	if (!blocking)
>  		rcu_read_unlock();
>  	else
> -		srcu_read_unlock(hctx->queue->srcu, srcu_idx);
> +		srcu_read_unlock(q->srcu, srcu_idx);
>  }

I think you want to make BLK_MQ_F_BLOCKING accessible from the
request_queue instead of passing the extra argument as well.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/5] blk-mq: move srcu from blk_mq_hw_ctx to request_queue
  2021-11-22  7:48       ` Sagi Grimberg
@ 2021-11-22 13:08         ` Ming Lei
  2021-11-22 13:47           ` Sagi Grimberg
  0 siblings, 1 reply; 28+ messages in thread
From: Ming Lei @ 2021-11-22 13:08 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Bart Van Assche, Christoph Hellwig, Jens Axboe,
	Martin K . Petersen, linux-block, linux-nvme, linux-scsi,
	Keith Busch

On Mon, Nov 22, 2021 at 09:48:35AM +0200, Sagi Grimberg wrote:
> 
> > > > +	bool			alloc_srcu;
> > > 
> > > I found the following statement multiple times in this patch:
> > > 
> > > WARN_ON_ONCE(q->alloc_srcu != !!(q->tag_set->flags & BLK_MQ_F_BLOCKING));
> > > 
> > > Does this mean that the new q->alloc_srcu member variable can be left out
> > > and that it can be replaced with the following test?
> > > 
> > > q->tag_set->flags & BLK_MQ_F_BLOCKING
> > 
> > q->tag_set can't be used anymore after blk_cleanup_queue() returns,
> > and we need the flag for freeing request_queue instance.
> 
> Why not just look at the queue->srcu pointer? it is allocated only
> for BLK_MQ_F_BLOCKING no?

Yeah, we can add one extra srcu pointer to request queue, but this way
needs one extra fetch to q->srcu in fast path compared with current
code base, so io_uring workload may be affected a bit.

Thanks,
Ming


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/5] blk-mq: rename hctx_lock & hctx_unlock
  2021-11-22  7:53   ` Sagi Grimberg
@ 2021-11-22 13:20     ` Ming Lei
  2021-11-22 13:50       ` Sagi Grimberg
  0 siblings, 1 reply; 28+ messages in thread
From: Ming Lei @ 2021-11-22 13:20 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Christoph Hellwig, Jens Axboe, Martin K . Petersen, linux-block,
	linux-nvme, linux-scsi, Keith Busch

On Mon, Nov 22, 2021 at 09:53:53AM +0200, Sagi Grimberg wrote:
> 
> > -static inline void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
> > -	__releases(hctx->srcu)
> > +static inline void queue_unlock(struct request_queue *q, bool blocking,
> > +		int srcu_idx)
> > +	__releases(q->srcu)
> >   {
> > -	if (!(hctx->flags & BLK_MQ_F_BLOCKING))
> > +	if (!blocking)
> >   		rcu_read_unlock();
> >   	else
> > -		srcu_read_unlock(hctx->queue->srcu, srcu_idx);
> > +		srcu_read_unlock(q->srcu, srcu_idx);
> 
> Maybe instead of passing blocking bool just look at srcu_idx?
> 
> 	if (srcu_idx < 0)
> 		rcu_read_unlock();
> 	else
> 		srcu_read_unlock(q->srcu, srcu_idx);

This way needs to initialize srcu_idx in each callers.

> 
> Or look if the queue has srcu allocated?
> 
> 	if (!q->srcu)
> 		rcu_read_unlock();
> 	else
> 		srcu_read_unlock(q->srcu, srcu_idx);

This way is worse since q->srcu may involve one new cacheline fetch.

hctx->flags is always hot, so it is basically zero cost to check it.


Thanks,
Ming


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 3/5] blk-mq: add helper of blk_mq_global_quiesce_wait()
  2021-11-22  7:56   ` Sagi Grimberg
  2021-11-22  8:00     ` Sagi Grimberg
@ 2021-11-22 13:26     ` Ming Lei
  2021-11-22 13:55       ` Sagi Grimberg
  1 sibling, 1 reply; 28+ messages in thread
From: Ming Lei @ 2021-11-22 13:26 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Christoph Hellwig, Jens Axboe, Martin K . Petersen, linux-block,
	linux-nvme, linux-scsi, Keith Busch

On Mon, Nov 22, 2021 at 09:56:10AM +0200, Sagi Grimberg wrote:
> 
> > Add helper of blk_mq_global_quiesce_wait() for supporting to quiesce
> > queues in parallel, then we can just wait once if global quiesce wait
> > is allowed.
> 
> blk_mq_global_quiesce_wait() is a poor name... global is scope-less and
> obviously it has a scope.

How about blk_mq_shared_quiesce_wait()? or any suggestion?


Thanks,
Ming


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 1/5] blk-mq: move srcu from blk_mq_hw_ctx to request_queue
  2021-11-22 13:08         ` Ming Lei
@ 2021-11-22 13:47           ` Sagi Grimberg
  0 siblings, 0 replies; 28+ messages in thread
From: Sagi Grimberg @ 2021-11-22 13:47 UTC (permalink / raw)
  To: Ming Lei
  Cc: Bart Van Assche, Christoph Hellwig, Jens Axboe,
	Martin K . Petersen, linux-block, linux-nvme, linux-scsi,
	Keith Busch


>>>>> +	bool			alloc_srcu;
>>>>
>>>> I found the following statement multiple times in this patch:
>>>>
>>>> WARN_ON_ONCE(q->alloc_srcu != !!(q->tag_set->flags & BLK_MQ_F_BLOCKING));
>>>>
>>>> Does this mean that the new q->alloc_srcu member variable can be left out
>>>> and that it can be replaced with the following test?
>>>>
>>>> q->tag_set->flags & BLK_MQ_F_BLOCKING
>>>
>>> q->tag_set can't be used anymore after blk_cleanup_queue() returns,
>>> and we need the flag for freeing request_queue instance.
>>
>> Why not just look at the queue->srcu pointer? it is allocated only
>> for BLK_MQ_F_BLOCKING no?
> 
> Yeah, we can add one extra srcu pointer to request queue, but this way
> needs one extra fetch to q->srcu in fast path compared with current
> code base, so io_uring workload may be affected a bit.

Yea you're right. We should at some point make has_srcu and
mq_sysfs_init_done bits in a flags member, but we have like 6 more
before we need to do it...

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/5] blk-mq: rename hctx_lock & hctx_unlock
  2021-11-22 13:20     ` Ming Lei
@ 2021-11-22 13:50       ` Sagi Grimberg
  2021-11-23  0:08         ` Ming Lei
  0 siblings, 1 reply; 28+ messages in thread
From: Sagi Grimberg @ 2021-11-22 13:50 UTC (permalink / raw)
  To: Ming Lei
  Cc: Christoph Hellwig, Jens Axboe, Martin K . Petersen, linux-block,
	linux-nvme, linux-scsi, Keith Busch



On 11/22/21 3:20 PM, Ming Lei wrote:
> On Mon, Nov 22, 2021 at 09:53:53AM +0200, Sagi Grimberg wrote:
>>
>>> -static inline void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
>>> -	__releases(hctx->srcu)
>>> +static inline void queue_unlock(struct request_queue *q, bool blocking,
>>> +		int srcu_idx)
>>> +	__releases(q->srcu)
>>>    {
>>> -	if (!(hctx->flags & BLK_MQ_F_BLOCKING))
>>> +	if (!blocking)
>>>    		rcu_read_unlock();
>>>    	else
>>> -		srcu_read_unlock(hctx->queue->srcu, srcu_idx);
>>> +		srcu_read_unlock(q->srcu, srcu_idx);
>>
>> Maybe instead of passing blocking bool just look at srcu_idx?
>>
>> 	if (srcu_idx < 0)
>> 		rcu_read_unlock();
>> 	else
>> 		srcu_read_unlock(q->srcu, srcu_idx);
> 
> This way needs to initialize srcu_idx in each callers.

Then look at q->has_srcu that Bart suggested?

> 
>>
>> Or look if the queue has srcu allocated?
>>
>> 	if (!q->srcu)
>> 		rcu_read_unlock();
>> 	else
>> 		srcu_read_unlock(q->srcu, srcu_idx);
> 
> This way is worse since q->srcu may involve one new cacheline fetch.
> 
> hctx->flags is always hot, so it is basically zero cost to check it.

Yea, but the interface is awkward that the caller tells the
routine how it should lock/unlock...

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 3/5] blk-mq: add helper of blk_mq_global_quiesce_wait()
  2021-11-22 13:26     ` Ming Lei
@ 2021-11-22 13:55       ` Sagi Grimberg
  2021-11-23  0:17         ` Ming Lei
  0 siblings, 1 reply; 28+ messages in thread
From: Sagi Grimberg @ 2021-11-22 13:55 UTC (permalink / raw)
  To: Ming Lei
  Cc: Christoph Hellwig, Jens Axboe, Martin K . Petersen, linux-block,
	linux-nvme, linux-scsi, Keith Busch


>>> Add helper of blk_mq_global_quiesce_wait() for supporting to quiesce
>>> queues in parallel, then we can just wait once if global quiesce wait
>>> is allowed.
>>
>> blk_mq_global_quiesce_wait() is a poor name... global is scope-less and
>> obviously it has a scope.
> 
> How about blk_mq_shared_quiesce_wait()? or any suggestion?

Shared between what?

Maybe if the queue has a non-blocking tagset, it can have a "quiesced"
flag that is cleared in unquiesce? then the callers can just continue
to iterate but will only wait the rcu grace period once.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/5] blk-mq: rename hctx_lock & hctx_unlock
  2021-11-22 13:50       ` Sagi Grimberg
@ 2021-11-23  0:08         ` Ming Lei
  2021-11-23  8:54           ` Sagi Grimberg
  0 siblings, 1 reply; 28+ messages in thread
From: Ming Lei @ 2021-11-23  0:08 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Christoph Hellwig, Jens Axboe, Martin K . Petersen, linux-block,
	linux-nvme, linux-scsi, Keith Busch

On Mon, Nov 22, 2021 at 03:50:14PM +0200, Sagi Grimberg wrote:
> 
> 
> On 11/22/21 3:20 PM, Ming Lei wrote:
> > On Mon, Nov 22, 2021 at 09:53:53AM +0200, Sagi Grimberg wrote:
> > > 
> > > > -static inline void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
> > > > -	__releases(hctx->srcu)
> > > > +static inline void queue_unlock(struct request_queue *q, bool blocking,
> > > > +		int srcu_idx)
> > > > +	__releases(q->srcu)
> > > >    {
> > > > -	if (!(hctx->flags & BLK_MQ_F_BLOCKING))
> > > > +	if (!blocking)
> > > >    		rcu_read_unlock();
> > > >    	else
> > > > -		srcu_read_unlock(hctx->queue->srcu, srcu_idx);
> > > > +		srcu_read_unlock(q->srcu, srcu_idx);
> > > 
> > > Maybe instead of passing blocking bool just look at srcu_idx?
> > > 
> > > 	if (srcu_idx < 0)
> > > 		rcu_read_unlock();
> > > 	else
> > > 		srcu_read_unlock(q->srcu, srcu_idx);
> > 
> > This way needs to initialize srcu_idx in each callers.
> 
> Then look at q->has_srcu that Bart suggested?

Bart just suggested to rename q->alloc_srcu as q->has_srcu.

> 
> > 
> > > 
> > > Or look if the queue has srcu allocated?
> > > 
> > > 	if (!q->srcu)
> > > 		rcu_read_unlock();
> > > 	else
> > > 		srcu_read_unlock(q->srcu, srcu_idx);
> > 
> > This way is worse since q->srcu may involve one new cacheline fetch.
> > 
> > hctx->flags is always hot, so it is basically zero cost to check it.
> 
> Yea, but the interface is awkward that the caller tells the
> routine how it should lock/unlock...

If the two helpers are just blk-mq internal, I think it is fine to keep
this way with comment.

If driver needs the two exported, they should be often used in slow path, then
it is fine to refine the interface type.


Thanks,
Ming


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 4/5] nvme: quiesce namespace queue in parallel
  2021-11-22  8:07   ` Sagi Grimberg
@ 2021-11-23  0:13     ` Ming Lei
  0 siblings, 0 replies; 28+ messages in thread
From: Ming Lei @ 2021-11-23  0:13 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Christoph Hellwig, Jens Axboe, Martin K . Petersen, linux-block,
	linux-nvme, linux-scsi, Keith Busch, Chao Leng

On Mon, Nov 22, 2021 at 10:07:24AM +0200, Sagi Grimberg wrote:
> 
> 
> On 11/19/21 4:18 AM, Ming Lei wrote:
> > Chao Leng reported that in case of lots of namespaces, it may take quite a
> > while for nvme_stop_queues() to quiesce all namespace queues.
> > 
> > Improve nvme_stop_queues() by running quiesce in parallel, and just wait
> > once if global quiesce wait is allowed.
> > 
> > Link: https://lore.kernel.org/linux-block/cc732195-c053-9ce4-e1a7-e7f6dcf762ac@huawei.com/
> > Reported-by: Chao Leng <lengchao@huawei.com>
> > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> > ---
> >   drivers/nvme/host/core.c | 9 ++++++---
> >   1 file changed, 6 insertions(+), 3 deletions(-)
> > 
> > diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> > index 4b5de8f5435a..06741d3ed72b 100644
> > --- a/drivers/nvme/host/core.c
> > +++ b/drivers/nvme/host/core.c
> > @@ -4517,9 +4517,7 @@ static void nvme_start_ns_queue(struct nvme_ns *ns)
> >   static void nvme_stop_ns_queue(struct nvme_ns *ns)
> >   {
> >   	if (!test_and_set_bit(NVME_NS_STOPPED, &ns->flags))
> > -		blk_mq_quiesce_queue(ns->queue);
> > -	else
> > -		blk_mq_wait_quiesce_done(ns->queue);
> > +		blk_mq_quiesce_queue_nowait(ns->queue);
> >   }
> >   /*
> > @@ -4620,6 +4618,11 @@ void nvme_stop_queues(struct nvme_ctrl *ctrl)
> >   	down_read(&ctrl->namespaces_rwsem);
> >   	list_for_each_entry(ns, &ctrl->namespaces, list)
> >   		nvme_stop_ns_queue(ns);
> > +	list_for_each_entry(ns, &ctrl->namespaces, list) {
> > +		blk_mq_wait_quiesce_done(ns->queue);
> > +		if (blk_mq_global_quiesce_wait(ns->queue))
> > +			break;
> > +	}
> >   	up_read(&ctrl->namespaces_rwsem);
> 
> 
> Can you quantify how much of a difference it is to do rcu_read_unlock()
> for every queue? The big improvement here is that it is done in parallel
> instead of serially. Just wondering if it is worth the less than elegant
> interface...

The biggest improvement is N * synchronize_rcu() -> 1 * synchronize_rcu()
in case of non blocking, that is what Chao Leng complained before.

Even for blocking case, the parallel quiesce is still good, such as, when
one synchronize_srcu() is done, other srcu syncs should be done usually too.


thanks, 
Ming


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 3/5] blk-mq: add helper of blk_mq_global_quiesce_wait()
  2021-11-22 13:55       ` Sagi Grimberg
@ 2021-11-23  0:17         ` Ming Lei
  2021-11-23  9:00           ` Sagi Grimberg
  0 siblings, 1 reply; 28+ messages in thread
From: Ming Lei @ 2021-11-23  0:17 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Christoph Hellwig, Jens Axboe, Martin K . Petersen, linux-block,
	linux-nvme, linux-scsi, Keith Busch

On Mon, Nov 22, 2021 at 03:55:36PM +0200, Sagi Grimberg wrote:
> 
> > > > Add helper of blk_mq_global_quiesce_wait() for supporting to quiesce
> > > > queues in parallel, then we can just wait once if global quiesce wait
> > > > is allowed.
> > > 
> > > blk_mq_global_quiesce_wait() is a poor name... global is scope-less and
> > > obviously it has a scope.
> > 
> > How about blk_mq_shared_quiesce_wait()? or any suggestion?
> 
> Shared between what?

All request queues in one host-wide, both scsi and nvme has such
requirement.

> 
> Maybe if the queue has a non-blocking tagset, it can have a "quiesced"
> flag that is cleared in unquiesce? then the callers can just continue
> to iterate but will only wait the rcu grace period once.

Yeah, that is what these patches try to implement.


Thanks
Ming


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/5] blk-mq: rename hctx_lock & hctx_unlock
  2021-11-23  0:08         ` Ming Lei
@ 2021-11-23  8:54           ` Sagi Grimberg
  0 siblings, 0 replies; 28+ messages in thread
From: Sagi Grimberg @ 2021-11-23  8:54 UTC (permalink / raw)
  To: Ming Lei
  Cc: Christoph Hellwig, Jens Axboe, Martin K . Petersen, linux-block,
	linux-nvme, linux-scsi, Keith Busch


>> Then look at q->has_srcu that Bart suggested?
> 
> Bart just suggested to rename q->alloc_srcu as q->has_srcu.

Yea, is there a problem using that instead of having callers
pass a flag?

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 3/5] blk-mq: add helper of blk_mq_global_quiesce_wait()
  2021-11-23  0:17         ` Ming Lei
@ 2021-11-23  9:00           ` Sagi Grimberg
  2021-11-30  2:33             ` Ming Lei
  0 siblings, 1 reply; 28+ messages in thread
From: Sagi Grimberg @ 2021-11-23  9:00 UTC (permalink / raw)
  To: Ming Lei
  Cc: Christoph Hellwig, Jens Axboe, Martin K . Petersen, linux-block,
	linux-nvme, linux-scsi, Keith Busch


>>>>> Add helper of blk_mq_global_quiesce_wait() for supporting to quiesce
>>>>> queues in parallel, then we can just wait once if global quiesce wait
>>>>> is allowed.
>>>>
>>>> blk_mq_global_quiesce_wait() is a poor name... global is scope-less and
>>>> obviously it has a scope.
>>>
>>> How about blk_mq_shared_quiesce_wait()? or any suggestion?
>>
>> Shared between what?
> 
> All request queues in one host-wide, both scsi and nvme has such
> requirement.
> 
>>
>> Maybe if the queue has a non-blocking tagset, it can have a "quiesced"
>> flag that is cleared in unquiesce? then the callers can just continue
>> to iterate but will only wait the rcu grace period once.
> 
> Yeah, that is what these patches try to implement.

I was suggesting to "hide" it in the interface.
Maybe something like:
--
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 8799fa73ef34..627b631db1f9 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -263,14 +263,18 @@ void blk_mq_wait_quiesce_done(struct request_queue *q)
         unsigned int i;
         bool rcu = false;

+       if (!q->has_srcu && q->quiesced)
+               return;
         queue_for_each_hw_ctx(q, hctx, i) {
                 if (hctx->flags & BLK_MQ_F_BLOCKING)
                         synchronize_srcu(hctx->srcu);
                 else
                         rcu = true;
         }
-       if (rcu)
+       if (rcu) {
                 synchronize_rcu();
+               q->quiesced = true;
+       }
  }
  EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done);

@@ -308,6 +312,7 @@ void blk_mq_unquiesce_queue(struct request_queue *q)
         } else if (!--q->quiesce_depth) {
                 blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
                 run_queue = true;
+               q->quiesced = false;
         }
         spin_unlock_irqrestore(&q->queue_lock, flags);
--

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH 3/5] blk-mq: add helper of blk_mq_global_quiesce_wait()
  2021-11-23  9:00           ` Sagi Grimberg
@ 2021-11-30  2:33             ` Ming Lei
  2021-12-08 12:49               ` Sagi Grimberg
  0 siblings, 1 reply; 28+ messages in thread
From: Ming Lei @ 2021-11-30  2:33 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Christoph Hellwig, Jens Axboe, Martin K . Petersen, linux-block,
	linux-nvme, linux-scsi, Keith Busch

On Tue, Nov 23, 2021 at 11:00:45AM +0200, Sagi Grimberg wrote:
> 
> > > > > > Add helper of blk_mq_global_quiesce_wait() for supporting to quiesce
> > > > > > queues in parallel, then we can just wait once if global quiesce wait
> > > > > > is allowed.
> > > > > 
> > > > > blk_mq_global_quiesce_wait() is a poor name... global is scope-less and
> > > > > obviously it has a scope.
> > > > 
> > > > How about blk_mq_shared_quiesce_wait()? or any suggestion?
> > > 
> > > Shared between what?
> > 
> > All request queues in one host-wide, both scsi and nvme has such
> > requirement.
> > 
> > > 
> > > Maybe if the queue has a non-blocking tagset, it can have a "quiesced"
> > > flag that is cleared in unquiesce? then the callers can just continue
> > > to iterate but will only wait the rcu grace period once.
> > 
> > Yeah, that is what these patches try to implement.
> 
> I was suggesting to "hide" it in the interface.
> Maybe something like:
> --
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 8799fa73ef34..627b631db1f9 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -263,14 +263,18 @@ void blk_mq_wait_quiesce_done(struct request_queue *q)
>         unsigned int i;
>         bool rcu = false;
> 
> +       if (!q->has_srcu && q->quiesced)
> +               return;
>         queue_for_each_hw_ctx(q, hctx, i) {
>                 if (hctx->flags & BLK_MQ_F_BLOCKING)
>                         synchronize_srcu(hctx->srcu);
>                 else
>                         rcu = true;
>         }
> -       if (rcu)
> +       if (rcu) {
>                 synchronize_rcu();
> +               q->quiesced = true;
> +       }
>  }
>  EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done);
> 
> @@ -308,6 +312,7 @@ void blk_mq_unquiesce_queue(struct request_queue *q)
>         } else if (!--q->quiesce_depth) {
>                 blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
>                 run_queue = true;
> +               q->quiesced = false;

Different request queues are passed to blk_mq_wait_quiesce_done() during
the iteration, so marking 'quiesced' doesn't make any difference here.


Thanks,
Ming


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 3/5] blk-mq: add helper of blk_mq_global_quiesce_wait()
  2021-11-30  2:33             ` Ming Lei
@ 2021-12-08 12:49               ` Sagi Grimberg
  2021-12-10  2:02                 ` Ming Lei
  0 siblings, 1 reply; 28+ messages in thread
From: Sagi Grimberg @ 2021-12-08 12:49 UTC (permalink / raw)
  To: Ming Lei
  Cc: Christoph Hellwig, Jens Axboe, Martin K . Petersen, linux-block,
	linux-nvme, linux-scsi, Keith Busch



On 11/30/21 4:33 AM, Ming Lei wrote:
> On Tue, Nov 23, 2021 at 11:00:45AM +0200, Sagi Grimberg wrote:
>>
>>>>>>> Add helper of blk_mq_global_quiesce_wait() for supporting to quiesce
>>>>>>> queues in parallel, then we can just wait once if global quiesce wait
>>>>>>> is allowed.
>>>>>>
>>>>>> blk_mq_global_quiesce_wait() is a poor name... global is scope-less and
>>>>>> obviously it has a scope.
>>>>>
>>>>> How about blk_mq_shared_quiesce_wait()? or any suggestion?
>>>>
>>>> Shared between what?
>>>
>>> All request queues in one host-wide, both scsi and nvme has such
>>> requirement.
>>>
>>>>
>>>> Maybe if the queue has a non-blocking tagset, it can have a "quiesced"
>>>> flag that is cleared in unquiesce? then the callers can just continue
>>>> to iterate but will only wait the rcu grace period once.
>>>
>>> Yeah, that is what these patches try to implement.
>>
>> I was suggesting to "hide" it in the interface.
>> Maybe something like:
>> --
>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>> index 8799fa73ef34..627b631db1f9 100644
>> --- a/block/blk-mq.c
>> +++ b/block/blk-mq.c
>> @@ -263,14 +263,18 @@ void blk_mq_wait_quiesce_done(struct request_queue *q)
>>          unsigned int i;
>>          bool rcu = false;
>>
>> +       if (!q->has_srcu && q->quiesced)
>> +               return;
>>          queue_for_each_hw_ctx(q, hctx, i) {
>>                  if (hctx->flags & BLK_MQ_F_BLOCKING)
>>                          synchronize_srcu(hctx->srcu);
>>                  else
>>                          rcu = true;
>>          }
>> -       if (rcu)
>> +       if (rcu) {
>>                  synchronize_rcu();
>> +               q->quiesced = true;
>> +       }
>>   }
>>   EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done);
>>
>> @@ -308,6 +312,7 @@ void blk_mq_unquiesce_queue(struct request_queue *q)
>>          } else if (!--q->quiesce_depth) {
>>                  blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
>>                  run_queue = true;
>> +               q->quiesced = false;
> 
> Different request queues are passed to blk_mq_wait_quiesce_done() during
> the iteration, so marking 'quiesced' doesn't make any difference here.

I actually meant q->tag_set->quiesced, such that the flag will be
used in the tag_set reference. This way this sharing will be kept
hidden from the callers.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 3/5] blk-mq: add helper of blk_mq_global_quiesce_wait()
  2021-12-08 12:49               ` Sagi Grimberg
@ 2021-12-10  2:02                 ` Ming Lei
  0 siblings, 0 replies; 28+ messages in thread
From: Ming Lei @ 2021-12-10  2:02 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Christoph Hellwig, Jens Axboe, Martin K . Petersen, linux-block,
	linux-nvme, linux-scsi, Keith Busch

On Wed, Dec 08, 2021 at 02:49:25PM +0200, Sagi Grimberg wrote:
> 
> 
> On 11/30/21 4:33 AM, Ming Lei wrote:
> > On Tue, Nov 23, 2021 at 11:00:45AM +0200, Sagi Grimberg wrote:
> > > 
> > > > > > > > Add helper of blk_mq_global_quiesce_wait() for supporting to quiesce
> > > > > > > > queues in parallel, then we can just wait once if global quiesce wait
> > > > > > > > is allowed.
> > > > > > > 
> > > > > > > blk_mq_global_quiesce_wait() is a poor name... global is scope-less and
> > > > > > > obviously it has a scope.
> > > > > > 
> > > > > > How about blk_mq_shared_quiesce_wait()? or any suggestion?
> > > > > 
> > > > > Shared between what?
> > > > 
> > > > All request queues in one host-wide, both scsi and nvme has such
> > > > requirement.
> > > > 
> > > > > 
> > > > > Maybe if the queue has a non-blocking tagset, it can have a "quiesced"
> > > > > flag that is cleared in unquiesce? then the callers can just continue
> > > > > to iterate but will only wait the rcu grace period once.
> > > > 
> > > > Yeah, that is what these patches try to implement.
> > > 
> > > I was suggesting to "hide" it in the interface.
> > > Maybe something like:
> > > --
> > > diff --git a/block/blk-mq.c b/block/blk-mq.c
> > > index 8799fa73ef34..627b631db1f9 100644
> > > --- a/block/blk-mq.c
> > > +++ b/block/blk-mq.c
> > > @@ -263,14 +263,18 @@ void blk_mq_wait_quiesce_done(struct request_queue *q)
> > >          unsigned int i;
> > >          bool rcu = false;
> > > 
> > > +       if (!q->has_srcu && q->quiesced)
> > > +               return;
> > >          queue_for_each_hw_ctx(q, hctx, i) {
> > >                  if (hctx->flags & BLK_MQ_F_BLOCKING)
> > >                          synchronize_srcu(hctx->srcu);
> > >                  else
> > >                          rcu = true;
> > >          }
> > > -       if (rcu)
> > > +       if (rcu) {
> > >                  synchronize_rcu();
> > > +               q->quiesced = true;
> > > +       }
> > >   }
> > >   EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done);
> > > 
> > > @@ -308,6 +312,7 @@ void blk_mq_unquiesce_queue(struct request_queue *q)
> > >          } else if (!--q->quiesce_depth) {
> > >                  blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
> > >                  run_queue = true;
> > > +               q->quiesced = false;
> > 
> > Different request queues are passed to blk_mq_wait_quiesce_done() during
> > the iteration, so marking 'quiesced' doesn't make any difference here.
> 
> I actually meant q->tag_set->quiesced, such that the flag will be
> used in the tag_set reference. This way this sharing will be kept
> hidden from the callers.

That looks easy, but it isn't really true from an API viewpoint: q->tag_set has a different
lifetime, which is managed by the driver — not to mention that other request queues may touch
q->tag_set->quiesced from other code paths, so things would become much more complicated.
Also, what is the meaning of a quiesced state at tagset-wide scope?

Here I prefer the philosophy of 'Worse is better' [1], and simplicity over perfection,
by adding one simple helper, blk_mq_shared_quiesce_wait(), for improving this case
very easily.

[1] https://ipfs.fleek.co/ipfs/QmXoypizjW3WknFiJnKLwHCnL72vedxjQkDDP1mXWo6uco/wiki/Unix_philosophy.html


Thanks,
Ming


^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2021-12-10  2:03 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-11-19  2:18 [PATCH 0/5] blk-mq: quiesce improvement Ming Lei
2021-11-19  2:18 ` [PATCH 1/5] blk-mq: move srcu from blk_mq_hw_ctx to request_queue Ming Lei
2021-11-19  4:30   ` Bart Van Assche
2021-11-19  8:10     ` Ming Lei
2021-11-22  7:48       ` Sagi Grimberg
2021-11-22 13:08         ` Ming Lei
2021-11-22 13:47           ` Sagi Grimberg
2021-11-19  2:18 ` [PATCH 2/5] blk-mq: rename hctx_lock & hctx_unlock Ming Lei
2021-11-22  7:53   ` Sagi Grimberg
2021-11-22 13:20     ` Ming Lei
2021-11-22 13:50       ` Sagi Grimberg
2021-11-23  0:08         ` Ming Lei
2021-11-23  8:54           ` Sagi Grimberg
2021-11-22  9:00   ` Christoph Hellwig
2021-11-19  2:18 ` [PATCH 3/5] blk-mq: add helper of blk_mq_global_quiesce_wait() Ming Lei
2021-11-22  7:56   ` Sagi Grimberg
2021-11-22  8:00     ` Sagi Grimberg
2021-11-22 13:26     ` Ming Lei
2021-11-22 13:55       ` Sagi Grimberg
2021-11-23  0:17         ` Ming Lei
2021-11-23  9:00           ` Sagi Grimberg
2021-11-30  2:33             ` Ming Lei
2021-12-08 12:49               ` Sagi Grimberg
2021-12-10  2:02                 ` Ming Lei
2021-11-19  2:18 ` [PATCH 4/5] nvme: quiesce namespace queue in parallel Ming Lei
2021-11-22  8:07   ` Sagi Grimberg
2021-11-23  0:13     ` Ming Lei
2021-11-19  2:18 ` [PATCH 5/5] scsi: use blk-mq quiesce APIs to implement scsi_host_block Ming Lei

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).