* [PATCH V2 0/5] blk-mq: quiesce improvement
@ 2021-11-30 7:37 Ming Lei
2021-11-30 7:37 ` [PATCH V2 1/5] blk-mq: remove hctx_lock and hctx_unlock Ming Lei
` (5 more replies)
0 siblings, 6 replies; 10+ messages in thread
From: Ming Lei @ 2021-11-30 7:37 UTC (permalink / raw)
To: Christoph Hellwig, Jens Axboe, Martin K . Petersen
Cc: Sagi Grimberg, linux-block, linux-nvme, linux-scsi, Keith Busch,
Ming Lei
Hi Guys,
The 1st patch removes hctx_lock and hctx_unlock, and optimize dispatch
code path a bit.
The 2nd patch moves srcu from blk_mq_hw_ctx to request_queue.
The other patches add one new helper for supporting quiesce in parallel.
V2:
- add patch of 'remove hctx_lock and hctx_unlock'
- replace ->alloc_srcu with queue flag, as suggested by Sagi
Ming Lei (5):
blk-mq: remove hctx_lock and hctx_unlock
blk-mq: move srcu from blk_mq_hw_ctx to request_queue
blk-mq: add helper of blk_mq_shared_quiesce_wait()
nvme: quiesce namespace queue in parallel
scsi: use blk-mq quiesce APIs to implement scsi_host_block
block/blk-core.c | 27 +++++++--
block/blk-mq-sysfs.c | 2 -
block/blk-mq.c | 116 +++++++++++++--------------------------
block/blk-sysfs.c | 3 +-
block/blk.h | 10 +++-
block/genhd.c | 2 +-
drivers/nvme/host/core.c | 9 ++-
drivers/scsi/scsi_lib.c | 16 +++---
include/linux/blk-mq.h | 21 ++++---
include/linux/blkdev.h | 9 +++
10 files changed, 109 insertions(+), 106 deletions(-)
--
2.31.1
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH V2 1/5] blk-mq: remove hctx_lock and hctx_unlock
2021-11-30 7:37 [PATCH V2 0/5] blk-mq: quiesce improvement Ming Lei
@ 2021-11-30 7:37 ` Ming Lei
2021-11-30 14:42 ` kernel test robot
2021-11-30 7:37 ` [PATCH V2 2/5] blk-mq: move srcu from blk_mq_hw_ctx to request_queue Ming Lei
` (4 subsequent siblings)
5 siblings, 1 reply; 10+ messages in thread
From: Ming Lei @ 2021-11-30 7:37 UTC (permalink / raw)
To: Christoph Hellwig, Jens Axboe, Martin K . Petersen
Cc: Sagi Grimberg, linux-block, linux-nvme, linux-scsi, Keith Busch,
Ming Lei
Remove hctx_lock and hctx_unlock, and add one helper of
blk_mq_run_dispatch_ops() to run code block defined in dispatch_ops
with rcu/srcu read held.
Compared with hctx_lock()/hctx_unlock():
1) reduce 2 branches to 1, so we just need to check
(hctx->flags & BLK_MQ_F_BLOCKING) once when running one dispatch_ops
2) srcu_idx needn't be touched in case of non-blocking
3) might_sleep_if() can be moved to the blocking branch
t/io_uring shows that ~4% IOPS boost is observed on null_blk.
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
block/blk-mq.c | 79 ++++++++++++++++++++------------------------------
1 file changed, 31 insertions(+), 48 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 2deb99cf185e..c5dc716b8167 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1079,25 +1079,22 @@ void blk_mq_complete_request(struct request *rq)
}
EXPORT_SYMBOL(blk_mq_complete_request);
-static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
- __releases(hctx->srcu)
-{
- if (!(hctx->flags & BLK_MQ_F_BLOCKING))
- rcu_read_unlock();
- else
- srcu_read_unlock(hctx->srcu, srcu_idx);
-}
-
-static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
- __acquires(hctx->srcu)
-{
- if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
- /* shut up gcc false positive */
- *srcu_idx = 0;
- rcu_read_lock();
- } else
- *srcu_idx = srcu_read_lock(hctx->srcu);
-}
+/* run the code block in @dispatch_ops with rcu/srcu read lock held */
+#define blk_mq_run_dispatch_ops(hctx, dispatch_ops) \
+do { \
+ if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { \
+ rcu_read_lock(); \
+ (dispatch_ops); \
+ rcu_read_unlock(); \
+ } else { \
+ int srcu_idx; \
+ \
+ might_sleep(); \
+ srcu_idx = srcu_read_lock(hctx->srcu); \
+ (dispatch_ops); \
+ srcu_read_unlock(hctx->srcu, srcu_idx); \
+ } \
+} while (0)
/**
* blk_mq_start_request - Start processing a request
@@ -1960,19 +1957,13 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
*/
static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
{
- int srcu_idx;
-
/*
* We can't run the queue inline with ints disabled. Ensure that
* we catch bad users of this early.
*/
WARN_ON_ONCE(in_interrupt());
- might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
-
- hctx_lock(hctx, &srcu_idx);
- blk_mq_sched_dispatch_requests(hctx);
- hctx_unlock(hctx, srcu_idx);
+ blk_mq_run_dispatch_ops(hctx, blk_mq_sched_dispatch_requests(hctx));
}
static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
@@ -2084,7 +2075,6 @@ EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
*/
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
- int srcu_idx;
bool need_run;
/*
@@ -2095,10 +2085,9 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
* And queue will be rerun in blk_mq_unquiesce_queue() if it is
* quiesced.
*/
- hctx_lock(hctx, &srcu_idx);
- need_run = !blk_queue_quiesced(hctx->queue) &&
- blk_mq_hctx_has_pending(hctx);
- hctx_unlock(hctx, srcu_idx);
+ blk_mq_run_dispatch_ops(hctx,
+ need_run = !blk_queue_quiesced(hctx->queue) &&
+ blk_mq_hctx_has_pending(hctx));
if (need_run)
__blk_mq_delay_run_hw_queue(hctx, async, 0);
@@ -2502,31 +2491,25 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
struct request *rq)
{
blk_status_t ret;
- int srcu_idx;
-
- might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
-
- hctx_lock(hctx, &srcu_idx);
-
- ret = __blk_mq_try_issue_directly(hctx, rq, false, true);
- if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
- blk_mq_request_bypass_insert(rq, false, true);
- else if (ret != BLK_STS_OK)
- blk_mq_end_request(rq, ret);
- hctx_unlock(hctx, srcu_idx);
+ blk_mq_run_dispatch_ops(hctx,
+ {
+ ret = __blk_mq_try_issue_directly(hctx, rq, false, true);
+ if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
+ blk_mq_request_bypass_insert(rq, false, true);
+ else if (ret != BLK_STS_OK)
+ blk_mq_end_request(rq, ret);
+ }
+ );
}
static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
{
blk_status_t ret;
- int srcu_idx;
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
- hctx_lock(hctx, &srcu_idx);
- ret = __blk_mq_try_issue_directly(hctx, rq, true, last);
- hctx_unlock(hctx, srcu_idx);
-
+ blk_mq_run_dispatch_ops(hctx,
+ ret = __blk_mq_try_issue_directly(hctx, rq, true, last));
return ret;
}
--
2.31.1
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [PATCH V2 2/5] blk-mq: move srcu from blk_mq_hw_ctx to request_queue
2021-11-30 7:37 [PATCH V2 0/5] blk-mq: quiesce improvement Ming Lei
2021-11-30 7:37 ` [PATCH V2 1/5] blk-mq: remove hctx_lock and hctx_unlock Ming Lei
@ 2021-11-30 7:37 ` Ming Lei
2021-11-30 7:37 ` [PATCH V2 3/5] blk-mq: add helper of blk_mq_shared_quiesce_wait() Ming Lei
` (3 subsequent siblings)
5 siblings, 0 replies; 10+ messages in thread
From: Ming Lei @ 2021-11-30 7:37 UTC (permalink / raw)
To: Christoph Hellwig, Jens Axboe, Martin K . Petersen
Cc: Sagi Grimberg, linux-block, linux-nvme, linux-scsi, Keith Busch,
Ming Lei
In case of BLK_MQ_F_BLOCKING, per-hctx srcu is used to protect dispatch
critical area. However, this srcu instance stays at the end of hctx, and
it often takes standalone cacheline, often cold.
Inside srcu_read_lock() and srcu_read_unlock(), WRITE is always done on
the indirect percpu variable which is allocated from heap instead of
being embedded, srcu->srcu_idx is read only in srcu_read_lock(). It
doesn't matter if srcu structure stays in hctx or request queue.
So switch to per-request-queue srcu for protecting dispatch, and this
way simplifies quiesce a lot, not to mention quiesce is always done on the
request queue wide.
Cc: Keith Busch <kbusch@kernel.org>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
block/blk-core.c | 27 ++++++++++++++++++++++-----
block/blk-mq-sysfs.c | 2 --
block/blk-mq.c | 41 ++++++++++-------------------------------
block/blk-sysfs.c | 3 ++-
block/blk.h | 10 +++++++++-
block/genhd.c | 2 +-
include/linux/blk-mq.h | 8 --------
include/linux/blkdev.h | 9 +++++++++
8 files changed, 53 insertions(+), 49 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index b0660c9df852..10619fd83c1b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -66,6 +66,7 @@ DEFINE_IDA(blk_queue_ida);
* For queue allocation
*/
struct kmem_cache *blk_requestq_cachep;
+struct kmem_cache *blk_requestq_srcu_cachep;
/*
* Controlling structure to kblockd
@@ -437,21 +438,27 @@ static void blk_timeout_work(struct work_struct *work)
{
}
-struct request_queue *blk_alloc_queue(int node_id)
+struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
{
struct request_queue *q;
int ret;
- q = kmem_cache_alloc_node(blk_requestq_cachep,
- GFP_KERNEL | __GFP_ZERO, node_id);
+ q = kmem_cache_alloc_node(blk_get_queue_kmem_cache(alloc_srcu),
+ GFP_KERNEL | __GFP_ZERO, node_id);
if (!q)
return NULL;
+ if (alloc_srcu) {
+ blk_queue_flag_set(QUEUE_FLAG_HAS_SRCU, q);
+ if (init_srcu_struct(q->srcu) != 0)
+ goto fail_q;
+ }
+
q->last_merge = NULL;
q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL);
if (q->id < 0)
- goto fail_q;
+ goto fail_srcu;
ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, 0);
if (ret)
@@ -508,8 +515,11 @@ struct request_queue *blk_alloc_queue(int node_id)
bioset_exit(&q->bio_split);
fail_id:
ida_simple_remove(&blk_queue_ida, q->id);
+fail_srcu:
+ if (alloc_srcu)
+ cleanup_srcu_struct(q->srcu);
fail_q:
- kmem_cache_free(blk_requestq_cachep, q);
+ kmem_cache_free(blk_get_queue_kmem_cache(alloc_srcu), q);
return NULL;
}
@@ -1301,6 +1311,9 @@ int __init blk_dev_init(void)
sizeof_field(struct request, cmd_flags));
BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
sizeof_field(struct bio, bi_opf));
+ BUILD_BUG_ON(ALIGN(offsetof(struct request_queue, srcu),
+ __alignof__(struct request_queue)) !=
+ sizeof(struct request_queue));
/* used for unplugging and affects IO latency/throughput - HIGHPRI */
kblockd_workqueue = alloc_workqueue("kblockd",
@@ -1311,6 +1324,10 @@ int __init blk_dev_init(void)
blk_requestq_cachep = kmem_cache_create("request_queue",
sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
+ blk_requestq_srcu_cachep = kmem_cache_create("request_queue_srcu",
+ sizeof(struct request_queue) +
+ sizeof(struct srcu_struct), 0, SLAB_PANIC, NULL);
+
blk_debugfs_root = debugfs_create_dir("block", NULL);
return 0;
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 253c857cba47..674786574075 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -36,8 +36,6 @@ static void blk_mq_hw_sysfs_release(struct kobject *kobj)
struct blk_mq_hw_ctx *hctx = container_of(kobj, struct blk_mq_hw_ctx,
kobj);
- if (hctx->flags & BLK_MQ_F_BLOCKING)
- cleanup_srcu_struct(hctx->srcu);
blk_free_flush_queue(hctx->fq);
sbitmap_free(&hctx->ctx_map);
free_cpumask_var(hctx->cpumask);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index c5dc716b8167..a3ff671ca20e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -260,17 +260,9 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
*/
void blk_mq_wait_quiesce_done(struct request_queue *q)
{
- struct blk_mq_hw_ctx *hctx;
- unsigned int i;
- bool rcu = false;
-
- queue_for_each_hw_ctx(q, hctx, i) {
- if (hctx->flags & BLK_MQ_F_BLOCKING)
- synchronize_srcu(hctx->srcu);
- else
- rcu = true;
- }
- if (rcu)
+ if (blk_queue_has_srcu(q))
+ synchronize_srcu(q->srcu);
+ else
synchronize_rcu();
}
EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done);
@@ -1090,9 +1082,9 @@ do { \
int srcu_idx; \
\
might_sleep(); \
- srcu_idx = srcu_read_lock(hctx->srcu); \
+ srcu_idx = srcu_read_lock(hctx->queue->srcu); \
(dispatch_ops); \
- srcu_read_unlock(hctx->srcu, srcu_idx); \
+ srcu_read_unlock(hctx->queue->srcu, srcu_idx); \
} \
} while (0)
@@ -3431,20 +3423,6 @@ static void blk_mq_exit_hw_queues(struct request_queue *q,
}
}
-static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
-{
- int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
-
- BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
- __alignof__(struct blk_mq_hw_ctx)) !=
- sizeof(struct blk_mq_hw_ctx));
-
- if (tag_set->flags & BLK_MQ_F_BLOCKING)
- hw_ctx_size += sizeof(struct srcu_struct);
-
- return hw_ctx_size;
-}
-
static int blk_mq_init_hctx(struct request_queue *q,
struct blk_mq_tag_set *set,
struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
@@ -3482,7 +3460,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
struct blk_mq_hw_ctx *hctx;
gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
- hctx = kzalloc_node(blk_mq_hw_ctx_size(set), gfp, node);
+ hctx = kzalloc_node(sizeof(struct blk_mq_hw_ctx), gfp, node);
if (!hctx)
goto fail_alloc_hctx;
@@ -3524,8 +3502,6 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
if (!hctx->fq)
goto free_bitmap;
- if (hctx->flags & BLK_MQ_F_BLOCKING)
- init_srcu_struct(hctx->srcu);
blk_mq_hctx_kobj_init(hctx);
return hctx;
@@ -3861,7 +3837,7 @@ static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
struct request_queue *q;
int ret;
- q = blk_alloc_queue(set->numa_node);
+ q = blk_alloc_queue(set->numa_node, set->flags & BLK_MQ_F_BLOCKING);
if (!q)
return ERR_PTR(-ENOMEM);
q->queuedata = queuedata;
@@ -4010,6 +3986,9 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
struct request_queue *q)
{
+ WARN_ON_ONCE(blk_queue_has_srcu(q) !=
+ !!(set->flags & BLK_MQ_F_BLOCKING));
+
/* mark the queue as mq asap */
q->mq_ops = set->ops;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 4622da4bb992..3e6357321225 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -735,7 +735,8 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head)
{
struct request_queue *q = container_of(rcu_head, struct request_queue,
rcu_head);
- kmem_cache_free(blk_requestq_cachep, q);
+
+ kmem_cache_free(blk_get_queue_kmem_cache(blk_queue_has_srcu(q)), q);
}
/* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */
diff --git a/block/blk.h b/block/blk.h
index a57c84654d0a..911f9f8db646 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -27,6 +27,7 @@ struct blk_flush_queue {
};
extern struct kmem_cache *blk_requestq_cachep;
+extern struct kmem_cache *blk_requestq_srcu_cachep;
extern struct kobj_type blk_queue_ktype;
extern struct ida blk_queue_ida;
@@ -428,7 +429,14 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
struct page *page, unsigned int len, unsigned int offset,
unsigned int max_sectors, bool *same_page);
-struct request_queue *blk_alloc_queue(int node_id);
+static inline struct kmem_cache *blk_get_queue_kmem_cache(bool srcu)
+{
+ if (srcu)
+ return blk_requestq_srcu_cachep;
+ return blk_requestq_cachep;
+}
+struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu);
+
int disk_scan_partitions(struct gendisk *disk, fmode_t mode);
int disk_alloc_events(struct gendisk *disk);
diff --git a/block/genhd.c b/block/genhd.c
index 5179a4f00fba..3c139a1b6f04 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1338,7 +1338,7 @@ struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass)
struct request_queue *q;
struct gendisk *disk;
- q = blk_alloc_queue(node);
+ q = blk_alloc_queue(node, false);
if (!q)
return NULL;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index d952c3442261..42fe97adb807 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -4,7 +4,6 @@
#include <linux/blkdev.h>
#include <linux/sbitmap.h>
-#include <linux/srcu.h>
#include <linux/lockdep.h>
#include <linux/scatterlist.h>
#include <linux/prefetch.h>
@@ -376,13 +375,6 @@ struct blk_mq_hw_ctx {
* q->unused_hctx_list.
*/
struct list_head hctx_list;
-
- /**
- * @srcu: Sleepable RCU. Use as lock when type of the hardware queue is
- * blocking (BLK_MQ_F_BLOCKING). Must be the last member - see also
- * blk_mq_hw_ctx_size().
- */
- struct srcu_struct srcu[];
};
/**
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0a4416ef4fbf..c80cfaefc0a8 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -16,6 +16,7 @@
#include <linux/percpu-refcount.h>
#include <linux/blkzoned.h>
#include <linux/sbitmap.h>
+#include <linux/srcu.h>
struct module;
struct request_queue;
@@ -373,11 +374,18 @@ struct request_queue {
* devices that do not have multiple independent access ranges.
*/
struct blk_independent_access_ranges *ia_ranges;
+
+ /**
+ * @srcu: Sleepable RCU. Use as lock when type of the request queue
+ * is blocking (BLK_MQ_F_BLOCKING). Must be the last member
+ */
+ struct srcu_struct srcu[];
};
/* Keep blk_queue_flag_name[] in sync with the definitions below */
#define QUEUE_FLAG_STOPPED 0 /* queue is stopped */
#define QUEUE_FLAG_DYING 1 /* queue being torn down */
+#define QUEUE_FLAG_HAS_SRCU 2 /* SRCU is allocated */
#define QUEUE_FLAG_NOMERGES 3 /* disable merge attempts */
#define QUEUE_FLAG_SAME_COMP 4 /* complete on same CPU-group */
#define QUEUE_FLAG_FAIL_IO 5 /* fake timeout */
@@ -415,6 +423,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
#define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
#define blk_queue_dying(q) test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags)
+#define blk_queue_has_srcu(q) test_bit(QUEUE_FLAG_HAS_SRCU, &(q)->queue_flags)
#define blk_queue_dead(q) test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags)
#define blk_queue_init_done(q) test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags)
#define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
--
2.31.1
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [PATCH V2 3/5] blk-mq: add helper of blk_mq_shared_quiesce_wait()
2021-11-30 7:37 [PATCH V2 0/5] blk-mq: quiesce improvement Ming Lei
2021-11-30 7:37 ` [PATCH V2 1/5] blk-mq: remove hctx_lock and hctx_unlock Ming Lei
2021-11-30 7:37 ` [PATCH V2 2/5] blk-mq: move srcu from blk_mq_hw_ctx to request_queue Ming Lei
@ 2021-11-30 7:37 ` Ming Lei
2021-11-30 7:37 ` [PATCH V2 4/5] nvme: quiesce namespace queue in parallel Ming Lei
` (2 subsequent siblings)
5 siblings, 0 replies; 10+ messages in thread
From: Ming Lei @ 2021-11-30 7:37 UTC (permalink / raw)
To: Christoph Hellwig, Jens Axboe, Martin K . Petersen
Cc: Sagi Grimberg, linux-block, linux-nvme, linux-scsi, Keith Busch,
Ming Lei
Add helper of blk_mq_shared_quiesce_wait() for supporting to quiesce
queues in parallel, then we can just wait once if global quiesce wait
is allowed.
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
include/linux/blk-mq.h | 13 +++++++++++++
1 file changed, 13 insertions(+)
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 42fe97adb807..6f3ccd604d72 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -788,6 +788,19 @@ static inline bool blk_mq_add_to_batch(struct request *req,
return true;
}
+/*
+ * If the queue has allocated & used srcu to quiesce queue, quiesce wait is
+ * done via the synchronize_srcu(q->srcu), otherwise it can be done via
+ * shared synchronize_rcu() from other request queues in same host wide.
+ *
+ * This helper can help us to support quiescing queue in parallel, so just
+ * one quiesce wait is enough if shared quiesce wait is allowed.
+ */
+static inline bool blk_mq_shared_quiesce_wait(struct request_queue *q)
+{
+ return !blk_queue_has_srcu(q);
+}
+
void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list);
void blk_mq_kick_requeue_list(struct request_queue *q);
void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs);
--
2.31.1
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [PATCH V2 4/5] nvme: quiesce namespace queue in parallel
2021-11-30 7:37 [PATCH V2 0/5] blk-mq: quiesce improvement Ming Lei
` (2 preceding siblings ...)
2021-11-30 7:37 ` [PATCH V2 3/5] blk-mq: add helper of blk_mq_shared_quiesce_wait() Ming Lei
@ 2021-11-30 7:37 ` Ming Lei
2021-11-30 7:37 ` [PATCH V2 5/5] scsi: use blk-mq quiesce APIs to implement scsi_host_block Ming Lei
2022-06-07 11:21 ` [PATCH V2 4/5] nvme: quiesce namespace queue in parallel Ismael Luceno
5 siblings, 0 replies; 10+ messages in thread
From: Ming Lei @ 2021-11-30 7:37 UTC (permalink / raw)
To: Christoph Hellwig, Jens Axboe, Martin K . Petersen
Cc: Sagi Grimberg, linux-block, linux-nvme, linux-scsi, Keith Busch,
Ming Lei, Chao Leng
Chao Leng reported that in case of lots of namespaces, it may take quite a
while for nvme_stop_queues() to quiesce all namespace queues.
Improve nvme_stop_queues() by running quiesce in parallel, and just wait
once if global quiesce wait is allowed.
Link: https://lore.kernel.org/linux-block/cc732195-c053-9ce4-e1a7-e7f6dcf762ac@huawei.com/
Reported-by: Chao Leng <lengchao@huawei.com>
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
drivers/nvme/host/core.c | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 4c63564adeaa..20827a360099 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4540,9 +4540,7 @@ static void nvme_start_ns_queue(struct nvme_ns *ns)
static void nvme_stop_ns_queue(struct nvme_ns *ns)
{
if (!test_and_set_bit(NVME_NS_STOPPED, &ns->flags))
- blk_mq_quiesce_queue(ns->queue);
- else
- blk_mq_wait_quiesce_done(ns->queue);
+ blk_mq_quiesce_queue_nowait(ns->queue);
}
/*
@@ -4643,6 +4641,11 @@ void nvme_stop_queues(struct nvme_ctrl *ctrl)
down_read(&ctrl->namespaces_rwsem);
list_for_each_entry(ns, &ctrl->namespaces, list)
nvme_stop_ns_queue(ns);
+ list_for_each_entry(ns, &ctrl->namespaces, list) {
+ blk_mq_wait_quiesce_done(ns->queue);
+ if (blk_mq_shared_quiesce_wait(ns->queue))
+ break;
+ }
up_read(&ctrl->namespaces_rwsem);
}
EXPORT_SYMBOL_GPL(nvme_stop_queues);
--
2.31.1
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [PATCH V2 5/5] scsi: use blk-mq quiesce APIs to implement scsi_host_block
2021-11-30 7:37 [PATCH V2 0/5] blk-mq: quiesce improvement Ming Lei
` (3 preceding siblings ...)
2021-11-30 7:37 ` [PATCH V2 4/5] nvme: quiesce namespace queue in parallel Ming Lei
@ 2021-11-30 7:37 ` Ming Lei
2022-06-07 11:21 ` [PATCH V2 4/5] nvme: quiesce namespace queue in parallel Ismael Luceno
5 siblings, 0 replies; 10+ messages in thread
From: Ming Lei @ 2021-11-30 7:37 UTC (permalink / raw)
To: Christoph Hellwig, Jens Axboe, Martin K . Petersen
Cc: Sagi Grimberg, linux-block, linux-nvme, linux-scsi, Keith Busch,
Ming Lei
scsi_host_block() calls synchronize_rcu() directly to wait for
quiesce done; this way is ugly since it exposes blk-mq quiesce's
implementation details.
Instead apply blk_mq_wait_quiesce_done() and
blk_mq_shared_quiesce_wait() for scsi_host_block().
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
drivers/scsi/scsi_lib.c | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 5e8b5ecb3245..d93bfc08bc1a 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -2952,15 +2952,15 @@ scsi_host_block(struct Scsi_Host *shost)
}
}
- /*
- * SCSI never enables blk-mq's BLK_MQ_F_BLOCKING flag so
- * calling synchronize_rcu() once is enough.
- */
- WARN_ON_ONCE(shost->tag_set.flags & BLK_MQ_F_BLOCKING);
-
- if (!ret)
- synchronize_rcu();
+ if (!ret) {
+ shost_for_each_device(sdev, shost) {
+ struct request_queue *q = sdev->request_queue;
+ blk_mq_wait_quiesce_done(q);
+ if (blk_mq_shared_quiesce_wait(q))
+ break;
+ }
+ }
return ret;
}
EXPORT_SYMBOL_GPL(scsi_host_block);
--
2.31.1
^ permalink raw reply related [flat|nested] 10+ messages in thread
* Re: [PATCH V2 1/5] blk-mq: remove hctx_lock and hctx_unlock
2021-11-30 7:37 ` [PATCH V2 1/5] blk-mq: remove hctx_lock and hctx_unlock Ming Lei
@ 2021-11-30 14:42 ` kernel test robot
0 siblings, 0 replies; 10+ messages in thread
From: kernel test robot @ 2021-11-30 14:42 UTC (permalink / raw)
To: Ming Lei, Christoph Hellwig, Jens Axboe, Martin K . Petersen
Cc: llvm, kbuild-all, Sagi Grimberg, linux-block, linux-nvme,
linux-scsi, Keith Busch, Ming Lei
Hi Ming,
Thank you for the patch! Perhaps something to improve:
[auto build test WARNING on axboe-block/for-next]
[also build test WARNING on mkp-scsi/for-next v5.16-rc3 next-20211130]
[cannot apply to linux-nvme/for-next hch-configfs/for-next]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]
url: https://github.com/0day-ci/linux/commits/Ming-Lei/blk-mq-quiesce-improvement/20211130-154015
base: https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
config: hexagon-randconfig-r045-20211129 (https://download.01.org/0day-ci/archive/20211130/202111302217.rEsBycwv-lkp@intel.com/config)
compiler: clang version 14.0.0 (https://github.com/llvm/llvm-project 25eb7fa01d7ebbe67648ea03841cda55b4239ab2)
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/0day-ci/linux/commit/4575a8b36e3a56fa87b1f77e0064fc2ec36ebb7c
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review Ming-Lei/blk-mq-quiesce-improvement/20211130-154015
git checkout 4575a8b36e3a56fa87b1f77e0064fc2ec36ebb7c
# save the config file to linux build tree
mkdir build_dir
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=hexagon SHELL=/bin/bash
If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>
All warnings (new ones prefixed by >>):
>> block/blk-mq.c:2482:2: warning: '(' and '{' tokens introducing statement expression appear in different macro expansion contexts [-Wcompound-token-split-by-macro]
blk_mq_run_dispatch_ops(hctx,
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
block/blk-mq.c:1079:3: note: expanded from macro 'blk_mq_run_dispatch_ops'
(dispatch_ops); \
^
block/blk-mq.c:2483:3: note: '{' token is here
{
^
block/blk-mq.c:1079:4: note: expanded from macro 'blk_mq_run_dispatch_ops'
(dispatch_ops); \
^~~~~~~~~~~~
>> block/blk-mq.c:2489:3: warning: '}' and ')' tokens terminating statement expression appear in different macro expansion contexts [-Wcompound-token-split-by-macro]
}
^
block/blk-mq.c:1079:4: note: expanded from macro 'blk_mq_run_dispatch_ops'
(dispatch_ops); \
^~~~~~~~~~~~
block/blk-mq.c:2482:2: note: ')' token is here
blk_mq_run_dispatch_ops(hctx,
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
block/blk-mq.c:1079:16: note: expanded from macro 'blk_mq_run_dispatch_ops'
(dispatch_ops); \
^
>> block/blk-mq.c:2482:2: warning: '(' and '{' tokens introducing statement expression appear in different macro expansion contexts [-Wcompound-token-split-by-macro]
blk_mq_run_dispatch_ops(hctx,
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
block/blk-mq.c:1086:3: note: expanded from macro 'blk_mq_run_dispatch_ops'
(dispatch_ops); \
^
block/blk-mq.c:2483:3: note: '{' token is here
{
^
block/blk-mq.c:1086:4: note: expanded from macro 'blk_mq_run_dispatch_ops'
(dispatch_ops); \
^~~~~~~~~~~~
>> block/blk-mq.c:2489:3: warning: '}' and ')' tokens terminating statement expression appear in different macro expansion contexts [-Wcompound-token-split-by-macro]
}
^
block/blk-mq.c:1086:4: note: expanded from macro 'blk_mq_run_dispatch_ops'
(dispatch_ops); \
^~~~~~~~~~~~
block/blk-mq.c:2482:2: note: ')' token is here
blk_mq_run_dispatch_ops(hctx,
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
block/blk-mq.c:1086:16: note: expanded from macro 'blk_mq_run_dispatch_ops'
(dispatch_ops); \
^
4 warnings generated.
vim +2482 block/blk-mq.c
2466
2467 /**
2468 * blk_mq_try_issue_directly - Try to send a request directly to device driver.
2469 * @hctx: Pointer of the associated hardware queue.
2470 * @rq: Pointer to request to be sent.
2471 *
2472 * If the device has enough resources to accept a new request now, send the
2473 * request directly to device driver. Else, insert at hctx->dispatch queue, so
2474 * we can try send it another time in the future. Requests inserted at this
2475 * queue have higher priority.
2476 */
2477 static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
2478 struct request *rq)
2479 {
2480 blk_status_t ret;
2481
> 2482 blk_mq_run_dispatch_ops(hctx,
2483 {
2484 ret = __blk_mq_try_issue_directly(hctx, rq, false, true);
2485 if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
2486 blk_mq_request_bypass_insert(rq, false, true);
2487 else if (ret != BLK_STS_OK)
2488 blk_mq_end_request(rq, ret);
> 2489 }
2490 );
2491 }
2492
---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH V2 4/5] nvme: quiesce namespace queue in parallel
2021-11-30 7:37 [PATCH V2 0/5] blk-mq: quiesce improvement Ming Lei
` (4 preceding siblings ...)
2021-11-30 7:37 ` [PATCH V2 5/5] scsi: use blk-mq quiesce APIs to implement scsi_host_block Ming Lei
@ 2022-06-07 11:21 ` Ismael Luceno
2022-06-07 14:03 ` Ming Lei
5 siblings, 1 reply; 10+ messages in thread
From: Ismael Luceno @ 2022-06-07 11:21 UTC (permalink / raw)
To: Ming Lei
Cc: Enzo Matsumiya, Sagi Grimberg, Chao Leng, linux-block,
linux-nvme, linux-scsi
Hi Ming,
Has this patch been dropped/abandoned?
On Tue, 30 Nov 2021 15:37:51 +0800
Ming Lei <ming.lei@redhat.com> wrote:
> Chao Leng reported that in case of lots of namespaces, it may take
> quite a while for nvme_stop_queues() to quiesce all namespace queues.
>
> Improve nvme_stop_queues() by running quiesce in parallel, and just
> wait once if global quiesce wait is allowed.
>
> Link:
> https://lore.kernel.org/linux-block/cc732195-c053-9ce4-e1a7-e7f6dcf762ac@huawei.com/
> Reported-by: Chao Leng <lengchao@huawei.com>
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
<...>
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH V2 4/5] nvme: quiesce namespace queue in parallel
2022-06-07 11:21 ` [PATCH V2 4/5] nvme: quiesce namespace queue in parallel Ismael Luceno
@ 2022-06-07 14:03 ` Ming Lei
2022-07-06 15:37 ` Ismael Luceno
0 siblings, 1 reply; 10+ messages in thread
From: Ming Lei @ 2022-06-07 14:03 UTC (permalink / raw)
To: Ismael Luceno
Cc: Enzo Matsumiya, Sagi Grimberg, Chao Leng, linux-block,
linux-nvme, linux-scsi
On Tue, Jun 07, 2022 at 01:21:18PM +0200, Ismael Luceno wrote:
> Hi Ming,
>
> Has this patch been dropped/abandoned?
Hi Ismael,
The whole patchset wasn't accepted if I remember correctly, but
finally we moved srcu out of hctx in another patchset.
If you think the patch of 'nvme: quiesce namespace queue in parallel'
is useful, please provide a bit info about your case, then we may
figure out similar patch if it is necessary.
Thanks,
Ming
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH V2 4/5] nvme: quiesce namespace queue in parallel
2022-06-07 14:03 ` Ming Lei
@ 2022-07-06 15:37 ` Ismael Luceno
0 siblings, 0 replies; 10+ messages in thread
From: Ismael Luceno @ 2022-07-06 15:37 UTC (permalink / raw)
To: Ming Lei
Cc: Enzo Matsumiya, Sagi Grimberg, Chao Leng, linux-block,
linux-nvme, linux-scsi
On Tue, 7 Jun 2022 22:03:40 +0800
Ming Lei <ming.lei@redhat.com> wrote:
> On Tue, Jun 07, 2022 at 01:21:18PM +0200, Ismael Luceno wrote:
> > Hi Ming,
> >
> > Has this patch been dropped/abandoned?
>
> Hi Ismael,
>
> The whole patchset wasn't be accepted if I remember correctly, but
> finally we moved srcu out of hctx in another patchset.
>
> If you think the patch of 'nvme: quiesce namespace queue in parallel'
> is useful, please provide a bit info about your case, then we may
> figure out similar patch if it is necessary.
Chao Leng's outgoing email (lengchao@huawei.com) permission is
restricted; I got from him (through a couple of indirections):
> Hi, Ismael and Ming, The case: When the multipathing software is used,
> if one path failed, fail over to other good path may take long time.
> This is important for scenarios that require low latency and high
> reliability, such as real-time deals.
>
> This patch can fix the bug.
Same thing he said here:
https://lore.kernel.org/linux-nvme/cc732195-c053-9ce4-e1a7-e7f6dcf762ac@huawei.com/
Huawei is still looking for a solution to be merged in mainline.
--
Ismael Luceno
SUSE L3 Support
^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2022-07-06 15:38 UTC | newest]
Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-11-30 7:37 [PATCH V2 0/5] blk-mq: quiesce improvement Ming Lei
2021-11-30 7:37 ` [PATCH V2 1/5] blk-mq: remove hctx_lock and hctx_unlock Ming Lei
2021-11-30 14:42 ` kernel test robot
2021-11-30 7:37 ` [PATCH V2 2/5] blk-mq: move srcu from blk_mq_hw_ctx to request_queue Ming Lei
2021-11-30 7:37 ` [PATCH V2 3/5] blk-mq: add helper of blk_mq_shared_quiesce_wait() Ming Lei
2021-11-30 7:37 ` [PATCH V2 4/5] nvme: quiesce namespace queue in parallel Ming Lei
2021-11-30 7:37 ` [PATCH V2 5/5] scsi: use blk-mq quiesce APIs to implement scsi_host_block Ming Lei
2022-06-07 11:21 ` [PATCH V2 4/5] nvme: quiesce namespace queue in parallel Ismael Luceno
2022-06-07 14:03 ` Ming Lei
2022-07-06 15:37 ` Ismael Luceno
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).