* [PATCH 0/4] blk-mq: improve dispatch lock & quiesce implementation
@ 2021-12-03 13:15 Ming Lei
2021-12-03 13:15 ` [PATCH 1/4] blk-mq: remove hctx_lock and hctx_unlock Ming Lei
` (5 more replies)
0 siblings, 6 replies; 10+ messages in thread
From: Ming Lei @ 2021-12-03 13:15 UTC (permalink / raw)
To: Jens Axboe; +Cc: linux-block, Ming Lei
Hello,
The 1st patch replaces hctx_lock/hctx_unlock with
blk_mq_run_dispatch_ops(), so that the fast dispatch code path gets
optimized a bit.
The 2nd patch moves srcu from hctx into request queue.
The last two patches call blk_mq_run_dispatch_ops() only once when issuing
requests directly from a list.
Ming Lei (4):
blk-mq: remove hctx_lock and hctx_unlock
blk-mq: move srcu from blk_mq_hw_ctx to request_queue
blk-mq: pass request queue to blk_mq_run_dispatch_ops
blk-mq: run dispatch lock once in case of issuing from list
block/blk-core.c | 27 +++++++++--
block/blk-mq-sched.c | 3 +-
block/blk-mq-sysfs.c | 2 -
block/blk-mq.c | 105 +++++++++--------------------------------
block/blk-mq.h | 16 +++++++
block/blk-sysfs.c | 3 +-
block/blk.h | 10 +++-
block/genhd.c | 2 +-
include/linux/blk-mq.h | 8 ----
include/linux/blkdev.h | 9 ++++
10 files changed, 84 insertions(+), 101 deletions(-)
--
2.31.1
* [PATCH 1/4] blk-mq: remove hctx_lock and hctx_unlock
2021-12-03 13:15 [PATCH 0/4] blk-mq: improve dispatch lock & quiesce implementation Ming Lei
@ 2021-12-03 13:15 ` Ming Lei
[not found] ` <CGME20211206103121eucas1p2ac12934f15129fe77d7f8d95c02fe447@eucas1p2.samsung.com>
2021-12-03 13:15 ` [PATCH 2/4] blk-mq: move srcu from blk_mq_hw_ctx to request_queue Ming Lei
` (4 subsequent siblings)
5 siblings, 1 reply; 10+ messages in thread
From: Ming Lei @ 2021-12-03 13:15 UTC (permalink / raw)
To: Jens Axboe; +Cc: linux-block, Ming Lei
Remove hctx_lock and hctx_unlock, and add one helper,
blk_mq_run_dispatch_ops(), which runs the code block passed in
dispatch_ops with the rcu/srcu read lock held.
Compared with hctx_lock()/hctx_unlock():
1) two branches are reduced to one, so we only need to check
(hctx->flags & BLK_MQ_F_BLOCKING) once when running one dispatch_ops
2) srcu_idx doesn't need to be touched in the non-blocking case
3) might_sleep_if() can be moved to the blocking branch
Also put the added blk_mq_run_dispatch_ops() in a private header, so that
the following patch can use it outside of blk-mq.c.
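As a minimal sketch of the conversion (condensed from the hunks below), an
open-coded lock/unlock pair around the dispatch call becomes a single macro
invocation that takes the dispatch code as its argument:
        /* before: explicit lock/unlock pair around the dispatch call */
        hctx_lock(hctx, &srcu_idx);
        blk_mq_sched_dispatch_requests(hctx);
        hctx_unlock(hctx, srcu_idx);
        /* after: the code block runs with the rcu/srcu read lock held
         * inside blk_mq_run_dispatch_ops() */
        blk_mq_run_dispatch_ops(hctx, blk_mq_sched_dispatch_requests(hctx));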
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
block/blk-mq.c | 57 +++++++++-----------------------------------------
block/blk-mq.h | 16 ++++++++++++++
2 files changed, 26 insertions(+), 47 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index fc4520e992b1..a9d69e1dea8b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1071,26 +1071,6 @@ void blk_mq_complete_request(struct request *rq)
}
EXPORT_SYMBOL(blk_mq_complete_request);
-static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
- __releases(hctx->srcu)
-{
- if (!(hctx->flags & BLK_MQ_F_BLOCKING))
- rcu_read_unlock();
- else
- srcu_read_unlock(hctx->srcu, srcu_idx);
-}
-
-static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
- __acquires(hctx->srcu)
-{
- if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
- /* shut up gcc false positive */
- *srcu_idx = 0;
- rcu_read_lock();
- } else
- *srcu_idx = srcu_read_lock(hctx->srcu);
-}
-
/**
* blk_mq_start_request - Start processing a request
* @rq: Pointer to request to be started
@@ -1947,19 +1927,13 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
*/
static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
{
- int srcu_idx;
-
/*
* We can't run the queue inline with ints disabled. Ensure that
* we catch bad users of this early.
*/
WARN_ON_ONCE(in_interrupt());
- might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
-
- hctx_lock(hctx, &srcu_idx);
- blk_mq_sched_dispatch_requests(hctx);
- hctx_unlock(hctx, srcu_idx);
+ blk_mq_run_dispatch_ops(hctx, blk_mq_sched_dispatch_requests(hctx));
}
static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
@@ -2071,7 +2045,6 @@ EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
*/
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
- int srcu_idx;
bool need_run;
/*
@@ -2082,10 +2055,9 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
* And queue will be rerun in blk_mq_unquiesce_queue() if it is
* quiesced.
*/
- hctx_lock(hctx, &srcu_idx);
- need_run = !blk_queue_quiesced(hctx->queue) &&
- blk_mq_hctx_has_pending(hctx);
- hctx_unlock(hctx, srcu_idx);
+ blk_mq_run_dispatch_ops(hctx,
+ need_run = !blk_queue_quiesced(hctx->queue) &&
+ blk_mq_hctx_has_pending(hctx));
if (need_run)
__blk_mq_delay_run_hw_queue(hctx, async, 0);
@@ -2488,32 +2460,22 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
struct request *rq)
{
- blk_status_t ret;
- int srcu_idx;
-
- might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
+ blk_status_t ret =
+ __blk_mq_try_issue_directly(hctx, rq, false, true);
- hctx_lock(hctx, &srcu_idx);
-
- ret = __blk_mq_try_issue_directly(hctx, rq, false, true);
if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
blk_mq_request_bypass_insert(rq, false, true);
else if (ret != BLK_STS_OK)
blk_mq_end_request(rq, ret);
-
- hctx_unlock(hctx, srcu_idx);
}
static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
{
blk_status_t ret;
- int srcu_idx;
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
- hctx_lock(hctx, &srcu_idx);
- ret = __blk_mq_try_issue_directly(hctx, rq, true, last);
- hctx_unlock(hctx, srcu_idx);
-
+ blk_mq_run_dispatch_ops(hctx,
+ ret = __blk_mq_try_issue_directly(hctx, rq, true, last));
return ret;
}
@@ -2826,7 +2788,8 @@ void blk_mq_submit_bio(struct bio *bio)
(q->nr_hw_queues == 1 || !is_sync)))
blk_mq_sched_insert_request(rq, false, true, true);
else
- blk_mq_try_issue_directly(rq->mq_hctx, rq);
+ blk_mq_run_dispatch_ops(rq->mq_hctx,
+ blk_mq_try_issue_directly(rq->mq_hctx, rq));
}
/**
diff --git a/block/blk-mq.h b/block/blk-mq.h
index d516c7a46f57..e4c396204928 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -374,5 +374,21 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
return __blk_mq_active_requests(hctx) < depth;
}
+/* run the code block in @dispatch_ops with rcu/srcu read lock held */
+#define blk_mq_run_dispatch_ops(hctx, dispatch_ops) \
+do { \
+ if (!((hctx)->flags & BLK_MQ_F_BLOCKING)) { \
+ rcu_read_lock(); \
+ (dispatch_ops); \
+ rcu_read_unlock(); \
+ } else { \
+ int srcu_idx; \
+ \
+ might_sleep(); \
+ srcu_idx = srcu_read_lock((hctx)->srcu); \
+ (dispatch_ops); \
+ srcu_read_unlock((hctx)->srcu, srcu_idx); \
+ } \
+} while (0)
#endif
--
2.31.1
* [PATCH 2/4] blk-mq: move srcu from blk_mq_hw_ctx to request_queue
2021-12-03 13:15 [PATCH 0/4] blk-mq: improve dispatch lock & quiesce implementation Ming Lei
2021-12-03 13:15 ` [PATCH 1/4] blk-mq: remove hctx_lock and hctx_unlock Ming Lei
@ 2021-12-03 13:15 ` Ming Lei
2021-12-03 13:15 ` [PATCH 3/4] blk-mq: pass request queue to blk_mq_run_dispatch_ops Ming Lei
` (3 subsequent siblings)
5 siblings, 0 replies; 10+ messages in thread
From: Ming Lei @ 2021-12-03 13:15 UTC (permalink / raw)
To: Jens Axboe; +Cc: linux-block, Ming Lei
In case of BLK_MQ_F_BLOCKING, a per-hctx srcu is used to protect the
dispatch critical section. However, this srcu instance sits at the end of
the hctx and usually occupies a standalone cacheline, which is often cold.
Inside srcu_read_lock() and srcu_read_unlock(), writes always go to the
indirect percpu variable, which is allocated from the heap rather than
embedded, and srcu->srcu_idx is only read in srcu_read_lock(). So it
doesn't matter whether the srcu structure lives in the hctx or in the
request queue.
Switch to a per-request-queue srcu for protecting dispatch, which
simplifies quiesce a lot, not to mention that quiesce is always done
queue-wide anyway.
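As a minimal sketch of the resulting layout (condensed from the hunks below),
srcu becomes a flexible array member at the end of struct request_queue, and
only queues created with BLK_MQ_F_BLOCKING are allocated from a slab cache
that reserves room for it:
        /* dedicated cache with space for the trailing srcu[] member */
        blk_requestq_srcu_cachep = kmem_cache_create("request_queue_srcu",
                        sizeof(struct request_queue) +
                        sizeof(struct srcu_struct), 0, SLAB_PANIC, NULL);
        q = kmem_cache_alloc_node(blk_get_queue_kmem_cache(alloc_srcu),
                                  GFP_KERNEL | __GFP_ZERO, node_id);
        if (alloc_srcu) {
                blk_queue_flag_set(QUEUE_FLAG_HAS_SRCU, q);
                if (init_srcu_struct(q->srcu) != 0)
                        goto fail_q;
        }
Quiesce then becomes a single queue-wide synchronization:
synchronize_srcu(q->srcu) for blocking queues, synchronize_rcu() otherwise.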
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
block/blk-core.c | 27 ++++++++++++++++++++++-----
block/blk-mq-sysfs.c | 2 --
block/blk-mq.c | 37 ++++++++-----------------------------
block/blk-mq.h | 4 ++--
block/blk-sysfs.c | 3 ++-
block/blk.h | 10 +++++++++-
block/genhd.c | 2 +-
include/linux/blk-mq.h | 8 --------
include/linux/blkdev.h | 9 +++++++++
9 files changed, 53 insertions(+), 49 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index b0660c9df852..10619fd83c1b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -66,6 +66,7 @@ DEFINE_IDA(blk_queue_ida);
* For queue allocation
*/
struct kmem_cache *blk_requestq_cachep;
+struct kmem_cache *blk_requestq_srcu_cachep;
/*
* Controlling structure to kblockd
@@ -437,21 +438,27 @@ static void blk_timeout_work(struct work_struct *work)
{
}
-struct request_queue *blk_alloc_queue(int node_id)
+struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
{
struct request_queue *q;
int ret;
- q = kmem_cache_alloc_node(blk_requestq_cachep,
- GFP_KERNEL | __GFP_ZERO, node_id);
+ q = kmem_cache_alloc_node(blk_get_queue_kmem_cache(alloc_srcu),
+ GFP_KERNEL | __GFP_ZERO, node_id);
if (!q)
return NULL;
+ if (alloc_srcu) {
+ blk_queue_flag_set(QUEUE_FLAG_HAS_SRCU, q);
+ if (init_srcu_struct(q->srcu) != 0)
+ goto fail_q;
+ }
+
q->last_merge = NULL;
q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL);
if (q->id < 0)
- goto fail_q;
+ goto fail_srcu;
ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, 0);
if (ret)
@@ -508,8 +515,11 @@ struct request_queue *blk_alloc_queue(int node_id)
bioset_exit(&q->bio_split);
fail_id:
ida_simple_remove(&blk_queue_ida, q->id);
+fail_srcu:
+ if (alloc_srcu)
+ cleanup_srcu_struct(q->srcu);
fail_q:
- kmem_cache_free(blk_requestq_cachep, q);
+ kmem_cache_free(blk_get_queue_kmem_cache(alloc_srcu), q);
return NULL;
}
@@ -1301,6 +1311,9 @@ int __init blk_dev_init(void)
sizeof_field(struct request, cmd_flags));
BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
sizeof_field(struct bio, bi_opf));
+ BUILD_BUG_ON(ALIGN(offsetof(struct request_queue, srcu),
+ __alignof__(struct request_queue)) !=
+ sizeof(struct request_queue));
/* used for unplugging and affects IO latency/throughput - HIGHPRI */
kblockd_workqueue = alloc_workqueue("kblockd",
@@ -1311,6 +1324,10 @@ int __init blk_dev_init(void)
blk_requestq_cachep = kmem_cache_create("request_queue",
sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
+ blk_requestq_srcu_cachep = kmem_cache_create("request_queue_srcu",
+ sizeof(struct request_queue) +
+ sizeof(struct srcu_struct), 0, SLAB_PANIC, NULL);
+
blk_debugfs_root = debugfs_create_dir("block", NULL);
return 0;
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 253c857cba47..674786574075 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -36,8 +36,6 @@ static void blk_mq_hw_sysfs_release(struct kobject *kobj)
struct blk_mq_hw_ctx *hctx = container_of(kobj, struct blk_mq_hw_ctx,
kobj);
- if (hctx->flags & BLK_MQ_F_BLOCKING)
- cleanup_srcu_struct(hctx->srcu);
blk_free_flush_queue(hctx->fq);
sbitmap_free(&hctx->ctx_map);
free_cpumask_var(hctx->cpumask);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index a9d69e1dea8b..2186971c683b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -260,17 +260,9 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
*/
void blk_mq_wait_quiesce_done(struct request_queue *q)
{
- struct blk_mq_hw_ctx *hctx;
- unsigned int i;
- bool rcu = false;
-
- queue_for_each_hw_ctx(q, hctx, i) {
- if (hctx->flags & BLK_MQ_F_BLOCKING)
- synchronize_srcu(hctx->srcu);
- else
- rcu = true;
- }
- if (rcu)
+ if (blk_queue_has_srcu(q))
+ synchronize_srcu(q->srcu);
+ else
synchronize_rcu();
}
EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done);
@@ -3400,20 +3392,6 @@ static void blk_mq_exit_hw_queues(struct request_queue *q,
}
}
-static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
-{
- int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
-
- BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
- __alignof__(struct blk_mq_hw_ctx)) !=
- sizeof(struct blk_mq_hw_ctx));
-
- if (tag_set->flags & BLK_MQ_F_BLOCKING)
- hw_ctx_size += sizeof(struct srcu_struct);
-
- return hw_ctx_size;
-}
-
static int blk_mq_init_hctx(struct request_queue *q,
struct blk_mq_tag_set *set,
struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
@@ -3451,7 +3429,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
struct blk_mq_hw_ctx *hctx;
gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
- hctx = kzalloc_node(blk_mq_hw_ctx_size(set), gfp, node);
+ hctx = kzalloc_node(sizeof(struct blk_mq_hw_ctx), gfp, node);
if (!hctx)
goto fail_alloc_hctx;
@@ -3493,8 +3471,6 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
if (!hctx->fq)
goto free_bitmap;
- if (hctx->flags & BLK_MQ_F_BLOCKING)
- init_srcu_struct(hctx->srcu);
blk_mq_hctx_kobj_init(hctx);
return hctx;
@@ -3830,7 +3806,7 @@ static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
struct request_queue *q;
int ret;
- q = blk_alloc_queue(set->numa_node);
+ q = blk_alloc_queue(set->numa_node, set->flags & BLK_MQ_F_BLOCKING);
if (!q)
return ERR_PTR(-ENOMEM);
q->queuedata = queuedata;
@@ -3979,6 +3955,9 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
struct request_queue *q)
{
+ WARN_ON_ONCE(blk_queue_has_srcu(q) !=
+ !!(set->flags & BLK_MQ_F_BLOCKING));
+
/* mark the queue as mq asap */
q->mq_ops = set->ops;
diff --git a/block/blk-mq.h b/block/blk-mq.h
index e4c396204928..792f0b29c6eb 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -385,9 +385,9 @@ do { \
int srcu_idx; \
\
might_sleep(); \
- srcu_idx = srcu_read_lock((hctx)->srcu); \
+ srcu_idx = srcu_read_lock((hctx)->queue->srcu); \
(dispatch_ops); \
- srcu_read_unlock((hctx)->srcu, srcu_idx); \
+ srcu_read_unlock((hctx)->queue->srcu, srcu_idx); \
} \
} while (0)
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 4622da4bb992..3e6357321225 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -735,7 +735,8 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head)
{
struct request_queue *q = container_of(rcu_head, struct request_queue,
rcu_head);
- kmem_cache_free(blk_requestq_cachep, q);
+
+ kmem_cache_free(blk_get_queue_kmem_cache(blk_queue_has_srcu(q)), q);
}
/* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */
diff --git a/block/blk.h b/block/blk.h
index a55d82c3d1c2..0114e18b9903 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -27,6 +27,7 @@ struct blk_flush_queue {
};
extern struct kmem_cache *blk_requestq_cachep;
+extern struct kmem_cache *blk_requestq_srcu_cachep;
extern struct kobj_type blk_queue_ktype;
extern struct ida blk_queue_ida;
@@ -424,7 +425,14 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
struct page *page, unsigned int len, unsigned int offset,
unsigned int max_sectors, bool *same_page);
-struct request_queue *blk_alloc_queue(int node_id);
+static inline struct kmem_cache *blk_get_queue_kmem_cache(bool srcu)
+{
+ if (srcu)
+ return blk_requestq_srcu_cachep;
+ return blk_requestq_cachep;
+}
+struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu);
+
int disk_scan_partitions(struct gendisk *disk, fmode_t mode);
int disk_alloc_events(struct gendisk *disk);
diff --git a/block/genhd.c b/block/genhd.c
index 5179a4f00fba..3c139a1b6f04 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1338,7 +1338,7 @@ struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass)
struct request_queue *q;
struct gendisk *disk;
- q = blk_alloc_queue(node);
+ q = blk_alloc_queue(node, false);
if (!q)
return NULL;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 1b87b7c8bbff..bfc3cc61f653 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -4,7 +4,6 @@
#include <linux/blkdev.h>
#include <linux/sbitmap.h>
-#include <linux/srcu.h>
#include <linux/lockdep.h>
#include <linux/scatterlist.h>
#include <linux/prefetch.h>
@@ -375,13 +374,6 @@ struct blk_mq_hw_ctx {
* q->unused_hctx_list.
*/
struct list_head hctx_list;
-
- /**
- * @srcu: Sleepable RCU. Use as lock when type of the hardware queue is
- * blocking (BLK_MQ_F_BLOCKING). Must be the last member - see also
- * blk_mq_hw_ctx_size().
- */
- struct srcu_struct srcu[];
};
/**
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 0a4416ef4fbf..c80cfaefc0a8 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -16,6 +16,7 @@
#include <linux/percpu-refcount.h>
#include <linux/blkzoned.h>
#include <linux/sbitmap.h>
+#include <linux/srcu.h>
struct module;
struct request_queue;
@@ -373,11 +374,18 @@ struct request_queue {
* devices that do not have multiple independent access ranges.
*/
struct blk_independent_access_ranges *ia_ranges;
+
+ /**
+ * @srcu: Sleepable RCU. Use as lock when type of the request queue
+ * is blocking (BLK_MQ_F_BLOCKING). Must be the last member
+ */
+ struct srcu_struct srcu[];
};
/* Keep blk_queue_flag_name[] in sync with the definitions below */
#define QUEUE_FLAG_STOPPED 0 /* queue is stopped */
#define QUEUE_FLAG_DYING 1 /* queue being torn down */
+#define QUEUE_FLAG_HAS_SRCU 2 /* SRCU is allocated */
#define QUEUE_FLAG_NOMERGES 3 /* disable merge attempts */
#define QUEUE_FLAG_SAME_COMP 4 /* complete on same CPU-group */
#define QUEUE_FLAG_FAIL_IO 5 /* fake timeout */
@@ -415,6 +423,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
#define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
#define blk_queue_dying(q) test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags)
+#define blk_queue_has_srcu(q) test_bit(QUEUE_FLAG_HAS_SRCU, &(q)->queue_flags)
#define blk_queue_dead(q) test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags)
#define blk_queue_init_done(q) test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags)
#define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
--
2.31.1
* [PATCH 3/4] blk-mq: pass request queue to blk_mq_run_dispatch_ops
2021-12-03 13:15 [PATCH 0/4] blk-mq: improve dispatch lock & quiesce implementation Ming Lei
2021-12-03 13:15 ` [PATCH 1/4] blk-mq: remove hctx_lock and hctx_unlock Ming Lei
2021-12-03 13:15 ` [PATCH 2/4] blk-mq: move srcu from blk_mq_hw_ctx to request_queue Ming Lei
@ 2021-12-03 13:15 ` Ming Lei
2021-12-03 13:15 ` [PATCH 4/4] blk-mq: run dispatch lock once in case of issuing from list Ming Lei
` (2 subsequent siblings)
5 siblings, 0 replies; 10+ messages in thread
From: Ming Lei @ 2021-12-03 13:15 UTC (permalink / raw)
To: Jens Axboe; +Cc: linux-block, Ming Lei
We have switched to allocating the srcu structure in the request queue, so
it is fine to pass the request queue to blk_mq_run_dispatch_ops().
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
block/blk-mq.c | 9 +++++----
block/blk-mq.h | 8 ++++----
2 files changed, 9 insertions(+), 8 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 2186971c683b..03dc76f92555 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1925,7 +1925,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
*/
WARN_ON_ONCE(in_interrupt());
- blk_mq_run_dispatch_ops(hctx, blk_mq_sched_dispatch_requests(hctx));
+ blk_mq_run_dispatch_ops(hctx->queue,
+ blk_mq_sched_dispatch_requests(hctx));
}
static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
@@ -2047,7 +2048,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
* And queue will be rerun in blk_mq_unquiesce_queue() if it is
* quiesced.
*/
- blk_mq_run_dispatch_ops(hctx,
+ blk_mq_run_dispatch_ops(hctx->queue,
need_run = !blk_queue_quiesced(hctx->queue) &&
blk_mq_hctx_has_pending(hctx));
@@ -2466,7 +2467,7 @@ static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
blk_status_t ret;
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
- blk_mq_run_dispatch_ops(hctx,
+ blk_mq_run_dispatch_ops(rq->q,
ret = __blk_mq_try_issue_directly(hctx, rq, true, last));
return ret;
}
@@ -2780,7 +2781,7 @@ void blk_mq_submit_bio(struct bio *bio)
(q->nr_hw_queues == 1 || !is_sync)))
blk_mq_sched_insert_request(rq, false, true, true);
else
- blk_mq_run_dispatch_ops(rq->mq_hctx,
+ blk_mq_run_dispatch_ops(rq->q,
blk_mq_try_issue_directly(rq->mq_hctx, rq));
}
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 792f0b29c6eb..d62004e2d531 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -375,9 +375,9 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
}
/* run the code block in @dispatch_ops with rcu/srcu read lock held */
-#define blk_mq_run_dispatch_ops(hctx, dispatch_ops) \
+#define blk_mq_run_dispatch_ops(q, dispatch_ops) \
do { \
- if (!((hctx)->flags & BLK_MQ_F_BLOCKING)) { \
+ if (!blk_queue_has_srcu(q)) { \
rcu_read_lock(); \
(dispatch_ops); \
rcu_read_unlock(); \
@@ -385,9 +385,9 @@ do { \
int srcu_idx; \
\
might_sleep(); \
- srcu_idx = srcu_read_lock((hctx)->queue->srcu); \
+ srcu_idx = srcu_read_lock((q)->srcu); \
(dispatch_ops); \
- srcu_read_unlock((hctx)->queue->srcu, srcu_idx); \
+ srcu_read_unlock((q)->srcu, srcu_idx); \
} \
} while (0)
--
2.31.1
* [PATCH 4/4] blk-mq: run dispatch lock once in case of issuing from list
2021-12-03 13:15 [PATCH 0/4] blk-mq: improve dispatch lock & quiesce implementation Ming Lei
` (2 preceding siblings ...)
2021-12-03 13:15 ` [PATCH 3/4] blk-mq: pass request queue to blk_mq_run_dispatch_ops Ming Lei
@ 2021-12-03 13:15 ` Ming Lei
2021-12-03 13:57 ` [PATCH 0/4] blk-mq: improve dispatch lock & quiesce implementation Jens Axboe
2021-12-03 19:12 ` Jens Axboe
5 siblings, 0 replies; 10+ messages in thread
From: Ming Lei @ 2021-12-03 13:15 UTC (permalink / raw)
To: Jens Axboe; +Cc: linux-block, Ming Lei
It isn't necessary to enter blk_mq_run_dispatch_ops() separately for each
request issued directly; it is enough to enter it once when issuing the
whole list.
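As a minimal sketch of the change (condensed from the hunks below), the
rcu/srcu section is entered once around the whole list in
blk_mq_sched_insert_requests(), and blk_mq_request_issue_directly() no
longer enters it per request:
        if (!hctx->dispatch_busy && !run_queue_async) {
                blk_mq_run_dispatch_ops(hctx->queue,
                        blk_mq_try_issue_list_directly(hctx, list));
                if (list_empty(list))
                        goto out;
        }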
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
block/blk-mq-sched.c | 3 ++-
block/blk-mq.c | 14 ++++++--------
2 files changed, 8 insertions(+), 9 deletions(-)
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 0d7257848f7e..55488ba97823 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -475,7 +475,8 @@ void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx,
* us one extra enqueue & dequeue to sw queue.
*/
if (!hctx->dispatch_busy && !run_queue_async) {
- blk_mq_try_issue_list_directly(hctx, list);
+ blk_mq_run_dispatch_ops(hctx->queue,
+ blk_mq_try_issue_list_directly(hctx, list));
if (list_empty(list))
goto out;
}
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 03dc76f92555..fa464a3e2f9a 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2464,12 +2464,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
{
- blk_status_t ret;
- struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
-
- blk_mq_run_dispatch_ops(rq->q,
- ret = __blk_mq_try_issue_directly(hctx, rq, true, last));
- return ret;
+ return __blk_mq_try_issue_directly(rq->mq_hctx, rq, true, last);
}
static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule)
@@ -2526,7 +2521,8 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
plug->rq_count = 0;
if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) {
- blk_mq_plug_issue_direct(plug, false);
+ blk_mq_run_dispatch_ops(plug->mq_list->q,
+ blk_mq_plug_issue_direct(plug, false));
if (rq_list_empty(plug->mq_list))
return;
}
@@ -2867,7 +2863,9 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *
* bypass a potential scheduler on the bottom device for
* insert.
*/
- return blk_mq_request_issue_directly(rq, true);
+ blk_mq_run_dispatch_ops(rq->q,
+ ret = blk_mq_request_issue_directly(rq, true));
+ return ret;
}
EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
--
2.31.1
* Re: [PATCH 0/4] blk-mq: improve dispatch lock & quiesce implementation
2021-12-03 13:15 [PATCH 0/4] blk-mq: improve dispatch lock & quiesce implementation Ming Lei
` (3 preceding siblings ...)
2021-12-03 13:15 ` [PATCH 4/4] blk-mq: run dispatch lock once in case of issuing from list Ming Lei
@ 2021-12-03 13:57 ` Jens Axboe
2021-12-03 19:12 ` Jens Axboe
5 siblings, 0 replies; 10+ messages in thread
From: Jens Axboe @ 2021-12-03 13:57 UTC (permalink / raw)
To: Ming Lei; +Cc: linux-block
On 12/3/21 6:15 AM, Ming Lei wrote:
> Hello,
>
> The 1st patch replaces hctx_lock/hctx_unlock with
> blk_mq_run_dispatch_ops(), so that the fast dispatch code path gets
> optimized a bit.
>
> The 2nd patch moves srcu from hctx into request queue.
>
> The last two patches call blk_mq_run_dispatch_ops() only once when
> issuing requests directly from a list.
Looks good, and makes it easier to do the list issue on top of as well.
I'll take a closer look, thanks.
--
Jens Axboe
* Re: [PATCH 0/4] blk-mq: improve dispatch lock & quiesce implementation
2021-12-03 13:15 [PATCH 0/4] blk-mq: improve dispatch lock & quiesce implementation Ming Lei
` (4 preceding siblings ...)
2021-12-03 13:57 ` [PATCH 0/4] blk-mq: improve dispatch lock & quiesce implementation Jens Axboe
@ 2021-12-03 19:12 ` Jens Axboe
5 siblings, 0 replies; 10+ messages in thread
From: Jens Axboe @ 2021-12-03 19:12 UTC (permalink / raw)
To: Ming Lei; +Cc: linux-block
On Fri, 3 Dec 2021 21:15:30 +0800, Ming Lei wrote:
> The 1st patch replaces hctx_lock/hctx_unlock with
> blk_mq_run_dispatch_ops(), so that the fast dispatch code path gets
> optimized a bit.
>
> The 2nd patch moves srcu from hctx into request queue.
>
> The last two patches call blk_mq_run_dispatch_ops() only once when
> issuing requests directly from a list.
>
> [...]
Applied, thanks!
[1/4] blk-mq: remove hctx_lock and hctx_unlock
commit: 0ae36d0d96a27b748ade2b2f1550e52e9bafb137
[2/4] blk-mq: move srcu from blk_mq_hw_ctx to request_queue
commit: 8770079f7b1c48b6cf4a9853c3f5c071f5a8df55
[3/4] blk-mq: pass request queue to blk_mq_run_dispatch_ops
commit: ffa0cfa2b689413db2a97fa2dad90218befa66f9
[4/4] blk-mq: run dispatch lock once in case of issuing from list
commit: daa7d82f3e50cfe005d620672dfe939e8ed804b1
Best regards,
--
Jens Axboe
* Re: [PATCH 1/4] blk-mq: remove hctx_lock and hctx_unlock
[not found] ` <CGME20211206103121eucas1p2ac12934f15129fe77d7f8d95c02fe447@eucas1p2.samsung.com>
@ 2021-12-06 10:31 ` Marek Szyprowski
2021-12-06 11:12 ` Ming Lei
0 siblings, 1 reply; 10+ messages in thread
From: Marek Szyprowski @ 2021-12-06 10:31 UTC (permalink / raw)
To: Ming Lei, Jens Axboe; +Cc: linux-block
Hi,
On 03.12.2021 14:15, Ming Lei wrote:
> Remove hctx_lock and hctx_unlock, and add one helper,
> blk_mq_run_dispatch_ops(), which runs the code block passed in
> dispatch_ops with the rcu/srcu read lock held.
>
> Compared with hctx_lock()/hctx_unlock():
>
> 1) two branches are reduced to one, so we only need to check
> (hctx->flags & BLK_MQ_F_BLOCKING) once when running one dispatch_ops
>
> 2) srcu_idx doesn't need to be touched in the non-blocking case
>
> 3) might_sleep_if() can be moved to the blocking branch
>
> Also put the added blk_mq_run_dispatch_ops() in a private header, so that
> the following patch can use it outside of blk-mq.c.
>
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
This patch landed in linux next-20211206 as commit 2a904d00855f
("blk-mq: remove hctx_lock and hctx_unlock"). It introduces a following
'BUG' warning on my test systems (ARM/ARM64-based boards with rootfs on
SD card or eMMC):
BUG: sleeping function called from invalid context at block/blk-mq.c:2060
in_atomic(): 1, irqs_disabled(): 128, non_block: 0, pid: 249, name:
kworker/0:3H
preempt_count: 1, expected: 0
RCU nest depth: 0, expected: 0
4 locks held by kworker/0:3H/249:
#0: c1d782a8 ((wq_completion)mmc_complete){+.+.}-{0:0}, at:
process_one_work+0x21c/0x7ec
#1: c3b59f18 ((work_completion)(&mq->complete_work)){+.+.}-{0:0}, at:
process_one_work+0x21c/0x7ec
#2: c1d7858c (&mq->complete_lock){+.+.}-{3:3}, at:
mmc_blk_mq_complete_prev_req.part.3+0x2c/0x234
#3: c1f7a1b4 (&fq->mq_flush_lock){....}-{2:2}, at:
mq_flush_data_end_io+0x68/0x124
irq event stamp: 16306
hardirqs last enabled at (16305): [<c01da058>] ktime_get+0x178/0x19c
hardirqs last disabled at (16306): [<c0b7a090>]
_raw_spin_lock_irqsave+0x24/0x60
softirqs last enabled at (16300): [<c01016fc>] __do_softirq+0x4cc/0x5ec
softirqs last disabled at (16289): [<c012f95c>] do_softirq+0xb8/0xc4
Preemption disabled at:
[<00000000>] 0x0
CPU: 0 PID: 249 Comm: kworker/0:3H Not tainted
5.16.0-rc3-00080-g2a904d00855f #4254
Hardware name: Samsung Exynos (Flattened Device Tree)
Workqueue: mmc_complete mmc_blk_mq_complete_work
[<c01110d0>] (unwind_backtrace) from [<c010cab8>] (show_stack+0x10/0x14)
[<c010cab8>] (show_stack) from [<c0b6c728>] (dump_stack_lvl+0x58/0x70)
[<c0b6c728>] (dump_stack_lvl) from [<c01594d0>]
(__might_resched+0x248/0x298)
[<c01594d0>] (__might_resched) from [<c0512cec>]
(blk_mq_run_hw_queue+0xac/0x230)
[<c0512cec>] (blk_mq_run_hw_queue) from [<c050f660>]
(__blk_mq_free_request+0x84/0x90)
[<c050f660>] (__blk_mq_free_request) from [<c05086b8>]
(blk_flush_complete_seq+0x250/0x260)
[<c05086b8>] (blk_flush_complete_seq) from [<c0508b34>]
(mq_flush_data_end_io+0x80/0x124)
[<c0508b34>] (mq_flush_data_end_io) from [<c08a0c28>]
(mmc_blk_mq_post_req+0xc4/0xd8)
[<c08a0c28>] (mmc_blk_mq_post_req) from [<c08a0e68>]
(mmc_blk_mq_complete_prev_req.part.3+0x22c/0x234)
[<c08a0e68>] (mmc_blk_mq_complete_prev_req.part.3) from [<c0148980>]
(process_one_work+0x2c8/0x7ec)
[<c0148980>] (process_one_work) from [<c0148ef4>] (worker_thread+0x50/0x584)
[<c0148ef4>] (worker_thread) from [<c015118c>] (kthread+0x13c/0x19c)
[<c015118c>] (kthread) from [<c0100108>] (ret_from_fork+0x14/0x2c)
Exception stack(0xc3b59fb0 to 0xc3b59ff8)
9fa0: 00000000 00000000 00000000
00000000
9fc0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000
00000000
9fe0: 00000000 00000000 00000000 00000000 00000013 00000000
Please let me know if I can do anything to help debug this issue.
> ---
> block/blk-mq.c | 57 +++++++++-----------------------------------------
> block/blk-mq.h | 16 ++++++++++++++
> 2 files changed, 26 insertions(+), 47 deletions(-)
>
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index fc4520e992b1..a9d69e1dea8b 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -1071,26 +1071,6 @@ void blk_mq_complete_request(struct request *rq)
> }
> EXPORT_SYMBOL(blk_mq_complete_request);
>
> -static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
> - __releases(hctx->srcu)
> -{
> - if (!(hctx->flags & BLK_MQ_F_BLOCKING))
> - rcu_read_unlock();
> - else
> - srcu_read_unlock(hctx->srcu, srcu_idx);
> -}
> -
> -static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
> - __acquires(hctx->srcu)
> -{
> - if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
> - /* shut up gcc false positive */
> - *srcu_idx = 0;
> - rcu_read_lock();
> - } else
> - *srcu_idx = srcu_read_lock(hctx->srcu);
> -}
> -
> /**
> * blk_mq_start_request - Start processing a request
> * @rq: Pointer to request to be started
> @@ -1947,19 +1927,13 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
> */
> static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
> {
> - int srcu_idx;
> -
> /*
> * We can't run the queue inline with ints disabled. Ensure that
> * we catch bad users of this early.
> */
> WARN_ON_ONCE(in_interrupt());
>
> - might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
> -
> - hctx_lock(hctx, &srcu_idx);
> - blk_mq_sched_dispatch_requests(hctx);
> - hctx_unlock(hctx, srcu_idx);
> + blk_mq_run_dispatch_ops(hctx, blk_mq_sched_dispatch_requests(hctx));
> }
>
> static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
> @@ -2071,7 +2045,6 @@ EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
> */
> void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
> {
> - int srcu_idx;
> bool need_run;
>
> /*
> @@ -2082,10 +2055,9 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
> * And queue will be rerun in blk_mq_unquiesce_queue() if it is
> * quiesced.
> */
> - hctx_lock(hctx, &srcu_idx);
> - need_run = !blk_queue_quiesced(hctx->queue) &&
> - blk_mq_hctx_has_pending(hctx);
> - hctx_unlock(hctx, srcu_idx);
> + blk_mq_run_dispatch_ops(hctx,
> + need_run = !blk_queue_quiesced(hctx->queue) &&
> + blk_mq_hctx_has_pending(hctx));
>
> if (need_run)
> __blk_mq_delay_run_hw_queue(hctx, async, 0);
> @@ -2488,32 +2460,22 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
> static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
> struct request *rq)
> {
> - blk_status_t ret;
> - int srcu_idx;
> -
> - might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
> + blk_status_t ret =
> + __blk_mq_try_issue_directly(hctx, rq, false, true);
>
> - hctx_lock(hctx, &srcu_idx);
> -
> - ret = __blk_mq_try_issue_directly(hctx, rq, false, true);
> if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
> blk_mq_request_bypass_insert(rq, false, true);
> else if (ret != BLK_STS_OK)
> blk_mq_end_request(rq, ret);
> -
> - hctx_unlock(hctx, srcu_idx);
> }
>
> static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
> {
> blk_status_t ret;
> - int srcu_idx;
> struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
>
> - hctx_lock(hctx, &srcu_idx);
> - ret = __blk_mq_try_issue_directly(hctx, rq, true, last);
> - hctx_unlock(hctx, srcu_idx);
> -
> + blk_mq_run_dispatch_ops(hctx,
> + ret = __blk_mq_try_issue_directly(hctx, rq, true, last));
> return ret;
> }
>
> @@ -2826,7 +2788,8 @@ void blk_mq_submit_bio(struct bio *bio)
> (q->nr_hw_queues == 1 || !is_sync)))
> blk_mq_sched_insert_request(rq, false, true, true);
> else
> - blk_mq_try_issue_directly(rq->mq_hctx, rq);
> + blk_mq_run_dispatch_ops(rq->mq_hctx,
> + blk_mq_try_issue_directly(rq->mq_hctx, rq));
> }
>
> /**
> diff --git a/block/blk-mq.h b/block/blk-mq.h
> index d516c7a46f57..e4c396204928 100644
> --- a/block/blk-mq.h
> +++ b/block/blk-mq.h
> @@ -374,5 +374,21 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
> return __blk_mq_active_requests(hctx) < depth;
> }
>
> +/* run the code block in @dispatch_ops with rcu/srcu read lock held */
> +#define blk_mq_run_dispatch_ops(hctx, dispatch_ops) \
> +do { \
> + if (!((hctx)->flags & BLK_MQ_F_BLOCKING)) { \
> + rcu_read_lock(); \
> + (dispatch_ops); \
> + rcu_read_unlock(); \
> + } else { \
> + int srcu_idx; \
> + \
> + might_sleep(); \
> + srcu_idx = srcu_read_lock((hctx)->srcu); \
> + (dispatch_ops); \
> + srcu_read_unlock((hctx)->srcu, srcu_idx); \
> + } \
> +} while (0)
>
> #endif
Best regards
--
Marek Szyprowski, PhD
Samsung R&D Institute Poland
* Re: [PATCH 1/4] blk-mq: remove hctx_lock and hctx_unlock
2021-12-06 10:31 ` Marek Szyprowski
@ 2021-12-06 11:12 ` Ming Lei
2021-12-06 11:27 ` Marek Szyprowski
0 siblings, 1 reply; 10+ messages in thread
From: Ming Lei @ 2021-12-06 11:12 UTC (permalink / raw)
To: Marek Szyprowski; +Cc: Jens Axboe, linux-block
On Mon, Dec 06, 2021 at 11:31:21AM +0100, Marek Szyprowski wrote:
> Hi,
>
> On 03.12.2021 14:15, Ming Lei wrote:
> > Remove hctx_lock and hctx_unlock, and add one helper,
> > blk_mq_run_dispatch_ops(), which runs the code block passed in
> > dispatch_ops with the rcu/srcu read lock held.
> >
> > Compared with hctx_lock()/hctx_unlock():
> >
> > 1) two branches are reduced to one, so we only need to check
> > (hctx->flags & BLK_MQ_F_BLOCKING) once when running one dispatch_ops
> >
> > 2) srcu_idx doesn't need to be touched in the non-blocking case
> >
> > 3) might_sleep_if() can be moved to the blocking branch
> >
> > Also put the added blk_mq_run_dispatch_ops() in a private header, so that
> > the following patch can use it outside of blk-mq.c.
> >
> > Signed-off-by: Ming Lei <ming.lei@redhat.com>
>
> This patch landed in linux next-20211206 as commit 2a904d00855f
> ("blk-mq: remove hctx_lock and hctx_unlock"). It introduces a following
> 'BUG' warning on my test systems (ARM/ARM64-based boards with rootfs on
> SD card or eMMC):
>
> BUG: sleeping function called from invalid context at block/blk-mq.c:2060
> in_atomic(): 1, irqs_disabled(): 128, non_block: 0, pid: 249, name:
> kworker/0:3H
> preempt_count: 1, expected: 0
> RCU nest depth: 0, expected: 0
> 4 locks held by kworker/0:3H/249:
> #0: c1d782a8 ((wq_completion)mmc_complete){+.+.}-{0:0}, at:
> process_one_work+0x21c/0x7ec
> #1: c3b59f18 ((work_completion)(&mq->complete_work)){+.+.}-{0:0}, at:
> process_one_work+0x21c/0x7ec
> #2: c1d7858c (&mq->complete_lock){+.+.}-{3:3}, at:
> mmc_blk_mq_complete_prev_req.part.3+0x2c/0x234
> #3: c1f7a1b4 (&fq->mq_flush_lock){....}-{2:2}, at:
> mq_flush_data_end_io+0x68/0x124
It should be fixed by the attached patch.
From bce4d1bf7ab4ac4c04a65eca67705567e9d5f0c0 Mon Sep 17 00:00:00 2001
From: Ming Lei <ming.lei@redhat.com>
Date: Mon, 6 Dec 2021 15:54:11 +0800
Subject: [PATCH] blk-mq: don't run might_sleep() if the operation needn't
block
The operation protected via blk_mq_run_dispatch_ops() in blk_mq_run_hw_queue
won't sleep, so don't run might_sleep() for it.
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
block/blk-mq.c | 2 +-
block/blk-mq.h | 7 +++++--
2 files changed, 6 insertions(+), 3 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 537295f6e0e9..0bf3523dd1f5 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2048,7 +2048,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
* And queue will be rerun in blk_mq_unquiesce_queue() if it is
* quiesced.
*/
- blk_mq_run_dispatch_ops(hctx->queue,
+ __blk_mq_run_dispatch_ops(hctx->queue, false,
need_run = !blk_queue_quiesced(hctx->queue) &&
blk_mq_hctx_has_pending(hctx));
diff --git a/block/blk-mq.h b/block/blk-mq.h
index d62004e2d531..948791ea2a3e 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -375,7 +375,7 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
}
/* run the code block in @dispatch_ops with rcu/srcu read lock held */
-#define blk_mq_run_dispatch_ops(q, dispatch_ops) \
+#define __blk_mq_run_dispatch_ops(q, check_sleep, dispatch_ops) \
do { \
if (!blk_queue_has_srcu(q)) { \
rcu_read_lock(); \
@@ -384,11 +384,14 @@ do { \
} else { \
int srcu_idx; \
\
- might_sleep(); \
+ might_sleep_if(check_sleep); \
srcu_idx = srcu_read_lock((q)->srcu); \
(dispatch_ops); \
srcu_read_unlock((q)->srcu, srcu_idx); \
} \
} while (0)
+#define blk_mq_run_dispatch_ops(q, dispatch_ops) \
+ __blk_mq_run_dispatch_ops(q, true, dispatch_ops) \
+
#endif
--
2.31.1
Thanks,
Ming
* Re: [PATCH 1/4] blk-mq: remove hctx_lock and hctx_unlock
2021-12-06 11:12 ` Ming Lei
@ 2021-12-06 11:27 ` Marek Szyprowski
0 siblings, 0 replies; 10+ messages in thread
From: Marek Szyprowski @ 2021-12-06 11:27 UTC (permalink / raw)
To: Ming Lei; +Cc: Jens Axboe, linux-block
Hi,
On 06.12.2021 12:12, Ming Lei wrote:
> On Mon, Dec 06, 2021 at 11:31:21AM +0100, Marek Szyprowski wrote:
>> On 03.12.2021 14:15, Ming Lei wrote:
>>> Remove hctx_lock and hctx_unlock, and add one helper,
>>> blk_mq_run_dispatch_ops(), which runs the code block passed in
>>> dispatch_ops with the rcu/srcu read lock held.
>>>
>>> Compared with hctx_lock()/hctx_unlock():
>>>
>>> 1) two branches are reduced to one, so we only need to check
>>> (hctx->flags & BLK_MQ_F_BLOCKING) once when running one dispatch_ops
>>>
>>> 2) srcu_idx doesn't need to be touched in the non-blocking case
>>>
>>> 3) might_sleep_if() can be moved to the blocking branch
>>>
>>> Also put the added blk_mq_run_dispatch_ops() in a private header, so that
>>> the following patch can use it outside of blk-mq.c.
>>>
>>> Signed-off-by: Ming Lei <ming.lei@redhat.com>
>> This patch landed in linux next-20211206 as commit 2a904d00855f
>> ("blk-mq: remove hctx_lock and hctx_unlock"). It introduces a following
>> 'BUG' warning on my test systems (ARM/ARM64-based boards with rootfs on
>> SD card or eMMC):
>>
>> BUG: sleeping function called from invalid context at block/blk-mq.c:2060
>> in_atomic(): 1, irqs_disabled(): 128, non_block: 0, pid: 249, name:
>> kworker/0:3H
>> preempt_count: 1, expected: 0
>> RCU nest depth: 0, expected: 0
>> 4 locks held by kworker/0:3H/249:
>> #0: c1d782a8 ((wq_completion)mmc_complete){+.+.}-{0:0}, at:
>> process_one_work+0x21c/0x7ec
>> #1: c3b59f18 ((work_completion)(&mq->complete_work)){+.+.}-{0:0}, at:
>> process_one_work+0x21c/0x7ec
>> #2: c1d7858c (&mq->complete_lock){+.+.}-{3:3}, at:
>> mmc_blk_mq_complete_prev_req.part.3+0x2c/0x234
>> #3: c1f7a1b4 (&fq->mq_flush_lock){....}-{2:2}, at:
>> mq_flush_data_end_io+0x68/0x124
> It should be fixed by the attached patch.
>
> From bce4d1bf7ab4ac4c04a65eca67705567e9d5f0c0 Mon Sep 17 00:00:00 2001
> From: Ming Lei <ming.lei@redhat.com>
> Date: Mon, 6 Dec 2021 15:54:11 +0800
> Subject: [PATCH] blk-mq: don't run might_sleep() if the operation needn't
> blocking
>
> The operation protected via blk_mq_run_dispatch_ops() in blk_mq_run_hw_queue
> won't sleep, so don't run might_sleep() for it.
>
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
Confirmed, this fixed the issue.
Reported-by: Marek Szyprowski <m.szyprowski@samsung.com>
Tested-by: Marek Szyprowski <m.szyprowski@samsung.com>
> ---
> block/blk-mq.c | 2 +-
> block/blk-mq.h | 7 +++++--
> 2 files changed, 6 insertions(+), 3 deletions(-)
>
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 537295f6e0e9..0bf3523dd1f5 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -2048,7 +2048,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
> * And queue will be rerun in blk_mq_unquiesce_queue() if it is
> * quiesced.
> */
> - blk_mq_run_dispatch_ops(hctx->queue,
> + __blk_mq_run_dispatch_ops(hctx->queue, false,
> need_run = !blk_queue_quiesced(hctx->queue) &&
> blk_mq_hctx_has_pending(hctx));
>
> diff --git a/block/blk-mq.h b/block/blk-mq.h
> index d62004e2d531..948791ea2a3e 100644
> --- a/block/blk-mq.h
> +++ b/block/blk-mq.h
> @@ -375,7 +375,7 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
> }
>
> /* run the code block in @dispatch_ops with rcu/srcu read lock held */
> -#define blk_mq_run_dispatch_ops(q, dispatch_ops) \
> +#define __blk_mq_run_dispatch_ops(q, check_sleep, dispatch_ops) \
> do { \
> if (!blk_queue_has_srcu(q)) { \
> rcu_read_lock(); \
> @@ -384,11 +384,14 @@ do { \
> } else { \
> int srcu_idx; \
> \
> - might_sleep(); \
> + might_sleep_if(check_sleep); \
> srcu_idx = srcu_read_lock((q)->srcu); \
> (dispatch_ops); \
> srcu_read_unlock((q)->srcu, srcu_idx); \
> } \
> } while (0)
>
> +#define blk_mq_run_dispatch_ops(q, dispatch_ops) \
> + __blk_mq_run_dispatch_ops(q, true, dispatch_ops) \
> +
> #endif
Best regards
--
Marek Szyprowski, PhD
Samsung R&D Institute Poland
Thread overview: 10+ messages
2021-12-03 13:15 [PATCH 0/4] blk-mq: improve dispatch lock & quiesce implementation Ming Lei
2021-12-03 13:15 ` [PATCH 1/4] blk-mq: remove hctx_lock and hctx_unlock Ming Lei
[not found] ` <CGME20211206103121eucas1p2ac12934f15129fe77d7f8d95c02fe447@eucas1p2.samsung.com>
2021-12-06 10:31 ` Marek Szyprowski
2021-12-06 11:12 ` Ming Lei
2021-12-06 11:27 ` Marek Szyprowski
2021-12-03 13:15 ` [PATCH 2/4] blk-mq: move srcu from blk_mq_hw_ctx to request_queue Ming Lei
2021-12-03 13:15 ` [PATCH 3/4] blk-mq: pass request queue to blk_mq_run_dispatch_ops Ming Lei
2021-12-03 13:15 ` [PATCH 4/4] blk-mq: run dispatch lock once in case of issuing from list Ming Lei
2021-12-03 13:57 ` [PATCH 0/4] blk-mq: improve dispatch lock & quiesce implementation Jens Axboe
2021-12-03 19:12 ` Jens Axboe