Linux-Block Archive on lore.kernel.org
 help / color / Atom feed
* blk-mq: improvement CPU hotplug (simplified version) v2
@ 2020-05-18  6:39 Christoph Hellwig
  2020-05-18  6:39 ` [PATCH 1/9] blk-mq: split out a __blk_mq_get_driver_tag helper Christoph Hellwig
                   ` (9 more replies)
  0 siblings, 10 replies; 47+ messages in thread
From: Christoph Hellwig @ 2020-05-18  6:39 UTC (permalink / raw)
  Cc: linux-block, John Garry, Bart Van Assche, Hannes Reinecke,
	Thomas Gleixner

Hi all,

this series ensures I/O is quiesced before a CPU — and thus the managed
interrupt handler — is shut down.

This patchset tries to address the issue by the following approach:

 - before the last cpu in hctx->cpumask goes offline, mark this
   hctx as inactive

 - disable preemption while allocating a tag for a request, and after the
   tag is allocated, check if this hctx is inactive. If yes, give up the
   allocation and retry the allocation remotely from an online CPU

 - before hctx becomes inactive, drain all allocated requests on this
   hctx

The guts of the changes are from Ming Lei, I just did a bunch of prep
cleanups so that they can fit in more nicely.  The series also depends
on my "avoid a few q_usage_counter roundtrips v3" series.

Thanks to John Garry for running lots of tests on arm64 with previous
versions of these patches and for co-investigating all kinds of issues.

A git tree is available here:

    git://git.infradead.org/users/hch/block.git blk-mq-hotplug

Gitweb:

    http://git.infradead.org/users/hch/block.git/shortlog/refs/heads/blk-mq-hotplug

^ permalink raw reply	[flat|nested] 47+ messages in thread

* [PATCH 1/9] blk-mq: split out a __blk_mq_get_driver_tag helper
  2020-05-18  6:39 blk-mq: improvement CPU hotplug (simplified version) v2 Christoph Hellwig
@ 2020-05-18  6:39 ` Christoph Hellwig
  2020-05-18  6:39 ` [PATCH 2/9] blk-mq: remove the bio argument to ->prepare_request Christoph Hellwig
                   ` (8 subsequent siblings)
  9 siblings, 0 replies; 47+ messages in thread
From: Christoph Hellwig @ 2020-05-18  6:39 UTC (permalink / raw)
  Cc: linux-block, John Garry, Bart Van Assche, Hannes Reinecke,
	Thomas Gleixner

Allocation of the driver tag in the case of using a scheduler shares very
little code with the "normal" tag allocation.  Split out a new helper to
streamline this path, and untangle it from the complex normal tag
allocation.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-mq-tag.c | 27 +++++++++++++++++++++++++++
 block/blk-mq-tag.h |  8 ++++++++
 block/blk-mq.c     | 29 -----------------------------
 block/blk-mq.h     |  1 -
 4 files changed, 35 insertions(+), 30 deletions(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 586c9d6e904ab..e5b17300ec882 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -183,6 +183,33 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 	return tag + tag_offset;
 }
 
+bool __blk_mq_get_driver_tag(struct request *rq)
+{
+	struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags;
+	unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
+	bool shared = blk_mq_tag_busy(rq->mq_hctx);
+	int tag;
+
+	if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
+		bt = &rq->mq_hctx->tags->breserved_tags;
+		tag_offset = 0;
+	}
+
+	if (!hctx_may_queue(rq->mq_hctx, bt))
+		return false;
+	tag = __sbitmap_queue_get(bt);
+	if (tag == BLK_MQ_TAG_FAIL)
+		return false;
+
+	rq->tag = tag + tag_offset;
+	if (shared) {
+		rq->rq_flags |= RQF_MQ_INFLIGHT;
+		atomic_inc(&rq->mq_hctx->nr_active);
+	}
+	rq->mq_hctx->tags->rqs[rq->tag] = rq;
+	return true;
+}
+
 void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
 		    unsigned int tag)
 {
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 2b8321efb6820..c2ad3ed637106 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -25,6 +25,14 @@ struct blk_mq_tags {
 extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node, int alloc_policy);
 extern void blk_mq_free_tags(struct blk_mq_tags *tags);
 
+bool __blk_mq_get_driver_tag(struct request *rq);
+static inline bool blk_mq_get_driver_tag(struct request *rq)
+{
+	if (rq->tag != -1)
+		return true;
+	return __blk_mq_get_driver_tag(rq);
+}
+
 extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);
 extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
 			   unsigned int tag);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index cac11945f6023..5283c0105f6f2 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1015,35 +1015,6 @@ static inline unsigned int queued_to_index(unsigned int queued)
 	return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
 }
 
-bool blk_mq_get_driver_tag(struct request *rq)
-{
-	struct blk_mq_alloc_data data = {
-		.q = rq->q,
-		.hctx = rq->mq_hctx,
-		.flags = BLK_MQ_REQ_NOWAIT,
-		.cmd_flags = rq->cmd_flags,
-	};
-	bool shared;
-
-	if (rq->tag != -1)
-		return true;
-
-	if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
-		data.flags |= BLK_MQ_REQ_RESERVED;
-
-	shared = blk_mq_tag_busy(data.hctx);
-	rq->tag = blk_mq_get_tag(&data);
-	if (rq->tag >= 0) {
-		if (shared) {
-			rq->rq_flags |= RQF_MQ_INFLIGHT;
-			atomic_inc(&data.hctx->nr_active);
-		}
-		data.hctx->tags->rqs[rq->tag] = rq;
-	}
-
-	return rq->tag != -1;
-}
-
 static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
 				int flags, void *key)
 {
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 10bfdfb494faf..e7d1da4b1f731 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -44,7 +44,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool);
 void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
 				bool kick_requeue_list);
 void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
-bool blk_mq_get_driver_tag(struct request *rq);
 struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
 					struct blk_mq_ctx *start);
 
-- 
2.26.2


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [PATCH 2/9] blk-mq: remove the bio argument to ->prepare_request
  2020-05-18  6:39 blk-mq: improvement CPU hotplug (simplified version) v2 Christoph Hellwig
  2020-05-18  6:39 ` [PATCH 1/9] blk-mq: split out a __blk_mq_get_driver_tag helper Christoph Hellwig
@ 2020-05-18  6:39 ` Christoph Hellwig
  2020-05-18  6:39 ` [PATCH 3/9] blk-mq: simplify the blk_mq_get_request calling convention Christoph Hellwig
                   ` (7 subsequent siblings)
  9 siblings, 0 replies; 47+ messages in thread
From: Christoph Hellwig @ 2020-05-18  6:39 UTC (permalink / raw)
  Cc: linux-block, John Garry, Bart Van Assche, Hannes Reinecke,
	Thomas Gleixner

None of the I/O schedulers actually needs it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/bfq-iosched.c      | 2 +-
 block/blk-mq.c           | 2 +-
 block/kyber-iosched.c    | 2 +-
 block/mq-deadline.c      | 2 +-
 include/linux/elevator.h | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 3d411716d7ee4..50c8f034c01c5 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -6073,7 +6073,7 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
  * comments on bfq_init_rq for the reason behind this delayed
  * preparation.
  */
-static void bfq_prepare_request(struct request *rq, struct bio *bio)
+static void bfq_prepare_request(struct request *rq)
 {
 	/*
 	 * Regardless of whether we have an icq attached, we have to
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 5283c0105f6f2..341af7752c509 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -387,7 +387,7 @@ static struct request *blk_mq_get_request(struct request_queue *q,
 			if (e->type->icq_cache)
 				blk_mq_sched_assign_ioc(rq);
 
-			e->type->ops.prepare_request(rq, bio);
+			e->type->ops.prepare_request(rq);
 			rq->rq_flags |= RQF_ELVPRIV;
 		}
 	}
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index 34dcea0ef6377..a38c5ab103d12 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -579,7 +579,7 @@ static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio,
 	return merged;
 }
 
-static void kyber_prepare_request(struct request *rq, struct bio *bio)
+static void kyber_prepare_request(struct request *rq)
 {
 	rq_set_domain_token(rq, -1);
 }
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index b490f47fd553c..b57470e154c8f 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -541,7 +541,7 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
  * Nothing to do here. This is defined only to ensure that .finish_request
  * method is called upon request completion.
  */
-static void dd_prepare_request(struct request *rq, struct bio *bio)
+static void dd_prepare_request(struct request *rq)
 {
 }
 
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 901bda352dcb7..bacc40a0bdf39 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -39,7 +39,7 @@ struct elevator_mq_ops {
 	void (*request_merged)(struct request_queue *, struct request *, enum elv_merge);
 	void (*requests_merged)(struct request_queue *, struct request *, struct request *);
 	void (*limit_depth)(unsigned int, struct blk_mq_alloc_data *);
-	void (*prepare_request)(struct request *, struct bio *bio);
+	void (*prepare_request)(struct request *);
 	void (*finish_request)(struct request *);
 	void (*insert_requests)(struct blk_mq_hw_ctx *, struct list_head *, bool);
 	struct request *(*dispatch_request)(struct blk_mq_hw_ctx *);
-- 
2.26.2


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [PATCH 3/9] blk-mq: simplify the blk_mq_get_request calling convention
  2020-05-18  6:39 blk-mq: improvement CPU hotplug (simplified version) v2 Christoph Hellwig
  2020-05-18  6:39 ` [PATCH 1/9] blk-mq: split out a __blk_mq_get_driver_tag helper Christoph Hellwig
  2020-05-18  6:39 ` [PATCH 2/9] blk-mq: remove the bio argument to ->prepare_request Christoph Hellwig
@ 2020-05-18  6:39 ` Christoph Hellwig
  2020-05-18  6:39 ` [PATCH 4/9] blk-mq: merge blk_mq_rq_ctx_init into __blk_mq_alloc_request Christoph Hellwig
                   ` (6 subsequent siblings)
  9 siblings, 0 replies; 47+ messages in thread
From: Christoph Hellwig @ 2020-05-18  6:39 UTC (permalink / raw)
  Cc: linux-block, John Garry, Bart Van Assche, Hannes Reinecke,
	Thomas Gleixner

The bio argument is entirely unused, and the request_queue can be passed
through the alloc_data, given that it needs to be filled out for the
low-level tag allocation anyway.  Also rename the function to
__blk_mq_alloc_request as the switch between get and alloc in the call
chains is rather confusing.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-mq.c | 36 ++++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 14 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 341af7752c509..ef8f50cdab858 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -332,10 +332,9 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 	return rq;
 }
 
-static struct request *blk_mq_get_request(struct request_queue *q,
-					  struct bio *bio,
-					  struct blk_mq_alloc_data *data)
+static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
 {
+	struct request_queue *q = data->q;
 	struct elevator_queue *e = q->elevator;
 	struct request *rq;
 	unsigned int tag;
@@ -346,7 +345,6 @@ static struct request *blk_mq_get_request(struct request_queue *q,
 	if (blk_queue_rq_alloc_time(q))
 		alloc_time_ns = ktime_get_ns();
 
-	data->q = q;
 	if (likely(!data->ctx)) {
 		data->ctx = blk_mq_get_ctx(q);
 		clear_ctx_on_error = true;
@@ -398,7 +396,11 @@ static struct request *blk_mq_get_request(struct request_queue *q,
 struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
 		blk_mq_req_flags_t flags)
 {
-	struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op };
+	struct blk_mq_alloc_data data = {
+		.q		= q,
+		.flags		= flags,
+		.cmd_flags	= op,
+	};
 	struct request *rq;
 	int ret;
 
@@ -406,7 +408,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
 	if (ret)
 		return ERR_PTR(ret);
 
-	rq = blk_mq_get_request(q, NULL, &alloc_data);
+	rq = __blk_mq_alloc_request(&data);
 	if (!rq)
 		goto out_queue_exit;
 	rq->__data_len = 0;
@@ -422,7 +424,11 @@ EXPORT_SYMBOL(blk_mq_alloc_request);
 struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 	unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
 {
-	struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op };
+	struct blk_mq_alloc_data data = {
+		.q		= q,
+		.flags		= flags,
+		.cmd_flags	= op,
+	};
 	struct request *rq;
 	unsigned int cpu;
 	int ret;
@@ -448,14 +454,14 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 	 * If not tell the caller that it should skip this queue.
 	 */
 	ret = -EXDEV;
-	alloc_data.hctx = q->queue_hw_ctx[hctx_idx];
-	if (!blk_mq_hw_queue_mapped(alloc_data.hctx))
+	data.hctx = q->queue_hw_ctx[hctx_idx];
+	if (!blk_mq_hw_queue_mapped(data.hctx))
 		goto out_queue_exit;
-	cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask);
-	alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
+	cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
+	data.ctx = __blk_mq_get_ctx(q, cpu);
 
 	ret = -EWOULDBLOCK;
-	rq = blk_mq_get_request(q, NULL, &alloc_data);
+	rq = __blk_mq_alloc_request(&data);
 	if (!rq)
 		goto out_queue_exit;
 	return rq;
@@ -1987,7 +1993,9 @@ blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 {
 	const int is_sync = op_is_sync(bio->bi_opf);
 	const int is_flush_fua = op_is_flush(bio->bi_opf);
-	struct blk_mq_alloc_data data = { .flags = 0};
+	struct blk_mq_alloc_data data = {
+		.q		= q,
+	};
 	struct request *rq;
 	struct blk_plug *plug;
 	struct request *same_queue_rq = NULL;
@@ -2011,7 +2019,7 @@ blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	rq_qos_throttle(q, bio);
 
 	data.cmd_flags = bio->bi_opf;
-	rq = blk_mq_get_request(q, bio, &data);
+	rq = __blk_mq_alloc_request(&data);
 	if (unlikely(!rq)) {
 		rq_qos_cleanup(q, bio);
 		if (bio->bi_opf & REQ_NOWAIT)
-- 
2.26.2


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [PATCH 4/9] blk-mq: merge blk_mq_rq_ctx_init into __blk_mq_alloc_request
  2020-05-18  6:39 blk-mq: improvement CPU hotplug (simplified version) v2 Christoph Hellwig
                   ` (2 preceding siblings ...)
  2020-05-18  6:39 ` [PATCH 3/9] blk-mq: simplify the blk_mq_get_request calling convention Christoph Hellwig
@ 2020-05-18  6:39 ` Christoph Hellwig
  2020-05-18  6:39 ` [PATCH 5/9] blk-mq: don't set data->ctx and data->hctx in blk_mq_alloc_request_hctx Christoph Hellwig
                   ` (5 subsequent siblings)
  9 siblings, 0 replies; 47+ messages in thread
From: Christoph Hellwig @ 2020-05-18  6:39 UTC (permalink / raw)
  Cc: linux-block, John Garry, Bart Van Assche, Hannes Reinecke,
	Thomas Gleixner

There is no logical split between what gets initialized by which
function, so just merge the two.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-mq.c         | 103 +++++++++++++++++++----------------------
 include/linux/blkdev.h |   2 +-
 2 files changed, 49 insertions(+), 56 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index ef8f50cdab858..fcfce666457e2 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -270,17 +270,59 @@ static inline bool blk_mq_need_time_stamp(struct request *rq)
 	return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS)) || rq->q->elevator;
 }
 
-static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
-		unsigned int tag, unsigned int op, u64 alloc_time_ns)
+static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
 {
-	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
-	struct request *rq = tags->static_rqs[tag];
+	struct request_queue *q = data->q;
+	struct elevator_queue *e = q->elevator;
+	struct request *rq;
+	unsigned int tag;
+	bool clear_ctx_on_error = false;
 	req_flags_t rq_flags = 0;
+	u64 alloc_time_ns = 0;
+
+	/* alloc_time includes depth and tag waits */
+	if (blk_queue_rq_alloc_time(q))
+		alloc_time_ns = ktime_get_ns();
+
+	if (likely(!data->ctx)) {
+		data->ctx = blk_mq_get_ctx(q);
+		clear_ctx_on_error = true;
+	}
+	if (likely(!data->hctx))
+		data->hctx = blk_mq_map_queue(q, data->cmd_flags,
+						data->ctx);
+	if (data->cmd_flags & REQ_NOWAIT)
+		data->flags |= BLK_MQ_REQ_NOWAIT;
+
+	if (e) {
+		data->flags |= BLK_MQ_REQ_INTERNAL;
+
+		/*
+		 * Flush requests are special and go directly to the
+		 * dispatch list. Don't include reserved tags in the
+		 * limiting, as it isn't useful.
+		 */
+		if (!op_is_flush(data->cmd_flags) &&
+		    e->type->ops.limit_depth &&
+		    !(data->flags & BLK_MQ_REQ_RESERVED))
+			e->type->ops.limit_depth(data->cmd_flags, data);
+	} else {
+		blk_mq_tag_busy(data->hctx);
+	}
+
+	tag = blk_mq_get_tag(data);
+	if (tag == BLK_MQ_TAG_FAIL) {
+		if (clear_ctx_on_error)
+			data->ctx = NULL;
+		return NULL;
+	}
 
 	if (data->flags & BLK_MQ_REQ_INTERNAL) {
+		rq = data->hctx->sched_tags->static_rqs[tag];
 		rq->tag = -1;
 		rq->internal_tag = tag;
 	} else {
+		rq = data->hctx->tags->static_rqs[tag];
 		if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
 			rq_flags = RQF_MQ_INFLIGHT;
 			atomic_inc(&data->hctx->nr_active);
@@ -295,7 +337,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 	rq->mq_ctx = data->ctx;
 	rq->mq_hctx = data->hctx;
 	rq->rq_flags = rq_flags;
-	rq->cmd_flags = op;
+	rq->cmd_flags = data->cmd_flags;
 	if (data->flags & BLK_MQ_REQ_PREEMPT)
 		rq->rq_flags |= RQF_PREEMPT;
 	if (blk_queue_io_stat(data->q))
@@ -327,58 +369,9 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 	rq->end_io = NULL;
 	rq->end_io_data = NULL;
 
-	data->ctx->rq_dispatched[op_is_sync(op)]++;
+	data->ctx->rq_dispatched[op_is_sync(data->cmd_flags)]++;
 	refcount_set(&rq->ref, 1);
-	return rq;
-}
-
-static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
-{
-	struct request_queue *q = data->q;
-	struct elevator_queue *e = q->elevator;
-	struct request *rq;
-	unsigned int tag;
-	bool clear_ctx_on_error = false;
-	u64 alloc_time_ns = 0;
-
-	/* alloc_time includes depth and tag waits */
-	if (blk_queue_rq_alloc_time(q))
-		alloc_time_ns = ktime_get_ns();
-
-	if (likely(!data->ctx)) {
-		data->ctx = blk_mq_get_ctx(q);
-		clear_ctx_on_error = true;
-	}
-	if (likely(!data->hctx))
-		data->hctx = blk_mq_map_queue(q, data->cmd_flags,
-						data->ctx);
-	if (data->cmd_flags & REQ_NOWAIT)
-		data->flags |= BLK_MQ_REQ_NOWAIT;
-
-	if (e) {
-		data->flags |= BLK_MQ_REQ_INTERNAL;
-
-		/*
-		 * Flush requests are special and go directly to the
-		 * dispatch list. Don't include reserved tags in the
-		 * limiting, as it isn't useful.
-		 */
-		if (!op_is_flush(data->cmd_flags) &&
-		    e->type->ops.limit_depth &&
-		    !(data->flags & BLK_MQ_REQ_RESERVED))
-			e->type->ops.limit_depth(data->cmd_flags, data);
-	} else {
-		blk_mq_tag_busy(data->hctx);
-	}
-
-	tag = blk_mq_get_tag(data);
-	if (tag == BLK_MQ_TAG_FAIL) {
-		if (clear_ctx_on_error)
-			data->ctx = NULL;
-		return NULL;
-	}
 
-	rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags, alloc_time_ns);
 	if (!op_is_flush(data->cmd_flags)) {
 		rq->elv.icq = NULL;
 		if (e && e->type->ops.prepare_request) {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2b33166b9daf1..e44d56ee435db 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -126,7 +126,7 @@ enum mq_rq_state {
  * Try to put the fields that are referenced together in the same cacheline.
  *
  * If you modify this structure, make sure to update blk_rq_init() and
- * especially blk_mq_rq_ctx_init() to take care of the added fields.
+ * especially __blk_mq_alloc_request() to take care of the added fields.
  */
 struct request {
 	struct request_queue *q;
-- 
2.26.2


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [PATCH 5/9] blk-mq: don't set data->ctx and data->hctx in blk_mq_alloc_request_hctx
  2020-05-18  6:39 blk-mq: improvement CPU hotplug (simplified version) v2 Christoph Hellwig
                   ` (3 preceding siblings ...)
  2020-05-18  6:39 ` [PATCH 4/9] blk-mq: merge blk_mq_rq_ctx_init into __blk_mq_alloc_request Christoph Hellwig
@ 2020-05-18  6:39 ` Christoph Hellwig
  2020-05-18  8:32   ` Thomas Gleixner
  2020-05-18  6:39 ` [PATCH 6/9] blk-mq: don't set data->ctx and data->hctx in __blk_mq_alloc_request Christoph Hellwig
                   ` (4 subsequent siblings)
  9 siblings, 1 reply; 47+ messages in thread
From: Christoph Hellwig @ 2020-05-18  6:39 UTC (permalink / raw)
  Cc: linux-block, John Garry, Bart Van Assche, Hannes Reinecke,
	Thomas Gleixner, Ming Lei

From: Ming Lei <ming.lei@redhat.com>

blk_mq_alloc_request_hctx() asks blk-mq to allocate a request from a
specified hctx, which is usually bound to a fixed cpu mapping, and the
request is supposed to be allocated on a CPU in hctx->cpumask.

Use smp_call_function_any() to allocate a request on a CPU that is set
in hctx->cpumask and return it in blk_mq_alloc_data instead to prepare
for better CPU hotplug support in blk-mq.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
[hch: reuse blk_mq_alloc_data for the smp_call_function_any argument]
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-mq.c | 24 ++++++++++++++++++------
 block/blk-mq.h |  3 +++
 2 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index fcfce666457e2..540b5845cd1d3 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -386,6 +386,20 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
 	return rq;
 }
 
+static void __blk_mq_alloc_request_cb(void *__data)
+{
+	struct blk_mq_alloc_data *data = __data;
+
+	data->rq = __blk_mq_alloc_request(data);
+}
+
+static struct request *__blk_mq_alloc_request_on_cpumask(const cpumask_t *mask,
+		struct blk_mq_alloc_data *data)
+{
+	smp_call_function_any(mask, __blk_mq_alloc_request_cb, data, 1);
+	return data->rq;
+}
+
 struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
 		blk_mq_req_flags_t flags)
 {
@@ -423,7 +437,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 		.cmd_flags	= op,
 	};
 	struct request *rq;
-	unsigned int cpu;
+	struct blk_mq_hw_ctx *hctx;
 	int ret;
 
 	/*
@@ -447,14 +461,12 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 	 * If not tell the caller that it should skip this queue.
 	 */
 	ret = -EXDEV;
-	data.hctx = q->queue_hw_ctx[hctx_idx];
-	if (!blk_mq_hw_queue_mapped(data.hctx))
+	hctx = q->queue_hw_ctx[hctx_idx];
+	if (!blk_mq_hw_queue_mapped(hctx))
 		goto out_queue_exit;
-	cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
-	data.ctx = __blk_mq_get_ctx(q, cpu);
 
 	ret = -EWOULDBLOCK;
-	rq = __blk_mq_alloc_request(&data);
+	rq = __blk_mq_alloc_request_on_cpumask(hctx->cpumask, &data);
 	if (!rq)
 		goto out_queue_exit;
 	return rq;
diff --git a/block/blk-mq.h b/block/blk-mq.h
index e7d1da4b1f731..82921b30b6afa 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -155,6 +155,9 @@ struct blk_mq_alloc_data {
 	/* input & output parameter */
 	struct blk_mq_ctx *ctx;
 	struct blk_mq_hw_ctx *hctx;
+
+	/* output parameter for __blk_mq_alloc_request_cb */
+	struct request *rq;
 };
 
 static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data)
-- 
2.26.2


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [PATCH 6/9] blk-mq: don't set data->ctx and data->hctx in __blk_mq_alloc_request
  2020-05-18  6:39 blk-mq: improvement CPU hotplug (simplified version) v2 Christoph Hellwig
                   ` (4 preceding siblings ...)
  2020-05-18  6:39 ` [PATCH 5/9] blk-mq: don't set data->ctx and data->hctx in blk_mq_alloc_request_hctx Christoph Hellwig
@ 2020-05-18  6:39 ` Christoph Hellwig
  2020-05-18  6:39 ` [PATCH 7/9] blk-mq: disable preemption during allocating request tag Christoph Hellwig
                   ` (3 subsequent siblings)
  9 siblings, 0 replies; 47+ messages in thread
From: Christoph Hellwig @ 2020-05-18  6:39 UTC (permalink / raw)
  Cc: linux-block, John Garry, Bart Van Assche, Hannes Reinecke,
	Thomas Gleixner

Now that blk_mq_alloc_request_hctx doesn't set ->ctx and ->hctx itself,
all setting of it can be done in the low-level blk_mq_get_tag helper.

Based on patch from Ming Lei <ming.lei@redhat.com>.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-mq-tag.c |  8 +++++++-
 block/blk-mq.c     | 15 +--------------
 block/blk-mq.h     |  4 ++--
 3 files changed, 10 insertions(+), 17 deletions(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index e5b17300ec882..b526f1f5a3bf3 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -101,13 +101,19 @@ static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
 
 unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 {
-	struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
+	struct blk_mq_tags *tags;
 	struct sbitmap_queue *bt;
 	struct sbq_wait_state *ws;
 	DEFINE_SBQ_WAIT(wait);
 	unsigned int tag_offset;
 	int tag;
 
+	data->ctx = blk_mq_get_ctx(data->q);
+	data->hctx = blk_mq_map_queue(data->q, data->cmd_flags, data->ctx);
+	tags = blk_mq_tags_from_data(data);
+	if (!(data->flags & BLK_MQ_REQ_INTERNAL))
+		blk_mq_tag_busy(data->hctx);
+
 	if (data->flags & BLK_MQ_REQ_RESERVED) {
 		if (unlikely(!tags->nr_reserved_tags)) {
 			WARN_ON_ONCE(1);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 540b5845cd1d3..74c2d8f61426c 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -276,7 +276,6 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
 	struct elevator_queue *e = q->elevator;
 	struct request *rq;
 	unsigned int tag;
-	bool clear_ctx_on_error = false;
 	req_flags_t rq_flags = 0;
 	u64 alloc_time_ns = 0;
 
@@ -284,13 +283,6 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
 	if (blk_queue_rq_alloc_time(q))
 		alloc_time_ns = ktime_get_ns();
 
-	if (likely(!data->ctx)) {
-		data->ctx = blk_mq_get_ctx(q);
-		clear_ctx_on_error = true;
-	}
-	if (likely(!data->hctx))
-		data->hctx = blk_mq_map_queue(q, data->cmd_flags,
-						data->ctx);
 	if (data->cmd_flags & REQ_NOWAIT)
 		data->flags |= BLK_MQ_REQ_NOWAIT;
 
@@ -306,16 +298,11 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
 		    e->type->ops.limit_depth &&
 		    !(data->flags & BLK_MQ_REQ_RESERVED))
 			e->type->ops.limit_depth(data->cmd_flags, data);
-	} else {
-		blk_mq_tag_busy(data->hctx);
 	}
 
 	tag = blk_mq_get_tag(data);
-	if (tag == BLK_MQ_TAG_FAIL) {
-		if (clear_ctx_on_error)
-			data->ctx = NULL;
+	if (tag == BLK_MQ_TAG_FAIL)
 		return NULL;
-	}
 
 	if (data->flags & BLK_MQ_REQ_INTERNAL) {
 		rq = data->hctx->sched_tags->static_rqs[tag];
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 82921b30b6afa..1338be9d51777 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -146,13 +146,13 @@ static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
 }
 
 struct blk_mq_alloc_data {
-	/* input parameter */
+	/* input parameters */
 	struct request_queue *q;
 	blk_mq_req_flags_t flags;
 	unsigned int shallow_depth;
 	unsigned int cmd_flags;
 
-	/* input & output parameter */
+	/* output parameters */
 	struct blk_mq_ctx *ctx;
 	struct blk_mq_hw_ctx *hctx;
 
-- 
2.26.2


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [PATCH 7/9] blk-mq: disable preemption during allocating request tag
  2020-05-18  6:39 blk-mq: improvement CPU hotplug (simplified version) v2 Christoph Hellwig
                   ` (5 preceding siblings ...)
  2020-05-18  6:39 ` [PATCH 6/9] blk-mq: don't set data->ctx and data->hctx in __blk_mq_alloc_request Christoph Hellwig
@ 2020-05-18  6:39 ` Christoph Hellwig
  2020-05-18  6:39 ` [PATCH 8/9] blk-mq: add blk_mq_all_tag_iter Christoph Hellwig
                   ` (2 subsequent siblings)
  9 siblings, 0 replies; 47+ messages in thread
From: Christoph Hellwig @ 2020-05-18  6:39 UTC (permalink / raw)
  Cc: linux-block, John Garry, Bart Van Assche, Hannes Reinecke,
	Thomas Gleixner, Ming Lei

From: Ming Lei <ming.lei@redhat.com>

Disable preemption for a short while during request tag allocation, so
the request's tag (internal tag) is always allocated on the cpu of
data->ctx, in preparation for handling cpu hotplug during request
allocation.

In the following patch, hctx->state will be checked to see if it becomes
inactive which is always set on the last CPU of hctx->cpumask.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
[hch: keep the preempt disable confined inside blk_mq_get_tag]
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-mq-tag.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index b526f1f5a3bf3..0cc38883618b9 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -108,6 +108,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 	unsigned int tag_offset;
 	int tag;
 
+	preempt_disable();
 	data->ctx = blk_mq_get_ctx(data->q);
 	data->hctx = blk_mq_map_queue(data->q, data->cmd_flags, data->ctx);
 	tags = blk_mq_tags_from_data(data);
@@ -115,10 +116,8 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		blk_mq_tag_busy(data->hctx);
 
 	if (data->flags & BLK_MQ_REQ_RESERVED) {
-		if (unlikely(!tags->nr_reserved_tags)) {
-			WARN_ON_ONCE(1);
-			return BLK_MQ_TAG_FAIL;
-		}
+		if (WARN_ON_ONCE(!tags->nr_reserved_tags))
+			goto fail;
 		bt = &tags->breserved_tags;
 		tag_offset = 0;
 	} else {
@@ -131,7 +130,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		goto found_tag;
 
 	if (data->flags & BLK_MQ_REQ_NOWAIT)
-		return BLK_MQ_TAG_FAIL;
+		goto fail;
 
 	ws = bt_wait_ptr(bt, data->hctx);
 	do {
@@ -158,11 +157,13 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 		if (tag != -1)
 			break;
 
+		preempt_enable();
+
 		bt_prev = bt;
 		io_schedule();
-
 		sbitmap_finish_wait(bt, ws, &wait);
 
+		preempt_disable();
 		data->ctx = blk_mq_get_ctx(data->q);
 		data->hctx = blk_mq_map_queue(data->q, data->cmd_flags,
 						data->ctx);
@@ -186,7 +187,11 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 	sbitmap_finish_wait(bt, ws, &wait);
 
 found_tag:
+	preempt_enable();
 	return tag + tag_offset;
+fail:
+	preempt_enable();
+	return BLK_MQ_TAG_FAIL;
 }
 
 bool __blk_mq_get_driver_tag(struct request *rq)
-- 
2.26.2


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [PATCH 8/9] blk-mq: add blk_mq_all_tag_iter
  2020-05-18  6:39 blk-mq: improvement CPU hotplug (simplified version) v2 Christoph Hellwig
                   ` (6 preceding siblings ...)
  2020-05-18  6:39 ` [PATCH 7/9] blk-mq: disable preemption during allocating request tag Christoph Hellwig
@ 2020-05-18  6:39 ` Christoph Hellwig
  2020-05-18  6:39 ` [PATCH 9/9] blk-mq: drain I/O when all CPUs in a hctx are offline Christoph Hellwig
  2020-05-18 11:49 ` blk-mq: improvement CPU hotplug (simplified version) v2 John Garry
  9 siblings, 0 replies; 47+ messages in thread
From: Christoph Hellwig @ 2020-05-18  6:39 UTC (permalink / raw)
  Cc: linux-block, John Garry, Bart Van Assche, Hannes Reinecke,
	Thomas Gleixner, Ming Lei

From: Ming Lei <ming.lei@redhat.com>

Add a new function, blk_mq_all_tag_iter, so that we can iterate over
every allocated request from either io scheduler tags or driver tags.
This approach is more flexible since it allows the callers to do whatever
they want with each allocated request.

It will be used to implement draining allocated requests on specified
hctx in this patchset.

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-mq-tag.c | 33 +++++++++++++++++++++++++++++----
 block/blk-mq-tag.h |  2 ++
 2 files changed, 31 insertions(+), 4 deletions(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 0cc38883618b9..b4b8d4e8e6e20 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -295,6 +295,7 @@ struct bt_tags_iter_data {
 	busy_tag_iter_fn *fn;
 	void *data;
 	bool reserved;
+	bool iterate_all;
 };
 
 static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
@@ -312,7 +313,7 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
 	 * test and set the bit before assining ->rqs[].
 	 */
 	rq = tags->rqs[bitnr];
-	if (rq && blk_mq_request_started(rq))
+	if (rq && (iter_data->iterate_all || blk_mq_request_started(rq)))
 		return iter_data->fn(rq, iter_data->data, reserved);
 
 	return true;
@@ -332,13 +333,15 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
  *		bitmap_tags member of struct blk_mq_tags.
  */
 static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt,
-			     busy_tag_iter_fn *fn, void *data, bool reserved)
+			     busy_tag_iter_fn *fn, void *data, bool reserved,
+			     bool iterate_all)
 {
 	struct bt_tags_iter_data iter_data = {
 		.tags = tags,
 		.fn = fn,
 		.data = data,
 		.reserved = reserved,
+		.iterate_all = iterate_all,
 	};
 
 	if (tags->rqs)
@@ -359,8 +362,30 @@ static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
 		busy_tag_iter_fn *fn, void *priv)
 {
 	if (tags->nr_reserved_tags)
-		bt_tags_for_each(tags, &tags->breserved_tags, fn, priv, true);
-	bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, false);
+		bt_tags_for_each(tags, &tags->breserved_tags, fn, priv, true,
+				 false);
+	bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, false, false);
+}
+
+/**
+ * blk_mq_all_tag_iter - iterate over all requests in a tag map
+ * @tags:	Tag map to iterate over.
+ * @fn:		Pointer to the function that will be called for each
+ *		request. @fn will be called as follows: @fn(rq, @priv,
+ *		reserved) where rq is a pointer to a request. 'reserved'
+ *		indicates whether or not @rq is a reserved request. Return
+ *		true to continue iterating tags, false to stop.
+ * @priv:	Will be passed as second argument to @fn.
+ *
+ * It is the caller's responsibility to check rq's state in @fn.
+ */
+void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
+		void *priv)
+{
+	if (tags->nr_reserved_tags)
+		bt_tags_for_each(tags, &tags->breserved_tags, fn, priv, true,
+				 true);
+	bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, false, true);
 }
 
 /**
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index c2ad3ed637106..6465478b69a6e 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -42,6 +42,8 @@ extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
 extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
 void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
 		void *priv);
+void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
+		void *priv);
 
 static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt,
 						 struct blk_mq_hw_ctx *hctx)
-- 
2.26.2


^ permalink raw reply	[flat|nested] 47+ messages in thread

* [PATCH 9/9] blk-mq: drain I/O when all CPUs in a hctx are offline
  2020-05-18  6:39 blk-mq: improvement CPU hotplug (simplified version) v2 Christoph Hellwig
                   ` (7 preceding siblings ...)
  2020-05-18  6:39 ` [PATCH 8/9] blk-mq: add blk_mq_all_tag_iter Christoph Hellwig
@ 2020-05-18  6:39 ` Christoph Hellwig
  2020-05-18  8:42   ` John Garry
  2020-05-18 11:49 ` blk-mq: improvement CPU hotplug (simplified version) v2 John Garry
  9 siblings, 1 reply; 47+ messages in thread
From: Christoph Hellwig @ 2020-05-18  6:39 UTC (permalink / raw)
  Cc: linux-block, John Garry, Bart Van Assche, Hannes Reinecke,
	Thomas Gleixner, Ming Lei

From: Ming Lei <ming.lei@redhat.com>

Most of blk-mq drivers depend on managed IRQ's auto-affinity to set
up queue mapping. Thomas mentioned the following point[1]:

"That was the constraint of managed interrupts from the very beginning:

 The driver/subsystem has to quiesce the interrupt line and the associated
 queue _before_ it gets shutdown in CPU unplug and not fiddle with it
 until it's restarted by the core when the CPU is plugged in again."

However, current blk-mq implementation doesn't quiesce hw queue before
the last CPU in the hctx is shutdown. Even worse, CPUHP_BLK_MQ_DEAD is
one cpuhp state handled after the CPU is down, so there isn't any chance
to quiesce hctx for blk-mq wrt. CPU hotplug.

Add new cpuhp state of CPUHP_AP_BLK_MQ_ONLINE for blk-mq to stop queues
and wait for completion of in-flight requests.

We will stop request allocation on any cpu in hctx->cpumask and wait for
completion of all allocated requests when one hctx is becoming inactive in
the following patch. This way may cause dead-lock for some stacking blk-mq
drivers, such as dm-rq and loop.

Before one CPU becomes offline, check if it is the last online CPU of the
hctx. If so, mark this hctx as inactive and meanwhile wait for completion
of all allocated requests originating from this hctx. Also check whether
this hctx has become inactive while allocating a request; if so, give the
allocation up and retry from a new online CPU.

This way guarantees that there isn't any in-flight I/O before shutting down
the managed IRQ line when all CPUs of this IRQ line are offline.

Add a BLK_MQ_F_STACKING and set it for dm-rq and loop, so we don't need
to wait for completion of in-flight requests from these drivers to avoid
a potential dead-lock. It is safe to do this for stacking drivers as those
do not use interrupts at all and their I/O completions are triggered by the
underlying devices' I/O completions.

[1] https://lore.kernel.org/linux-block/alpine.DEB.2.21.1904051331270.1802@nanos.tec.linutronix.de/

Signed-off-by: Ming Lei <ming.lei@redhat.com>
[hch: rebased, merged two patches, slightly simplified a few helpers,
      use a while loop to keep trying until we find an online CPU]
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-mq-debugfs.c     |  2 +
 block/blk-mq-tag.c         |  8 ++++
 block/blk-mq.c             | 98 ++++++++++++++++++++++++++++++++++++++
 drivers/block/loop.c       |  2 +-
 drivers/md/dm-rq.c         |  2 +-
 include/linux/blk-mq.h     | 10 ++++
 include/linux/cpuhotplug.h |  1 +
 7 files changed, 121 insertions(+), 2 deletions(-)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 96b7a35c898a7..15df3a36e9fa4 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -213,6 +213,7 @@ static const char *const hctx_state_name[] = {
 	HCTX_STATE_NAME(STOPPED),
 	HCTX_STATE_NAME(TAG_ACTIVE),
 	HCTX_STATE_NAME(SCHED_RESTART),
+	HCTX_STATE_NAME(INACTIVE),
 };
 #undef HCTX_STATE_NAME
 
@@ -239,6 +240,7 @@ static const char *const hctx_flag_name[] = {
 	HCTX_FLAG_NAME(TAG_SHARED),
 	HCTX_FLAG_NAME(BLOCKING),
 	HCTX_FLAG_NAME(NO_SCHED),
+	HCTX_FLAG_NAME(STACKING),
 };
 #undef HCTX_FLAG_NAME
 
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index b4b8d4e8e6e20..8c05b952e98a9 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -187,6 +187,14 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 	sbitmap_finish_wait(bt, ws, &wait);
 
 found_tag:
+	/*
+	 * Give up this allocation if the hctx is inactive.  The caller will
+	 * retry on an active hctx.
+	 */
+	if (unlikely(test_bit(BLK_MQ_S_INACTIVE, &data->hctx->state))) {
+		blk_mq_put_tag(tags, data->ctx, tag + tag_offset);
+		goto fail;
+	}
 	preempt_enable();
 	return tag + tag_offset;
 fail:
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 74c2d8f61426c..6fc25936568cb 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -403,6 +403,8 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
 		return ERR_PTR(ret);
 
 	rq = __blk_mq_alloc_request(&data);
+	while (unlikely(!rq && !(flags & BLK_MQ_REQ_NOWAIT)))
+		rq = __blk_mq_alloc_request_on_cpumask(cpu_online_mask, &data);
 	if (!rq)
 		goto out_queue_exit;
 	rq->__data_len = 0;
@@ -2010,8 +2012,14 @@ blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 
 	rq_qos_throttle(q, bio);
 
+	/*
+	 * Waiting allocations only fail because of an inactive hctx, retry on
+	 * an online CPU in that case.
+	 */
 	data.cmd_flags = bio->bi_opf;
 	rq = __blk_mq_alloc_request(&data);
+	while (unlikely(!rq && !(data.cmd_flags & REQ_NOWAIT)))
+		rq = __blk_mq_alloc_request_on_cpumask(cpu_online_mask, &data);
 	if (unlikely(!rq)) {
 		rq_qos_cleanup(q, bio);
 		if (bio->bi_opf & REQ_NOWAIT)
@@ -2283,6 +2291,84 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 	return -ENOMEM;
 }
 
+struct rq_iter_data {
+	bool has_rq;
+	struct blk_mq_hw_ctx *hctx;
+};
+
+static bool blk_mq_has_request(struct request *rq, void *data, bool reserved)
+{
+	struct rq_iter_data *iter_data = data;
+
+	if (rq->mq_hctx == iter_data->hctx) {
+		iter_data->has_rq = true;
+		return false;
+	}
+
+	return true;
+}
+
+static bool blk_mq_tags_has_request(struct blk_mq_hw_ctx *hctx)
+{
+	struct blk_mq_tags *tags = hctx->sched_tags ?: hctx->tags;
+	struct rq_iter_data data = {
+		.hctx	= hctx,
+	};
+
+	blk_mq_all_tag_iter(tags, blk_mq_has_request, &data);
+	return data.has_rq;
+}
+
+static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu,
+		struct blk_mq_hw_ctx *hctx)
+{
+	if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu)
+		return false;
+	if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids)
+		return false;
+	return true;
+}
+
+static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
+{
+	struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
+			struct blk_mq_hw_ctx, cpuhp_online);
+	struct request_queue *q = hctx->queue;
+
+	if (!cpumask_test_cpu(cpu, hctx->cpumask))
+		return 0;
+
+	if (!blk_mq_last_cpu_in_hctx(cpu, hctx))
+		return 0;
+
+	/* Prevent new request from being allocated on the current hctx/cpu */
+	set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
+
+	/*
+	 * Grab one refcount for avoiding scheduler switch, and
+	 * return immediately if queue has been frozen.
+	 */
+	if (!percpu_ref_tryget(&q->q_usage_counter))
+		return 0;
+
+	/* wait until all requests in this hctx are gone */
+	while (blk_mq_tags_has_request(hctx))
+		msleep(5);
+
+	percpu_ref_put(&q->q_usage_counter);
+	return 0;
+}
+
+static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
+{
+	struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
+			struct blk_mq_hw_ctx, cpuhp_online);
+
+	if (cpumask_test_cpu(cpu, hctx->cpumask))
+		clear_bit(BLK_MQ_S_INACTIVE, &hctx->state);
+	return 0;
+}
+
 /*
  * 'cpu' is going away. splice any existing rq_list entries from this
  * software queue to the hw queue dispatch list, and ensure that it
@@ -2296,6 +2382,9 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
 	enum hctx_type type;
 
 	hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
+	if (!cpumask_test_cpu(cpu, hctx->cpumask))
+		return 0;
+
 	ctx = __blk_mq_get_ctx(hctx->queue, cpu);
 	type = hctx->type;
 
@@ -2319,6 +2408,9 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
 
 static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
 {
+	if (!(hctx->flags & BLK_MQ_F_STACKING))
+		cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
+						    &hctx->cpuhp_online);
 	cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
 					    &hctx->cpuhp_dead);
 }
@@ -2378,6 +2470,9 @@ static int blk_mq_init_hctx(struct request_queue *q,
 {
 	hctx->queue_num = hctx_idx;
 
+	if (!(hctx->flags & BLK_MQ_F_STACKING))
+		cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
+				&hctx->cpuhp_online);
 	cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
 
 	hctx->tags = set->tags[hctx_idx];
@@ -3632,6 +3727,9 @@ static int __init blk_mq_init(void)
 {
 	cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
 				blk_mq_hctx_notify_dead);
+	cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online",
+				blk_mq_hctx_notify_online,
+				blk_mq_hctx_notify_offline);
 	return 0;
 }
 subsys_initcall(blk_mq_init);
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index da693e6a834e5..d7904b4d8d126 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -2037,7 +2037,7 @@ static int loop_add(struct loop_device **l, int i)
 	lo->tag_set.queue_depth = 128;
 	lo->tag_set.numa_node = NUMA_NO_NODE;
 	lo->tag_set.cmd_size = sizeof(struct loop_cmd);
-	lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+	lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING;
 	lo->tag_set.driver_data = lo;
 
 	err = blk_mq_alloc_tag_set(&lo->tag_set);
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 3f8577e2c13be..f60c025121215 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -547,7 +547,7 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
 	md->tag_set->ops = &dm_mq_ops;
 	md->tag_set->queue_depth = dm_get_blk_mq_queue_depth();
 	md->tag_set->numa_node = md->numa_node_id;
-	md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE;
+	md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING;
 	md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues();
 	md->tag_set->driver_data = md;
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index d7307795439a4..a20f8c241d665 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -140,6 +140,8 @@ struct blk_mq_hw_ctx {
 	 */
 	atomic_t		nr_active;
 
+	/** @cpuhp_online: List to store request if CPU is going to die */
+	struct hlist_node	cpuhp_online;
 	/** @cpuhp_dead: List to store request if some CPU die. */
 	struct hlist_node	cpuhp_dead;
 	/** @kobj: Kernel object for sysfs. */
@@ -391,6 +393,11 @@ struct blk_mq_ops {
 enum {
 	BLK_MQ_F_SHOULD_MERGE	= 1 << 0,
 	BLK_MQ_F_TAG_SHARED	= 1 << 1,
+	/*
+	 * Set when this device requires underlying blk-mq device for
+	 * completing IO:
+	 */
+	BLK_MQ_F_STACKING	= 1 << 2,
 	BLK_MQ_F_BLOCKING	= 1 << 5,
 	BLK_MQ_F_NO_SCHED	= 1 << 6,
 	BLK_MQ_F_ALLOC_POLICY_START_BIT = 8,
@@ -400,6 +407,9 @@ enum {
 	BLK_MQ_S_TAG_ACTIVE	= 1,
 	BLK_MQ_S_SCHED_RESTART	= 2,
 
+	/* hw queue is inactive after all its CPUs become offline */
+	BLK_MQ_S_INACTIVE	= 3,
+
 	BLK_MQ_MAX_DEPTH	= 10240,
 
 	BLK_MQ_CPU_WORK_BATCH	= 8,
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 77d70b6335318..24b3a77810b6d 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -152,6 +152,7 @@ enum cpuhp_state {
 	CPUHP_AP_SMPBOOT_THREADS,
 	CPUHP_AP_X86_VDSO_VMA_ONLINE,
 	CPUHP_AP_IRQ_AFFINITY_ONLINE,
+	CPUHP_AP_BLK_MQ_ONLINE,
 	CPUHP_AP_ARM_MVEBU_SYNC_CLOCKS,
 	CPUHP_AP_X86_INTEL_EPB_ONLINE,
 	CPUHP_AP_PERF_ONLINE,
-- 
2.26.2


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH 5/9] blk-mq: don't set data->ctx and data->hctx in blk_mq_alloc_request_hctx
  2020-05-18  6:39 ` [PATCH 5/9] blk-mq: don't set data->ctx and data->hctx in blk_mq_alloc_request_hctx Christoph Hellwig
@ 2020-05-18  8:32   ` Thomas Gleixner
  2020-05-18  9:31     ` Ming Lei
  0 siblings, 1 reply; 47+ messages in thread
From: Thomas Gleixner @ 2020-05-18  8:32 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-block, John Garry, Bart Van Assche, Hannes Reinecke, Ming Lei

Christoph Hellwig <hch@lst.de> writes:
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index fcfce666457e2..540b5845cd1d3 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -386,6 +386,20 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
>  	return rq;
>  }
>  
> +static void __blk_mq_alloc_request_cb(void *__data)
> +{
> +	struct blk_mq_alloc_data *data = __data;
> +
> +	data->rq = __blk_mq_alloc_request(data);
> +}
> +
> +static struct request *__blk_mq_alloc_request_on_cpumask(const cpumask_t *mask,
> +		struct blk_mq_alloc_data *data)
> +{
> +	smp_call_function_any(mask, __blk_mq_alloc_request_cb, data, 1);
> +	return data->rq;
> +}

Is this absolutely necessary to be a smp function call? That's going to
be problematic vs. RT. Same applies to the explicit preempt_disable() in
patch 7.

Thanks,

        tglx





^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH 9/9] blk-mq: drain I/O when all CPUs in a hctx are offline
  2020-05-18  6:39 ` [PATCH 9/9] blk-mq: drain I/O when all CPUs in a hctx are offline Christoph Hellwig
@ 2020-05-18  8:42   ` John Garry
  2020-05-18  9:21     ` Ming Lei
  0 siblings, 1 reply; 47+ messages in thread
From: John Garry @ 2020-05-18  8:42 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-block, Bart Van Assche, Hannes Reinecke, Thomas Gleixner, Ming Lei

On 18/05/2020 07:39, Christoph Hellwig wrote:
> +
>   /*
>    * 'cpu' is going away. splice any existing rq_list entries from this

Do we need to fix up this comment again?

Cheers,
John

>    * software queue to the hw queue dispatch list, and ensure that it
> @@ -2296,6 +2382,9 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
>   	enum hctx_type type;
>   
>   	hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
> +	if (!cpumask_test_cpu(cpu, hctx->cpumask))
> +		return 0;
> +
>   	ctx = __blk_mq_get_ctx(hctx->queue, cpu);
>   	type = hctx->type;


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH 9/9] blk-mq: drain I/O when all CPUs in a hctx are offline
  2020-05-18  8:42   ` John Garry
@ 2020-05-18  9:21     ` Ming Lei
  0 siblings, 0 replies; 47+ messages in thread
From: Ming Lei @ 2020-05-18  9:21 UTC (permalink / raw)
  To: John Garry
  Cc: Christoph Hellwig, linux-block, Bart Van Assche, Hannes Reinecke,
	Thomas Gleixner

On Mon, May 18, 2020 at 09:42:57AM +0100, John Garry wrote:
> On 18/05/2020 07:39, Christoph Hellwig wrote:
> > +
> >   /*
> >    * 'cpu' is going away. splice any existing rq_list entries from this
> 
> Do we need to fix up this comment again?

I think we don't need to; now blk_mq_hctx_notify_dead() is only for
handling a non-inactive hctx, and requests from the dead sw queue need to
be dispatched to other online CPUs.

Thanks,
Ming


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH 5/9] blk-mq: don't set data->ctx and data->hctx in blk_mq_alloc_request_hctx
  2020-05-18  8:32   ` Thomas Gleixner
@ 2020-05-18  9:31     ` Ming Lei
  2020-05-18 10:42       ` Thomas Gleixner
  0 siblings, 1 reply; 47+ messages in thread
From: Ming Lei @ 2020-05-18  9:31 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Christoph Hellwig, linux-block, John Garry, Bart Van Assche,
	Hannes Reinecke

On Mon, May 18, 2020 at 10:32:22AM +0200, Thomas Gleixner wrote:
> Christoph Hellwig <hch@lst.de> writes:
> > diff --git a/block/blk-mq.c b/block/blk-mq.c
> > index fcfce666457e2..540b5845cd1d3 100644
> > --- a/block/blk-mq.c
> > +++ b/block/blk-mq.c
> > @@ -386,6 +386,20 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
> >  	return rq;
> >  }
> >  
> > +static void __blk_mq_alloc_request_cb(void *__data)
> > +{
> > +	struct blk_mq_alloc_data *data = __data;
> > +
> > +	data->rq = __blk_mq_alloc_request(data);
> > +}
> > +
> > +static struct request *__blk_mq_alloc_request_on_cpumask(const cpumask_t *mask,
> > +		struct blk_mq_alloc_data *data)
> > +{
> > +	smp_call_function_any(mask, __blk_mq_alloc_request_cb, data, 1);
> > +	return data->rq;
> > +}
> 
> Is this absolutely necessary to be a smp function call? That's going to

I think it is.

Request is bound to the allocation CPU and the hw queue(hctx) which is
mapped from the allocation CPU.

If request is allocated from one cpu which is going to offline, we can't
handle that easily.

> be problematic vs. RT. Same applies to the explicit preempt_disable() in
> patch 7.

I think it is true and the reason is same too, but the period is quite short,
and it is just taken for iterating several bitmaps for finding one free bit.



thanks,
Ming


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH 5/9] blk-mq: don't set data->ctx and data->hctx in blk_mq_alloc_request_hctx
  2020-05-18  9:31     ` Ming Lei
@ 2020-05-18 10:42       ` Thomas Gleixner
  2020-05-18 11:54         ` Ming Lei
  0 siblings, 1 reply; 47+ messages in thread
From: Thomas Gleixner @ 2020-05-18 10:42 UTC (permalink / raw)
  To: Ming Lei
  Cc: Christoph Hellwig, linux-block, John Garry, Bart Van Assche,
	Hannes Reinecke

Ming Lei <ming.lei@redhat.com> writes:
> On Mon, May 18, 2020 at 10:32:22AM +0200, Thomas Gleixner wrote:
>> Christoph Hellwig <hch@lst.de> writes:
>> Is this absolutely necessary to be a smp function call? That's going to
>
> I think it is.
>
> Request is bound to the allocation CPU and the hw queue(hctx) which is
> mapped from the allocation CPU.
>
> If request is allocated from one cpu which is going to offline, we can't
> handle that easily.

That's a pretty handwavy explanation and does not give any reason why
this needs to be a smp function call and cannot be solved otherwise,
e.g. by delegating this to a work queue.

>> be problematic vs. RT. Same applies to the explicit preempt_disable() in
>> patch 7.
>
> I think it is true and the reason is same too, but the period is quite short,
> and it is just taken for iterating several bitmaps for finding one free bit.

And takes spinlocks along the way.... See:

  https://www.kernel.org/doc/html/latest/locking/locktypes.html

for a full explanation why this can't work on RT. And that's the same
reason why the smp function call will fall apart on a RT enabled kernel.

Thanks,

        tglx


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: blk-mq: improvement CPU hotplug (simplified version) v2
  2020-05-18  6:39 blk-mq: improvement CPU hotplug (simplified version) v2 Christoph Hellwig
                   ` (8 preceding siblings ...)
  2020-05-18  6:39 ` [PATCH 9/9] blk-mq: drain I/O when all CPUs in a hctx are offline Christoph Hellwig
@ 2020-05-18 11:49 ` John Garry
  2020-05-19 15:30   ` Christoph Hellwig
  9 siblings, 1 reply; 47+ messages in thread
From: John Garry @ 2020-05-18 11:49 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-block, Bart Van Assche, Hannes Reinecke, Thomas Gleixner

On 18/05/2020 07:39, Christoph Hellwig wrote:
> Hi all,
> 
> this series ensures I/O is quiesced before a cpu and thus the managed
> interrupt handler is shut down.
> 
> This patchset tries to address the issue by the following approach:
> 
>   - before the last cpu in hctx->cpumask is going to offline, mark this
>     hctx as inactive
> 
>   - disable preempt during allocating tag for request, and after tag is
>     allocated, check if this hctx is inactive. If yes, give up the
>     allocation and try remote allocation from online CPUs
> 
>   - before hctx becomes inactive, drain all allocated requests on this
>     hctx
> 
> The guts of the changes are from Ming Lei, I just did a bunch of prep
> cleanups so that they can fit in more nicely.  The series also depends
> on my "avoid a few q_usage_counter roundtrips v3" series.
> 
> Thanks John Garry for running lots of tests on arm64 with this previous
> version patches and co-working on investigating all kinds of issues.
> 
> A git tree is available here:
> 
>      git://git.infradead.org/users/hch/block.git blk-mq-hotplug
> 
> Gitweb:
> 
>      http://git.infradead.org/users/hch/block.git/shortlog/refs/heads/blk-mq-hotplug
> .
> 

FWIW, I tested this series for cpu hotplug and it looked ok.

john

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH 5/9] blk-mq: don't set data->ctx and data->hctx in blk_mq_alloc_request_hctx
  2020-05-18 10:42       ` Thomas Gleixner
@ 2020-05-18 11:54         ` Ming Lei
  2020-05-18 13:16           ` Christoph Hellwig
  2020-05-18 13:18           ` Thomas Gleixner
  0 siblings, 2 replies; 47+ messages in thread
From: Ming Lei @ 2020-05-18 11:54 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Christoph Hellwig, linux-block, John Garry, Bart Van Assche,
	Hannes Reinecke

On Mon, May 18, 2020 at 12:42:54PM +0200, Thomas Gleixner wrote:
> Ming Lei <ming.lei@redhat.com> writes:
> > On Mon, May 18, 2020 at 10:32:22AM +0200, Thomas Gleixner wrote:
> >> Christoph Hellwig <hch@lst.de> writes:
> >> Is this absolutely necessary to be a smp function call? That's going to
> >
> > I think it is.
> >
> > Request is bound to the allocation CPU and the hw queue(hctx) which is
> > mapped from the allocation CPU.
> >
> > If request is allocated from one cpu which is going to offline, we can't
> > handle that easily.
> 
> That's a pretty handwavy explanation and does not give any reason why
> this needs to be a smp function call and cannot be solved otherwise,
> e.g. by delegating this to a work queue.

I guess I misunderstood your point, sorry for that.

The requirement is just that the request needs to be allocated on one online
CPU after INACTIVE is set, and we can use a workqueue to do that.

> 
> >> be problematic vs. RT. Same applies to the explicit preempt_disable() in
> >> patch 7.
> >
> > I think it is true and the reason is same too, but the period is quite short,
> > and it is just taken for iterating several bitmaps for finding one free bit.
> 
> And takes spinlocks along the way.... See:
> 
>   https://www.kernel.org/doc/html/latest/locking/locktypes.html
> 
> for a full explanation why this can't work on RT. And that's the same
> reason why the smp function call will fall apart on a RT enabled kernel.

We do want to avoid the cost of any lock, because it is in the fast IO path.

Looks preempt_disable in patch 7 can't be avoided.

Thanks,
Ming


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH 5/9] blk-mq: don't set data->ctx and data->hctx in blk_mq_alloc_request_hctx
  2020-05-18 11:54         ` Ming Lei
@ 2020-05-18 13:16           ` Christoph Hellwig
  2020-05-18 14:11             ` Ming Lei
  2020-05-18 18:47             ` Thomas Gleixner
  2020-05-18 13:18           ` Thomas Gleixner
  1 sibling, 2 replies; 47+ messages in thread
From: Christoph Hellwig @ 2020-05-18 13:16 UTC (permalink / raw)
  To: Ming Lei
  Cc: Thomas Gleixner, Christoph Hellwig, linux-block, John Garry,
	Bart Van Assche, Hannes Reinecke

On Mon, May 18, 2020 at 07:54:54PM +0800, Ming Lei wrote:
> 
> I guess I misunderstood your point, sorry for that.
> 
> The requirement is just that the request needs to be allocated on one online
> CPU after INACTIVE is set, and we can use a workqueue to do that.

I've looked over the code again, and I'm really not sure why we need that.
Presumable the CPU hotplug code ensures tasks don't get schedule on the
CPU running the shutdown state machine, so if we just do a retry of the
tag allocation we can assume we are on a different CPU now (Thomas,
correct me if that assumption is wrong).  So I think we can do entirely
with the preempt_disable and the smp calls.  Something like this branch,
which has only survived basic testing (the last patch is the primary
interesting one, plus maybe the last 3 before that):

    git://git.infradead.org/users/hch/block.git blk-mq-hotplug.2

Gitweb:

    http://git.infradead.org/users/hch/block.git/shortlog/refs/heads/blk-mq-hotplug.2

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH 5/9] blk-mq: don't set data->ctx and data->hctx in blk_mq_alloc_request_hctx
  2020-05-18 11:54         ` Ming Lei
  2020-05-18 13:16           ` Christoph Hellwig
@ 2020-05-18 13:18           ` Thomas Gleixner
  1 sibling, 0 replies; 47+ messages in thread
From: Thomas Gleixner @ 2020-05-18 13:18 UTC (permalink / raw)
  To: Ming Lei
  Cc: Christoph Hellwig, linux-block, John Garry, Bart Van Assche,
	Hannes Reinecke

Ming Lei <ming.lei@redhat.com> writes:
> On Mon, May 18, 2020 at 12:42:54PM +0200, Thomas Gleixner wrote:
>> Ming Lei <ming.lei@redhat.com> writes:
>> > If request is allocated from one cpu which is going to offline, we can't
>> > handle that easily.
>> 
>> That's a pretty handwavy explanation and does not give any reason why
>> this needs to be a smp function call and cannot be solved otherwise,
>> e.g. by delegating this to a work queue.
>
> I guess I misunderstood your point, sorry for that.
>
> The requirement is just that the request needs to be allocated on one online
> CPU after INACTIVE is set, and we can use a workqueue to do that.

That'd be great.

>> >> be problematic vs. RT. Same applies to the explicit preempt_disable() in
>> >> patch 7.
>> >
>> > I think it is true and the reason is same too, but the period is quite short,
>> > and it is just taken for iterating several bitmaps for finding one free bit.
>> 
>> And takes spinlocks along the way.... See:
>> 
>>   https://www.kernel.org/doc/html/latest/locking/locktypes.html
>> 
>> for a full explanation why this can't work on RT. And that's the same
>> reason why the smp function call will fall apart on a RT enabled kernel.
>
> We do want to avoid the cost of any lock, because it is in the fast IO path.
>
> Looks preempt_disable in patch 7 can't be avoided.

Well are you concerned about preemption or do you just need to make sure
that the task can't be migrated?

Thanks,

        tglx

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH 5/9] blk-mq: don't set data->ctx and data->hctx in blk_mq_alloc_request_hctx
  2020-05-18 13:16           ` Christoph Hellwig
@ 2020-05-18 14:11             ` Ming Lei
  2020-05-18 16:56               ` Christoph Hellwig
  2020-05-18 18:47             ` Thomas Gleixner
  1 sibling, 1 reply; 47+ messages in thread
From: Ming Lei @ 2020-05-18 14:11 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Thomas Gleixner, linux-block, John Garry, Bart Van Assche,
	Hannes Reinecke

On Mon, May 18, 2020 at 03:16:34PM +0200, Christoph Hellwig wrote:
> On Mon, May 18, 2020 at 07:54:54PM +0800, Ming Lei wrote:
> > 
> > I guess I misunderstood your point, sorry for that.
> > 
> > The requirement is just that the request needs to be allocated on one online
> > CPU after INACTIVE is set, and we can use a workqueue to do that.
> 
> I've looked over the code again, and I'm really not sure why we need that.
> Presumable the CPU hotplug code ensures tasks don't get schedule on the
> CPU running the shutdown state machine, so if we just do a retry of the

percpu kthread still can be scheduled on the cpu to be online, see
is_cpu_allowed(). And bound wq has been used widely in fs code.

> tag allocation we can assume we are on a different CPU now (Thomas,
> correct me if that assumption is wrong).  So I think we can do entirely
> with the preempt_disable and the smp calls.  Something like this branch,
> which has only survived basic testing (the last patch is the primary
> interesting one, plus maybe the last 3 before that):
> 
>     git://git.infradead.org/users/hch/block.git blk-mq-hotplug.2
> 
> Gitweb:
> 
>     http://git.infradead.org/users/hch/block.git/shortlog/refs/heads/blk-mq-hotplug.2
> 

After preempt_disable() is removed, the INACTIVE bit is set in the CPU
to be offline, and the bit can be read on all other CPUs, so other CPUs
may not get synced with the INACTIVE bit.


Thanks,
Ming


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH 5/9] blk-mq: don't set data->ctx and data->hctx in blk_mq_alloc_request_hctx
  2020-05-18 14:11             ` Ming Lei
@ 2020-05-18 16:56               ` Christoph Hellwig
  2020-05-18 18:38                 ` Thomas Gleixner
  2020-05-19  1:54                 ` Ming Lei
  0 siblings, 2 replies; 47+ messages in thread
From: Christoph Hellwig @ 2020-05-18 16:56 UTC (permalink / raw)
  To: Ming Lei
  Cc: Christoph Hellwig, Thomas Gleixner, linux-block, John Garry,
	Bart Van Assche, Hannes Reinecke

On Mon, May 18, 2020 at 10:11:07PM +0800, Ming Lei wrote:
> On Mon, May 18, 2020 at 03:16:34PM +0200, Christoph Hellwig wrote:
> > On Mon, May 18, 2020 at 07:54:54PM +0800, Ming Lei wrote:
> > > 
> > > I guess I misunderstood your point, sorry for that.
> > > 
> > > The requirement is just that the request needs to be allocated on one online
> > > CPU after INACTIVE is set, and we can use a workqueue to do that.
> > 
> > I've looked over the code again, and I'm really not sure why we need that.
> > Presumable the CPU hotplug code ensures tasks don't get schedule on the
> > CPU running the shutdown state machine, so if we just do a retry of the
> 
> percpu kthread still can be scheduled on the cpu to be online, see
> is_cpu_allowed(). And bound wq has been used widely in fs code.

s/to be online/to be offlined/ I guess.

Shouldn't all the per-cpu kthreads also stop as part of the offlining?
If they don't quiesce before the new blk-mq stop state I think we need
to make sure they do.  It is rather pointless to quiesce the requests
if a thread that can submit I/O is still live.

> > 
> >     http://git.infradead.org/users/hch/block.git/shortlog/refs/heads/blk-mq-hotplug.2
> > 
> 
> After preempt_disable() is removed, the INACTIVE bit is set in the CPU
> to be offline, and the bit can be read on all other CPUs, so other CPUs
> may not get synced with the INACTIVE bit.

INACTIVE is set to the hctx, and it is set by the last CPU to be
offlined that is mapped to the hctx.  once the bit is set the barrier
ensured it is seen everywhere before we start waiting for the requests
to finish.  What is missing?:

> 
> 
> Thanks,
> Ming
---end quoted text---

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH 5/9] blk-mq: don't set data->ctx and data->hctx in blk_mq_alloc_request_hctx
  2020-05-18 16:56               ` Christoph Hellwig
@ 2020-05-18 18:38                 ` Thomas Gleixner
  2020-05-18 18:45                   ` Christoph Hellwig
  2020-05-19  1:54                 ` Ming Lei
  1 sibling, 1 reply; 47+ messages in thread
From: Thomas Gleixner @ 2020-05-18 18:38 UTC (permalink / raw)
  To: Christoph Hellwig, Ming Lei
  Cc: Christoph Hellwig, linux-block, John Garry, Bart Van Assche,
	Hannes Reinecke

Christoph Hellwig <hch@lst.de> writes:

> On Mon, May 18, 2020 at 10:11:07PM +0800, Ming Lei wrote:
>> On Mon, May 18, 2020 at 03:16:34PM +0200, Christoph Hellwig wrote:
>> > On Mon, May 18, 2020 at 07:54:54PM +0800, Ming Lei wrote:
>> > > 
>> > > I guess I misunderstood your point, sorry for that.
>> > > 
>> > > The requirement is just that the request needs to be allocated on one online
>> > > CPU after INACTIVE is set, and we can use a workqueue to do that.
>> > 
>> > I've looked over the code again, and I'm really not sure why we need that.
>> > Presumable the CPU hotplug code ensures tasks don't get schedule on the
>> > CPU running the shutdown state machine, so if we just do a retry of the
>> 
>> percpu kthread still can be scheduled on the cpu to be online, see
>> is_cpu_allowed(). And bound wq has been used widely in fs code.
>
> s/to be online/to be offlined/ I guess.
>
> Shouldn't all the per-cpu kthreads also stop as part of the offlining?
> If they don't quiesce before the new blk-mq stop state I think we need
> to make sure they do.  It is rather pointless to quiesce the requests
> if a thread that can submit I/O is still live.

Which kthreads are you talking about?

Workqueues? CPU bound workqueues are shut down in
CPUHP_AP_WORKQUEUE_ONLINE state.

Thanks,

        tglx

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH 5/9] blk-mq: don't set data->ctx and data->hctx in blk_mq_alloc_request_hctx
  2020-05-18 18:38                 ` Thomas Gleixner
@ 2020-05-18 18:45                   ` Christoph Hellwig
  2020-05-18 18:59                     ` Thomas Gleixner
  0 siblings, 1 reply; 47+ messages in thread
From: Christoph Hellwig @ 2020-05-18 18:45 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Christoph Hellwig, Ming Lei, linux-block, John Garry,
	Bart Van Assche, Hannes Reinecke

On Mon, May 18, 2020 at 08:38:04PM +0200, Thomas Gleixner wrote:
> > Shouldn't all the per-cpu kthreads also stop as part of the offlining?
> > If they don't quiesce before the new blk-mq stop state I think we need
> > to make sure they do.  It is rather pointless to quiesce the requests
> > if a thread that can submit I/O is still live.
> 
> Which kthreads are you talking about?

I think PF_KTHREAD threads bound to single cpu will usually be
workqueues, yes.

> Workqueues? CPU bound workqueues are shut down in
> CPUHP_AP_WORKQUEUE_ONLINE state.

That's what I mean.  If we shut down I/O before that happened we'd have
a problem, but as I expected your state machine is smarter than that :)

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH 5/9] blk-mq: don't set data->ctx and data->hctx in blk_mq_alloc_request_hctx
  2020-05-18 13:16           ` Christoph Hellwig
  2020-05-18 14:11             ` Ming Lei
@ 2020-05-18 18:47             ` Thomas Gleixner
  1 sibling, 0 replies; 47+ messages in thread
From: Thomas Gleixner @ 2020-05-18 18:47 UTC (permalink / raw)
  To: Christoph Hellwig, Ming Lei
  Cc: Christoph Hellwig, linux-block, John Garry, Bart Van Assche,
	Hannes Reinecke

Christoph Hellwig <hch@lst.de> writes:
> On Mon, May 18, 2020 at 07:54:54PM +0800, Ming Lei wrote:
>> 
>> I guess I misunderstood your point, sorry for that.
>> 
>> The requirement is just that the request needs to be allocated on one online
>> CPU after INACTIVE is set, and we can use a workqueue to do that.
>
> I've looked over the code again, and I'm really not sure why we need that.
> Presumable the CPU hotplug code ensures tasks don't get schedule on the
> CPU running the shutdown state machine, so if we just do a retry of the
> tag allocation we can assume we are on a different CPU now (Thomas,
> correct me if that assumption is wrong).

User space tasks are kicked away once the CPU is cleared in the active
set, which is the very first operation on unplug.

Unbound kthreads are handled the same way as user space tasks.

Per CPU kthreads can run until the CPU clears the online bit which is at
the very end of the unplug operation just before it dies.

Some of these kthreads are managed and stop earlier, e.g. workqueues.

Thanks,

        tglx

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH 5/9] blk-mq: don't set data->ctx and data->hctx in blk_mq_alloc_request_hctx
  2020-05-18 18:45                   ` Christoph Hellwig
@ 2020-05-18 18:59                     ` Thomas Gleixner
  0 siblings, 0 replies; 47+ messages in thread
From: Thomas Gleixner @ 2020-05-18 18:59 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Christoph Hellwig, Ming Lei, linux-block, John Garry,
	Bart Van Assche, Hannes Reinecke

Christoph Hellwig <hch@lst.de> writes:
> On Mon, May 18, 2020 at 08:38:04PM +0200, Thomas Gleixner wrote:
>> > Shouldn't all the per-cpu kthreads also stop as part of the offlining?
>> > If they don't quiesce before the new blk-mq stop state I think we need
>> > to make sure they do.  It is rather pointless to quiesce the requests
>> > if a thread that can submit I/O is still live.
>> 
>> Which kthreads are you talking about?
>
> I think PF_KTHREAD threads bound to single cpu will usually be
> workqueues, yes.
>
>> Workqueues? CPU bound workqueues are shut down in
>> CPUHP_AP_WORKQUEUE_ONLINE state.
>
> That's what I mean.  If we shut down I/O before that happened we'd have
> a problem, but as I expected your state machine is smarter than that :)

It would have been a problem with the old notifier mess to actually
figure out what runs when. But with the explicit states it should be
pretty easy to find a spot which meets your requirements :)

Thanks,

        tglx

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH 5/9] blk-mq: don't set data->ctx and data->hctx in blk_mq_alloc_request_hctx
  2020-05-18 16:56               ` Christoph Hellwig
  2020-05-18 18:38                 ` Thomas Gleixner
@ 2020-05-19  1:54                 ` Ming Lei
  2020-05-19 15:30                   ` Christoph Hellwig
  1 sibling, 1 reply; 47+ messages in thread
From: Ming Lei @ 2020-05-19  1:54 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Thomas Gleixner, linux-block, John Garry, Bart Van Assche,
	Hannes Reinecke

On Mon, May 18, 2020 at 06:56:19PM +0200, Christoph Hellwig wrote:
> On Mon, May 18, 2020 at 10:11:07PM +0800, Ming Lei wrote:
> > On Mon, May 18, 2020 at 03:16:34PM +0200, Christoph Hellwig wrote:
> > > On Mon, May 18, 2020 at 07:54:54PM +0800, Ming Lei wrote:
> > > > 
> > > > I guess I misunderstood your point, sorry for that.
> > > > 
> > > > The requirement is just that the request needs to be allocated on one online
> > > > CPU after INACTIVE is set, and we can use a workqueue to do that.
> > > 
> > > I've looked over the code again, and I'm really not sure why we need that.
> > > Presumable the CPU hotplug code ensures tasks don't get schedule on the
> > > CPU running the shutdown state machine, so if we just do a retry of the
> > 
> > percpu kthread still can be scheduled on the cpu to be online, see
> > is_cpu_allowed(). And bound wq has been used widely in fs code.
> 
> s/to be online/to be offlined/ I guess.
> 
> Shouldn't all the per-cpu kthreads also stop as part of the offlining?
> If they don't quiesce before the new blk-mq stop state I think we need
> to make sure they do.  It is rather pointless to quiesce the requests
> if a thread that can submit I/O is still live.

As Thomas clarified, workqueue hasn't such issue any more, and only other
per CPU kthreads can run until the CPU clears the online bit.

So the question is if IO can be submitted from such kernel context?

> 
> > > 
> > >     http://git.infradead.org/users/hch/block.git/shortlog/refs/heads/blk-mq-hotplug.2
> > > 
> > 
> > After preempt_disable() is removed, the INACTIVE bit is set in the CPU
> > to be offline, and the bit can be read on all other CPUs, so other CPUs
> > may not get synced with the INACTIVE bit.
> 
> INACTIVE is set to the hctx, and it is set by the last CPU to be
> offlined that is mapped to the hctx.  once the bit is set the barrier
> ensured it is seen everywhere before we start waiting for the requests
> to finish.  What is missing?:

memory barrier should always be used as pair, and you should have mentioned
that the implied barrier in test_and_set_bit_lock pair from sbitmap_get()
is pair of smp_mb__after_atomic() in blk_mq_hctx_notify_offline().

Then setting tag bit and checking INACTIVE in blk_mq_get_tag() can be ordered,
same with setting INACTIVE and checking tag bit in blk_mq_hctx_notify_offline().

Then such usage is basically same with previous use of smp_mb() in blk_mq_get_driver_tag()
in earlier version.

BTW, smp_mb__before_atomic() in blk_mq_hctx_notify_offline() isn't needed.

Thanks,
Ming


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH 5/9] blk-mq: don't set data->ctx and data->hctx in blk_mq_alloc_request_hctx
  2020-05-19  1:54                 ` Ming Lei
@ 2020-05-19 15:30                   ` Christoph Hellwig
  2020-05-20  1:18                     ` Ming Lei
  0 siblings, 1 reply; 47+ messages in thread
From: Christoph Hellwig @ 2020-05-19 15:30 UTC (permalink / raw)
  To: Ming Lei
  Cc: Christoph Hellwig, Thomas Gleixner, linux-block, John Garry,
	Bart Van Assche, Hannes Reinecke

On Tue, May 19, 2020 at 09:54:20AM +0800, Ming Lei wrote:
> As Thomas clarified, workqueue hasn't such issue any more, and only other
> per CPU kthreads can run until the CPU clears the online bit.
> 
> So the question is if IO can be submitted from such kernel context?

What other per-CPU kthreads even exist?

> > INACTIVE is set to the hctx, and it is set by the last CPU to be
> > offlined that is mapped to the hctx.  once the bit is set the barrier
> > ensured it is seen everywhere before we start waiting for the requests
> > to finish.  What is missing?:
> 
> memory barrier should always be used as pair, and you should have mentioned
> that the implied barrier in test_and_set_bit_lock pair from sbitmap_get()
> is pair of smp_mb__after_atomic() in blk_mq_hctx_notify_offline().

Documentation/core-api/atomic_ops.rst makes it pretty clear that the
special smp_mb__before_atomic and smp_mb__after_atomic barriers are only
used around the set_bit/clear_bit/change_bit operations, and not on the
test_bit side.  That is also how they are used in all the callsites I
checked.

> Then setting tag bit and checking INACTIVE in blk_mq_get_tag() can be ordered,
> same with setting INACTIVE and checking tag bit in blk_mq_hctx_notify_offline().

But yes, even if not that would take care of it.

> BTW, smp_mb__before_atomic() in blk_mq_hctx_notify_offline() isn't needed.

True.

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: blk-mq: improvement CPU hotplug (simplified version) v2
  2020-05-18 11:49 ` blk-mq: improvement CPU hotplug (simplified version) v2 John Garry
@ 2020-05-19 15:30   ` Christoph Hellwig
  2020-05-19 17:17     ` John Garry
  2020-05-20 14:35     ` John Garry
  0 siblings, 2 replies; 47+ messages in thread
From: Christoph Hellwig @ 2020-05-19 15:30 UTC (permalink / raw)
  To: John Garry
  Cc: Christoph Hellwig, linux-block, Bart Van Assche, Hannes Reinecke,
	Thomas Gleixner

On Mon, May 18, 2020 at 12:49:14PM +0100, John Garry wrote:
>> A git tree is available here:
>>
>>      git://git.infradead.org/users/hch/block.git blk-mq-hotplug
>>
>> Gitweb:
>>
>>      http://git.infradead.org/users/hch/block.git/shortlog/refs/heads/blk-mq-hotplug
>> .
>>
>
> FWIW, I tested this series for cpu hotplug and it looked ok.

Can you also test the blk-mq-hotplug.2 branch?

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: blk-mq: improvement CPU hotplug (simplified version) v2
  2020-05-19 15:30   ` Christoph Hellwig
@ 2020-05-19 17:17     ` John Garry
  2020-05-20 14:35     ` John Garry
  1 sibling, 0 replies; 47+ messages in thread
From: John Garry @ 2020-05-19 17:17 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-block, Bart Van Assche, Hannes Reinecke, Thomas Gleixner

On 19/05/2020 16:30, Christoph Hellwig wrote:
> On Mon, May 18, 2020 at 12:49:14PM +0100, John Garry wrote:
>>> A git tree is available here:
>>>
>>>       git://git.infradead.org/users/hch/block.git blk-mq-hotplug
>>>
>>> Gitweb:
>>>
>>>       http://git.infradead.org/users/hch/block.git/shortlog/refs/heads/blk-mq-hotplug
>>> .
>>>
>>
>> FWIW, I tested this series for cpu hotplug and it looked ok.
> 
> Can you also test the blk-mq-hotplug.2 branch?
> .
> 

Sure,




^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH 5/9] blk-mq: don't set data->ctx and data->hctx in blk_mq_alloc_request_hctx
  2020-05-19 15:30                   ` Christoph Hellwig
@ 2020-05-20  1:18                     ` Ming Lei
  2020-05-20  3:04                       ` Ming Lei
  0 siblings, 1 reply; 47+ messages in thread
From: Ming Lei @ 2020-05-20  1:18 UTC (permalink / raw)
  To: Christoph Hellwig, linux-kernel
  Cc: Thomas Gleixner, linux-block, John Garry, Bart Van Assche,
	Hannes Reinecke

On Tue, May 19, 2020 at 05:30:00PM +0200, Christoph Hellwig wrote:
> On Tue, May 19, 2020 at 09:54:20AM +0800, Ming Lei wrote:
> > As Thomas clarified, workqueue hasn't such issue any more, and only other
> > per CPU kthreads can run until the CPU clears the online bit.
> > 
> > So the question is if IO can be submitted from such kernel context?
> 
> What other per-CPU kthreads even exist?

I don't know, so expose to wider audiences.

> 
> > > INACTIVE is set to the hctx, and it is set by the last CPU to be
> > > offlined that is mapped to the hctx.  once the bit is set the barrier
> > > ensured it is seen everywhere before we start waiting for the requests
> > > to finish.  What is missing?:
> > 
> > memory barrier should always be used as pair, and you should have mentioned
> > that the implied barrier in test_and_set_bit_lock pair from sbitmap_get()
> > is pair of smp_mb__after_atomic() in blk_mq_hctx_notify_offline().
> 
> Documentation/core-api/atomic_ops.rst makes it pretty clear that the
> special smp_mb__before_atomic and smp_mb__after_atomic barriers are only
> used around the set_bit/clear_bit/change_bit operations, and not on the
> test_bit side.  That is also how they are used in all the callsites I
> checked.

I didn't care if the barrier is smp_mb__after_atomic or smp_mb() because it
is added in slow path.

What I tried to express is that every SMP memory barrier use should be commented
clearly, especially about the pairing usage, see "SMP BARRIER PAIRING" section of
Documentation/memory-barriers.txt.

So please add comments around the new added smp_mb__after_atomic(),
something like:

/*
 * The pair of the following smp_mb__after_atomic() is smp_mb() implied in
 * test_and_set_bit_lock pair from sbitmap_get(), so that setting tag bit and
 * checking INACTIVE in blk_mq_get_tag() can be ordered, same with setting
 * INACTIVE and checking tag bit in blk_mq_hctx_notify_offline().
 */

> 
> > Then setting tag bit and checking INACTIVE in blk_mq_get_tag() can be ordered,
> > same with setting INACTIVE and checking tag bit in blk_mq_hctx_notify_offline().
> 
> But yes, even if not that would take care of it.

The OPs have been ordered in this way, that is exactly purpose of the added memory
barrier.

thanks,
Ming


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: [PATCH 5/9] blk-mq: don't set data->ctx and data->hctx in blk_mq_alloc_request_hctx
  2020-05-20  1:18                     ` Ming Lei
@ 2020-05-20  3:04                       ` Ming Lei
  2020-05-20  8:03                         ` io_uring vs CPU hotplug, was " Christoph Hellwig
  0 siblings, 1 reply; 47+ messages in thread
From: Ming Lei @ 2020-05-20  3:04 UTC (permalink / raw)
  To: Christoph Hellwig, linux-kernel
  Cc: Thomas Gleixner, linux-block, John Garry, Bart Van Assche,
	Hannes Reinecke

On Wed, May 20, 2020 at 09:18:23AM +0800, Ming Lei wrote:
> On Tue, May 19, 2020 at 05:30:00PM +0200, Christoph Hellwig wrote:
> > On Tue, May 19, 2020 at 09:54:20AM +0800, Ming Lei wrote:
> > > As Thomas clarified, workqueue hasn't such issue any more, and only other
> > > per CPU kthreads can run until the CPU clears the online bit.
> > > 
> > > So the question is if IO can be submitted from such kernel context?
> > 
> > What other per-CPU kthreads even exist?
> 
> I don't know, so expose to wider audiences.

One user is io uring with IORING_SETUP_SQPOLL & IORING_SETUP_SQ_AFF, see
io_sq_offload_start(), and it is a IO submission kthread.

Thanks,
Ming


^ permalink raw reply	[flat|nested] 47+ messages in thread

* io_uring vs CPU hotplug, was Re: [PATCH 5/9] blk-mq: don't set data->ctx and data->hctx in blk_mq_alloc_request_hctx
  2020-05-20  3:04                       ` Ming Lei
@ 2020-05-20  8:03                         ` Christoph Hellwig
  2020-05-20 14:45                           ` Jens Axboe
  0 siblings, 1 reply; 47+ messages in thread
From: Christoph Hellwig @ 2020-05-20  8:03 UTC (permalink / raw)
  To: Jens Axboe, Ming Lei
  Cc: Christoph Hellwig, linux-kernel, Thomas Gleixner, linux-block,
	John Garry, Bart Van Assche, Hannes Reinecke, io-uring

On Wed, May 20, 2020 at 11:04:24AM +0800, Ming Lei wrote:
> On Wed, May 20, 2020 at 09:18:23AM +0800, Ming Lei wrote:
> > On Tue, May 19, 2020 at 05:30:00PM +0200, Christoph Hellwig wrote:
> > > On Tue, May 19, 2020 at 09:54:20AM +0800, Ming Lei wrote:
> > > > As Thomas clarified, workqueue hasn't such issue any more, and only other
> > > > per CPU kthreads can run until the CPU clears the online bit.
> > > > 
> > > > So the question is if IO can be submitted from such kernel context?
> > > 
> > > What other per-CPU kthreads even exist?
> > 
> > I don't know, so expose to wider audiences.
> 
> One user is io uring with IORING_SETUP_SQPOLL & IORING_SETUP_SQ_AFF, see
> io_sq_offload_start(), and it is a IO submission kthread.

As far as I can tell that code is buggy, as it still needs to migrate
the thread away when the cpu is offlined.  This isn't a per-cpu kthread
in the sense of having one for each CPU.

Jens?

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: blk-mq: improvement CPU hotplug (simplified version) v2
  2020-05-19 15:30   ` Christoph Hellwig
  2020-05-19 17:17     ` John Garry
@ 2020-05-20 14:35     ` John Garry
  1 sibling, 0 replies; 47+ messages in thread
From: John Garry @ 2020-05-20 14:35 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-block, Bart Van Assche, Hannes Reinecke, Thomas Gleixner, Ming Lei

On 19/05/2020 16:30, Christoph Hellwig wrote:
> On Mon, May 18, 2020 at 12:49:14PM +0100, John Garry wrote:
>>> A git tree is available here:
>>>
>>>       git://git.infradead.org/users/hch/block.git blk-mq-hotplug
>>>
>>> Gitweb:
>>>
>>>       http://git.infradead.org/users/hch/block.git/shortlog/refs/heads/blk-mq-hotplug
>>> .
>>>
>>
>> FWIW, I tested this series for cpu hotplug and it looked ok.
> 
> Can you also test the blk-mq-hotplug.2 branch?
> .
> 

Yeah, it looks ok. No timeouts for CPU hotplug.

Cheers,
John

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: io_uring vs CPU hotplug, was Re: [PATCH 5/9] blk-mq: don't set data->ctx and data->hctx in blk_mq_alloc_request_hctx
  2020-05-20  8:03                         ` io_uring vs CPU hotplug, was " Christoph Hellwig
@ 2020-05-20 14:45                           ` Jens Axboe
  2020-05-20 15:20                             ` Jens Axboe
  0 siblings, 1 reply; 47+ messages in thread
From: Jens Axboe @ 2020-05-20 14:45 UTC (permalink / raw)
  To: Christoph Hellwig, Ming Lei
  Cc: linux-kernel, Thomas Gleixner, linux-block, John Garry,
	Bart Van Assche, Hannes Reinecke, io-uring

On 5/20/20 2:03 AM, Christoph Hellwig wrote:
> On Wed, May 20, 2020 at 11:04:24AM +0800, Ming Lei wrote:
>> On Wed, May 20, 2020 at 09:18:23AM +0800, Ming Lei wrote:
>>> On Tue, May 19, 2020 at 05:30:00PM +0200, Christoph Hellwig wrote:
>>>> On Tue, May 19, 2020 at 09:54:20AM +0800, Ming Lei wrote:
>>>>> As Thomas clarified, workqueue hasn't such issue any more, and only other
>>>>> per CPU kthreads can run until the CPU clears the online bit.
>>>>>
>>>>> So the question is if IO can be submitted from such kernel context?
>>>>
>>>> What other per-CPU kthreads even exist?
>>>
>>> I don't know, so expose to wider audiences.
>>
>> One user is io uring with IORING_SETUP_SQPOLL & IORING_SETUP_SQ_AFF, see
>> io_sq_offload_start(), and it is a IO submission kthread.
> 
> As far as I can tell that code is buggy, as it still needs to migrate
> the thread away when the cpu is offlined.  This isn't a per-cpu kthread
> in the sense of having one for each CPU.
> 
> Jens?

It just uses kthread_create_on_cpu(), nothing home grown. Pretty sure
they just break affinity if that CPU goes offline.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: io_uring vs CPU hotplug, was Re: [PATCH 5/9] blk-mq: don't set data->ctx and data->hctx in blk_mq_alloc_request_hctx
  2020-05-20 14:45                           ` Jens Axboe
@ 2020-05-20 15:20                             ` Jens Axboe
  2020-05-20 15:31                               ` Christoph Hellwig
  2020-05-20 19:41                               ` Thomas Gleixner
  0 siblings, 2 replies; 47+ messages in thread
From: Jens Axboe @ 2020-05-20 15:20 UTC (permalink / raw)
  To: Christoph Hellwig, Ming Lei
  Cc: linux-kernel, Thomas Gleixner, linux-block, John Garry,
	Bart Van Assche, Hannes Reinecke, io-uring

On 5/20/20 8:45 AM, Jens Axboe wrote:
> On 5/20/20 2:03 AM, Christoph Hellwig wrote:
>> On Wed, May 20, 2020 at 11:04:24AM +0800, Ming Lei wrote:
>>> On Wed, May 20, 2020 at 09:18:23AM +0800, Ming Lei wrote:
>>>> On Tue, May 19, 2020 at 05:30:00PM +0200, Christoph Hellwig wrote:
>>>>> On Tue, May 19, 2020 at 09:54:20AM +0800, Ming Lei wrote:
>>>>>> As Thomas clarified, workqueue hasn't such issue any more, and only other
>>>>>> per CPU kthreads can run until the CPU clears the online bit.
>>>>>>
>>>>>> So the question is if IO can be submitted from such kernel context?
>>>>>
>>>>> What other per-CPU kthreads even exist?
>>>>
>>>> I don't know, so expose to wider audiences.
>>>
>>> One user is io uring with IORING_SETUP_SQPOLL & IORING_SETUP_SQ_AFF, see
>>> io_sq_offload_start(), and it is a IO submission kthread.
>>
>> As far as I can tell that code is buggy, as it still needs to migrate
>> the thread away when the cpu is offlined.  This isn't a per-cpu kthread
>> in the sense of having one for each CPU.
>>
>> Jens?
> 
> It just uses kthread_create_on_cpu(), nothing home grown. Pretty sure
> they just break affinity if that CPU goes offline.

Just checked, and it works fine for me. If I create an SQPOLL ring with
SQ_AFF set and bound to CPU 3, if CPU 3 goes offline, then the kthread
just appears unbound but runs just fine. When CPU 3 comes online again,
the mask appears correct.

So don't think there's anything wrong on that side. The affinity is a
performance optimization, not a correctness issue. Really not much we
can do if the chosen CPU is offlined, apart from continuing to chug along.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: io_uring vs CPU hotplug, was Re: [PATCH 5/9] blk-mq: don't set data->ctx and data->hctx in blk_mq_alloc_request_hctx
  2020-05-20 15:20                             ` Jens Axboe
@ 2020-05-20 15:31                               ` Christoph Hellwig
  2020-05-20 19:41                               ` Thomas Gleixner
  1 sibling, 0 replies; 47+ messages in thread
From: Christoph Hellwig @ 2020-05-20 15:31 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Christoph Hellwig, Ming Lei, linux-kernel, Thomas Gleixner,
	linux-block, John Garry, Bart Van Assche, Hannes Reinecke,
	io-uring

On Wed, May 20, 2020 at 09:20:50AM -0600, Jens Axboe wrote:
> Just checked, and it works fine for me. If I create an SQPOLL ring with
> SQ_AFF set and bound to CPU 3, if CPU 3 goes offline, then the kthread
> just appears unbound but runs just fine. When CPU 3 comes online again,
> the mask appears correct.
> 
> So don't think there's anything wrong on that side. The affinity is a
> performance optimization, not a correctness issue. Really not much we
> can do if the chosen CPU is offlined, apart from continue to chug along.

Ok, that sounds pretty sensible.

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: io_uring vs CPU hotplug, was Re: [PATCH 5/9] blk-mq: don't set data->ctx and data->hctx in blk_mq_alloc_request_hctx
  2020-05-20 15:20                             ` Jens Axboe
  2020-05-20 15:31                               ` Christoph Hellwig
@ 2020-05-20 19:41                               ` Thomas Gleixner
  2020-05-20 20:18                                 ` Jens Axboe
  1 sibling, 1 reply; 47+ messages in thread
From: Thomas Gleixner @ 2020-05-20 19:41 UTC (permalink / raw)
  To: Jens Axboe, Christoph Hellwig, Ming Lei
  Cc: linux-kernel, linux-block, John Garry, Bart Van Assche,
	Hannes Reinecke, io-uring

Jens Axboe <axboe@kernel.dk> writes:
> On 5/20/20 8:45 AM, Jens Axboe wrote:
>> It just uses kthread_create_on_cpu(), nothing home grown. Pretty sure
>> they just break affinity if that CPU goes offline.
>
> Just checked, and it works fine for me. If I create an SQPOLL ring with
> SQ_AFF set and bound to CPU 3, if CPU 3 goes offline, then the kthread
> just appears unbound but runs just fine. When CPU 3 comes online again,
> the mask appears correct.

When exactly during the unplug operation is it unbound?

Thanks,

        tglx

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: io_uring vs CPU hotplug, was Re: [PATCH 5/9] blk-mq: don't set data->ctx and data->hctx in blk_mq_alloc_request_hctx
  2020-05-20 19:41                               ` Thomas Gleixner
@ 2020-05-20 20:18                                 ` Jens Axboe
  2020-05-20 22:14                                   ` Thomas Gleixner
  0 siblings, 1 reply; 47+ messages in thread
From: Jens Axboe @ 2020-05-20 20:18 UTC (permalink / raw)
  To: Thomas Gleixner, Christoph Hellwig, Ming Lei
  Cc: linux-kernel, linux-block, John Garry, Bart Van Assche,
	Hannes Reinecke, io-uring

On 5/20/20 1:41 PM, Thomas Gleixner wrote:
> Jens Axboe <axboe@kernel.dk> writes:
>> On 5/20/20 8:45 AM, Jens Axboe wrote:
>>> It just uses kthread_create_on_cpu(), nothing home grown. Pretty sure
>>> they just break affinity if that CPU goes offline.
>>
>> Just checked, and it works fine for me. If I create an SQPOLL ring with
>> SQ_AFF set and bound to CPU 3, if CPU 3 goes offline, then the kthread
>> just appears unbound but runs just fine. When CPU 3 comes online again,
>> the mask appears correct.
> 
> When exactly during the unplug operation is it unbound?

When the CPU has been fully offlined. I check the affinity mask, it
reports 0. But it's still being scheduled, and it's processing work.
Here's an example, PID 420 is the thread in question:

[root@archlinux cpu3]# taskset -p 420
pid 420's current affinity mask: 8
[root@archlinux cpu3]# echo 0 > online 
[root@archlinux cpu3]# taskset -p 420
pid 420's current affinity mask: 0
[root@archlinux cpu3]# echo 1 > online 
[root@archlinux cpu3]# taskset -p 420
pid 420's current affinity mask: 8

So as far as I can tell, it's working fine for me with the goals
I have for that kthread.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: io_uring vs CPU hotplug, was Re: [PATCH 5/9] blk-mq: don't set data->ctx and data->hctx in blk_mq_alloc_request_hctx
  2020-05-20 20:18                                 ` Jens Axboe
@ 2020-05-20 22:14                                   ` Thomas Gleixner
  2020-05-20 22:40                                     ` Jens Axboe
  2020-05-21  2:27                                     ` Ming Lei
  0 siblings, 2 replies; 47+ messages in thread
From: Thomas Gleixner @ 2020-05-20 22:14 UTC (permalink / raw)
  To: Jens Axboe, Christoph Hellwig, Ming Lei
  Cc: linux-kernel, linux-block, John Garry, Bart Van Assche,
	Hannes Reinecke, io-uring, Peter Zijlstra

Jens Axboe <axboe@kernel.dk> writes:

> On 5/20/20 1:41 PM, Thomas Gleixner wrote:
>> Jens Axboe <axboe@kernel.dk> writes:
>>> On 5/20/20 8:45 AM, Jens Axboe wrote:
>>>> It just uses kthread_create_on_cpu(), nothing home grown. Pretty sure
>>>> they just break affinity if that CPU goes offline.
>>>
>>> Just checked, and it works fine for me. If I create an SQPOLL ring with
>>> SQ_AFF set and bound to CPU 3, if CPU 3 goes offline, then the kthread
>>> just appears unbound but runs just fine. When CPU 3 comes online again,
>>> the mask appears correct.
>> 
>> When exactly during the unplug operation is it unbound?
>
> When the CPU has been fully offlined. I check the affinity mask, it
> reports 0. But it's still being scheduled, and it's processing work.
> Here's an example, PID 420 is the thread in question:
>
> [root@archlinux cpu3]# taskset -p 420
> pid 420's current affinity mask: 8
> [root@archlinux cpu3]# echo 0 > online 
> [root@archlinux cpu3]# taskset -p 420
> pid 420's current affinity mask: 0
> [root@archlinux cpu3]# echo 1 > online 
> [root@archlinux cpu3]# taskset -p 420
> pid 420's current affinity mask: 8
>
> So as far as I can tell, it's working fine for me with the goals
> I have for that kthread.

Works for me is not really useful information and does not answer my
question:

>> When exactly during the unplug operation is it unbound?

The problem Ming and Christoph are trying to solve requires that the
thread is migrated _before_ the hardware queue is shut down and
drained. That's why I asked for the exact point where this happens.

When the CPU is finally offlined, i.e. the CPU cleared the online bit in
the online mask is definitely too late simply because it still runs on
that outgoing CPU _after_ the hardware queue is shut down and drained.

This needs more thought and changes to sched and kthread so that the
kthread breaks affinity once the CPU goes offline. Too tired to figure
that out right now.

Thanks,

        tglx





^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: io_uring vs CPU hotplug, was Re: [PATCH 5/9] blk-mq: don't set data->ctx and data->hctx in blk_mq_alloc_request_hctx
  2020-05-20 22:14                                   ` Thomas Gleixner
@ 2020-05-20 22:40                                     ` Jens Axboe
  2020-05-21  2:27                                     ` Ming Lei
  1 sibling, 0 replies; 47+ messages in thread
From: Jens Axboe @ 2020-05-20 22:40 UTC (permalink / raw)
  To: Thomas Gleixner, Christoph Hellwig, Ming Lei
  Cc: linux-kernel, linux-block, John Garry, Bart Van Assche,
	Hannes Reinecke, io-uring, Peter Zijlstra

On 5/20/20 4:14 PM, Thomas Gleixner wrote:
> Jens Axboe <axboe@kernel.dk> writes:
> 
>> On 5/20/20 1:41 PM, Thomas Gleixner wrote:
>>> Jens Axboe <axboe@kernel.dk> writes:
>>>> On 5/20/20 8:45 AM, Jens Axboe wrote:
>>>>> It just uses kthread_create_on_cpu(), nothing home grown. Pretty sure
>>>>> they just break affinity if that CPU goes offline.
>>>>
>>>> Just checked, and it works fine for me. If I create an SQPOLL ring with
>>>> SQ_AFF set and bound to CPU 3, if CPU 3 goes offline, then the kthread
>>>> just appears unbound but runs just fine. When CPU 3 comes online again,
>>>> the mask appears correct.
>>>
>>> When exactly during the unplug operation is it unbound?
>>
>> When the CPU has been fully offlined. I check the affinity mask, it
>> reports 0. But it's still being scheduled, and it's processing work.
>> Here's an example, PID 420 is the thread in question:
>>
>> [root@archlinux cpu3]# taskset -p 420
>> pid 420's current affinity mask: 8
>> [root@archlinux cpu3]# echo 0 > online 
>> [root@archlinux cpu3]# taskset -p 420
>> pid 420's current affinity mask: 0
>> [root@archlinux cpu3]# echo 1 > online 
>> [root@archlinux cpu3]# taskset -p 420
>> pid 420's current affinity mask: 8
>>
>> So as far as I can tell, it's working fine for me with the goals
>> I have for that kthread.
> 
> Works for me is not really useful information and does not answer my
> question:
> 
>>> When exactly during the unplug operation is it unbound?

I agree, and that question is relevant to the block side of things. What
Christoph asked in this particular sub-thread was specifically for the
io_uring sqpoll thread, and that's what I was addressing. For that, it
doesn't matter _when_ it becomes unbound. All that matters is that it
breaks affinity and keeps working.

> The problem Ming and Christoph are trying to solve requires that the
> thread is migrated _before_ the hardware queue is shut down and
> drained. That's why I asked for the exact point where this happens.

Right, and I haven't looked into that at all, so don't know the answer
to that question.

> When the CPU is finally offlined, i.e. the CPU cleared the online bit in
> the online mask is definitely too late simply because it still runs on
> that outgoing CPU _after_ the hardware queue is shut down and drained.
> 
> This needs more thought and changes to sched and kthread so that the
> kthread breaks affinity once the CPU goes offline. Too tired to figure
> that out right now.

Yes, to provide the needed guarantees for the block ctx and hctx
mappings we'll need to know exactly at what stage it ceases to run on
that CPU.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: io_uring vs CPU hotplug, was Re: [PATCH 5/9] blk-mq: don't set data->ctx and data->hctx in blk_mq_alloc_request_hctx
  2020-05-20 22:14                                   ` Thomas Gleixner
  2020-05-20 22:40                                     ` Jens Axboe
@ 2020-05-21  2:27                                     ` Ming Lei
  2020-05-21  8:13                                       ` Thomas Gleixner
  1 sibling, 1 reply; 47+ messages in thread
From: Ming Lei @ 2020-05-21  2:27 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Jens Axboe, Christoph Hellwig, linux-kernel, linux-block,
	John Garry, Bart Van Assche, Hannes Reinecke, io-uring,
	Peter Zijlstra

On Thu, May 21, 2020 at 12:14:18AM +0200, Thomas Gleixner wrote:
> Jens Axboe <axboe@kernel.dk> writes:
> 
> > On 5/20/20 1:41 PM, Thomas Gleixner wrote:
> >> Jens Axboe <axboe@kernel.dk> writes:
> >>> On 5/20/20 8:45 AM, Jens Axboe wrote:
> >>>> It just uses kthread_create_on_cpu(), nothing home grown. Pretty sure
> >>>> they just break affinity if that CPU goes offline.
> >>>
> >>> Just checked, and it works fine for me. If I create an SQPOLL ring with
> >>> SQ_AFF set and bound to CPU 3, if CPU 3 goes offline, then the kthread
> >>> just appears unbound but runs just fine. When CPU 3 comes online again,
> >>> the mask appears correct.
> >> 
> >> When exactly during the unplug operation is it unbound?
> >
> > When the CPU has been fully offlined. I check the affinity mask, it
> > reports 0. But it's still being scheduled, and it's processing work.
> > Here's an example, PID 420 is the thread in question:
> >
> > [root@archlinux cpu3]# taskset -p 420
> > pid 420's current affinity mask: 8
> > [root@archlinux cpu3]# echo 0 > online 
> > [root@archlinux cpu3]# taskset -p 420
> > pid 420's current affinity mask: 0
> > [root@archlinux cpu3]# echo 1 > online 
> > [root@archlinux cpu3]# taskset -p 420
> > pid 420's current affinity mask: 8
> >
> > So as far as I can tell, it's working fine for me with the goals
> > I have for that kthread.
> 
> Works for me is not really useful information and does not answer my
> question:
> 
> >> When exactly during the unplug operation is it unbound?
> 
> The problem Ming and Christoph are trying to solve requires that the
> thread is migrated _before_ the hardware queue is shut down and
> drained. That's why I asked for the exact point where this happens.
> 
> When the CPU is finally offlined, i.e. the CPU cleared the online bit in
> the online mask is definitely too late simply because it still runs on
> that outgoing CPU _after_ the hardware queue is shut down and drained.

IMO, the patch in Christoph's blk-mq-hotplug.2 still works for percpu
kthread.

It is just not optimal in the retrying, but it should be fine. When the
percpu kthread is scheduled on the CPU to be offlined:

- if the kthread doesn't observe the INACTIVE flag, the allocated request
will be drained.

- otherwise, the kthread just retries and retries to allocate & release,
and sooner or later, its time slice is consumed, and migrated out, and the
cpu hotplug handler will get chance to run and move on, then the cpu is
shutdown.

- After the cpu is shutdown, the percpu kthread becomes unbound, and
the allocation from new online cpu will succeed.

Thanks,
Ming


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: io_uring vs CPU hotplug, was Re: [PATCH 5/9] blk-mq: don't set data->ctx and data->hctx in blk_mq_alloc_request_hctx
  2020-05-21  2:27                                     ` Ming Lei
@ 2020-05-21  8:13                                       ` Thomas Gleixner
  2020-05-21  9:23                                         ` Ming Lei
  0 siblings, 1 reply; 47+ messages in thread
From: Thomas Gleixner @ 2020-05-21  8:13 UTC (permalink / raw)
  To: Ming Lei
  Cc: Jens Axboe, Christoph Hellwig, linux-kernel, linux-block,
	John Garry, Bart Van Assche, Hannes Reinecke, io-uring,
	Peter Zijlstra

Ming Lei <ming.lei@redhat.com> writes:
> On Thu, May 21, 2020 at 12:14:18AM +0200, Thomas Gleixner wrote:
>> When the CPU is finally offlined, i.e. the CPU cleared the online bit in
>> the online mask is definitely too late simply because it still runs on
>> that outgoing CPU _after_ the hardware queue is shut down and drained.
>
> IMO, the patch in Christoph's blk-mq-hotplug.2 still works for percpu
> kthread.
>
> It is just not optimal in the retrying, but it should be fine. When the
> percpu kthread is scheduled on the CPU to be offlined:
>
> - if the kthread doesn't observe the INACTIVE flag, the allocated request
> will be drained.
>
> - otherwise, the kthread just retries and retries to allocate & release,
> and sooner or later, its time slice is consumed, and migrated out, and the
> cpu hotplug handler will get chance to run and move on, then the cpu is
> shutdown.

1) This is based on the assumption that the kthread is in the SCHED_OTHER
   scheduling class. Is that really a valid assumption?

2) What happens in the following scenario:

   unplug

     mq_offline
       set_ctx_inactive()
       drain_io()
       
   io_kthread()
       try_queue()
       wait_on_ctx()

   Can this happen and if so what will wake up that thread?

I'm not familiar enough with that code to answer #2, but this really
wants to be properly described and documented.

Thanks,

        tglx

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: io_uring vs CPU hotplug, was Re: [PATCH 5/9] blk-mq: don't set data->ctx and data->hctx in blk_mq_alloc_request_hctx
  2020-05-21  8:13                                       ` Thomas Gleixner
@ 2020-05-21  9:23                                         ` Ming Lei
  2020-05-21 18:39                                           ` Thomas Gleixner
  0 siblings, 1 reply; 47+ messages in thread
From: Ming Lei @ 2020-05-21  9:23 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Jens Axboe, Christoph Hellwig, linux-kernel, linux-block,
	John Garry, Bart Van Assche, Hannes Reinecke, io-uring,
	Peter Zijlstra

Hi Thomas,

On Thu, May 21, 2020 at 10:13:59AM +0200, Thomas Gleixner wrote:
> Ming Lei <ming.lei@redhat.com> writes:
> > On Thu, May 21, 2020 at 12:14:18AM +0200, Thomas Gleixner wrote:
> >> When the CPU is finally offlined, i.e. the CPU cleared the online bit in
> >> the online mask is definitely too late simply because it still runs on
> >> that outgoing CPU _after_ the hardware queue is shut down and drained.
> >
> > IMO, the patch in Christoph's blk-mq-hotplug.2 still works for percpu
> > kthread.
> >
> > It is just not optimal in the retrying, but it should be fine. When the
> > percpu kthread is scheduled on the CPU to be offlined:
> >
> > - if the kthread doesn't observe the INACTIVE flag, the allocated request
> > will be drained.
> >
> > - otherwise, the kthread just retries and retries to allocate & release,
> > and sooner or later, its time slice is consumed, and migrated out, and the
> > cpu hotplug handler will get chance to run and move on, then the cpu is
> > shutdown.
> 
> 1) This is based on the assumption that the kthread is in the SCHED_OTHER
>    scheduling class. Is that really a valid assumption?

Given it is unlikely path, we can add msleep() before retrying when INACTIVE bit
is observed by current thread, and this way can avoid spinning and should work
for other schedulers.

> 
> 2) What happens in the following scenario:
> 
>    unplug
> 
>      mq_offline
>        set_ctx_inactive()
>        drain_io()
>        
>    io_kthread()
>        try_queue()
>        wait_on_ctx()
> 
>    Can this happen and if so what will wake up that thread?

drain_io() releases all tags of this hctx, then wait_on_ctx() will be woken up
after any tag is released.

If wait_on_ctx() waits for some other generic resource, it will be woken up
after this resource becomes available.

thanks,
Ming


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: io_uring vs CPU hotplug, was Re: [PATCH 5/9] blk-mq: don't set data->ctx and data->hctx in blk_mq_alloc_request_hctx
  2020-05-21  9:23                                         ` Ming Lei
@ 2020-05-21 18:39                                           ` Thomas Gleixner
  2020-05-21 18:45                                             ` Jens Axboe
  2020-05-22  1:57                                             ` Ming Lei
  0 siblings, 2 replies; 47+ messages in thread
From: Thomas Gleixner @ 2020-05-21 18:39 UTC (permalink / raw)
  To: Ming Lei
  Cc: Jens Axboe, Christoph Hellwig, linux-kernel, linux-block,
	John Garry, Bart Van Assche, Hannes Reinecke, io-uring,
	Peter Zijlstra

Ming,

Ming Lei <ming.lei@redhat.com> writes:
> On Thu, May 21, 2020 at 10:13:59AM +0200, Thomas Gleixner wrote:
>> Ming Lei <ming.lei@redhat.com> writes:
>> > On Thu, May 21, 2020 at 12:14:18AM +0200, Thomas Gleixner wrote:
>> > - otherwise, the kthread just retries and retries to allocate & release,
>> > and sooner or later, its time slice is consumed, and migrated out, and the
>> > cpu hotplug handler will get chance to run and move on, then the cpu is
>> > shutdown.
>> 
>> 1) This is based on the assumption that the kthread is in the SCHED_OTHER
>>    scheduling class. Is that really a valid assumption?
>
> Given it is unlikely path, we can add msleep() before retrying when INACTIVE bit
> is observed by current thread, and this way can avoid spinning and should work
> for other schedulers.

That should work, but pretty is something else

>> 
>> 2) What happens in the following scenario:
>> 
>>    unplug
>> 
>>      mq_offline
>>        set_ctx_inactive()
>>        drain_io()
>>        
>>    io_kthread()
>>        try_queue()
>>        wait_on_ctx()
>> 
>>    Can this happen and if so what will wake up that thread?
>
> drain_io() releases all tag of this hctx, then wait_on_ctx() will be waken up
> after any tag is released.

drain_io() is already done ...

So looking at that thread function:

static int io_sq_thread(void *data)
{
	struct io_ring_ctx *ctx = data;

        while (...) {
              ....
	      to_submit = io_sqring_entries(ctx);

--> preemption

hotplug runs
   mq_offline()
      set_ctx_inactive();
      drain_io();
      finished();

--> thread runs again

      mutex_lock(&ctx->uring_lock);
      ret = io_submit_sqes(ctx, to_submit, NULL, -1, true);
      mutex_unlock(&ctx->uring_lock);

      ....

      if (!to_submit || ret == -EBUSY)
          ...
      	  wait_on_ctx();

Can this happen or did drain_io() already take care of the 'to_submit'
items and the call to io_submit_sqes() turns into a zero action ?

If the above happens then nothing will wake it up because the context
draining is done and finished.

Thanks,

        tglx

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: io_uring vs CPU hotplug, was Re: [PATCH 5/9] blk-mq: don't set data->ctx and data->hctx in blk_mq_alloc_request_hctx
  2020-05-21 18:39                                           ` Thomas Gleixner
@ 2020-05-21 18:45                                             ` Jens Axboe
  2020-05-21 20:00                                               ` Thomas Gleixner
  2020-05-22  1:57                                             ` Ming Lei
  1 sibling, 1 reply; 47+ messages in thread
From: Jens Axboe @ 2020-05-21 18:45 UTC (permalink / raw)
  To: Thomas Gleixner, Ming Lei
  Cc: Christoph Hellwig, linux-kernel, linux-block, John Garry,
	Bart Van Assche, Hannes Reinecke, io-uring, Peter Zijlstra

On 5/21/20 12:39 PM, Thomas Gleixner wrote:
> Ming,
> 
> Ming Lei <ming.lei@redhat.com> writes:
>> On Thu, May 21, 2020 at 10:13:59AM +0200, Thomas Gleixner wrote:
>>> Ming Lei <ming.lei@redhat.com> writes:
>>>> On Thu, May 21, 2020 at 12:14:18AM +0200, Thomas Gleixner wrote:
>>>> - otherwise, the kthread just retries and retries to allocate & release,
>>>> and sooner or later, its time slice is consumed, and migrated out, and the
>>>> cpu hotplug handler will get chance to run and move on, then the cpu is
>>>> shutdown.
>>>
>>> 1) This is based on the assumption that the kthread is in the SCHED_OTHER
>>>    scheduling class. Is that really a valid assumption?
>>
>> Given it is unlikely path, we can add msleep() before retrying when INACTIVE bit
>> is observed by current thread, and this way can avoid spinning and should work
>> for other schedulers.
> 
> That should work, but pretty is something else
> 
>>>
>>> 2) What happens in the following scenario:
>>>
>>>    unplug
>>>
>>>      mq_offline
>>>        set_ctx_inactive()
>>>        drain_io()
>>>        
>>>    io_kthread()
>>>        try_queue()
>>>        wait_on_ctx()
>>>
>>>    Can this happen and if so what will wake up that thread?
>>
>> drain_io() releases all tag of this hctx, then wait_on_ctx() will be waken up
>> after any tag is released.
> 
> drain_io() is already done ...
> 
> So looking at that thread function:
> 
> static int io_sq_thread(void *data)
> {
> 	struct io_ring_ctx *ctx = data;
> 
>         while (...) {
>               ....
> 	      to_submit = io_sqring_entries(ctx);
> 
> --> preemption
> 
> hotplug runs
>    mq_offline()
>       set_ctx_inactive();
>       drain_io();
>       finished();
> 
> --> thread runs again
> 
>       mutex_lock(&ctx->uring_lock);
>       ret = io_submit_sqes(ctx, to_submit, NULL, -1, true);
>       mutex_unlock(&ctx->uring_lock);
> 
>       ....
> 
>       if (!to_submit || ret == -EBUSY)
>           ...
>       	  wait_on_ctx();
> 
> Can this happen or did drain_io() already take care of the 'to_submit'
> items and the call to io_submit_sqes() turns into a zero action ?
> 
> If the above happens then nothing will wake it up because the context
> draining is done and finished.

Again, this is mixing up io_uring and blk-mq. Maybe it's the fact that
both use 'ctx' that makes this confusing. On the blk-mq side, the 'ctx'
is the per-cpu queue context, for io_uring it's the io_uring instance.

io_sq_thread() doesn't care about any sort of percpu mappings, it's
happy as long as it'll keep running regardless of whether or not the
optional pinned CPU is selected and then offlined.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: io_uring vs CPU hotplug, was Re: [PATCH 5/9] blk-mq: don't set data->ctx and data->hctx in blk_mq_alloc_request_hctx
  2020-05-21 18:45                                             ` Jens Axboe
@ 2020-05-21 20:00                                               ` Thomas Gleixner
  0 siblings, 0 replies; 47+ messages in thread
From: Thomas Gleixner @ 2020-05-21 20:00 UTC (permalink / raw)
  To: Jens Axboe, Ming Lei
  Cc: Christoph Hellwig, linux-kernel, linux-block, John Garry,
	Bart Van Assche, Hannes Reinecke, io-uring, Peter Zijlstra

Jens Axboe <axboe@kernel.dk> writes:
> Again, this is mixing up io_uring and blk-mq. Maybe it's the fact that
> both use 'ctx' that makes this confusing. On the blk-mq side, the 'ctx'
> is the per-cpu queue context, for io_uring it's the io_uring instance.

Yes, that got me horribly confused. :)

> io_sq_thread() doesn't care about any sort of percpu mappings, it's
> happy as long as it'll keep running regardless of whether or not the
> optional pinned CPU is selected and then offlined.

Fair enough.

So aside of the potential spin forever if the uring thread is lifted to
an RT scheduling class, this looks all good.

Though I assume that if that thread is pinned and an admin pushes it into
RT scheduling, the spinning live lock can happen independent of cpu
hotplug.

Thanks,

        tglx

^ permalink raw reply	[flat|nested] 47+ messages in thread

* Re: io_uring vs CPU hotplug, was Re: [PATCH 5/9] blk-mq: don't set data->ctx and data->hctx in blk_mq_alloc_request_hctx
  2020-05-21 18:39                                           ` Thomas Gleixner
  2020-05-21 18:45                                             ` Jens Axboe
@ 2020-05-22  1:57                                             ` Ming Lei
  1 sibling, 0 replies; 47+ messages in thread
From: Ming Lei @ 2020-05-22  1:57 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Jens Axboe, Christoph Hellwig, linux-kernel, linux-block,
	John Garry, Bart Van Assche, Hannes Reinecke, io-uring,
	Peter Zijlstra

On Thu, May 21, 2020 at 08:39:16PM +0200, Thomas Gleixner wrote:
> Ming,
> 
> Ming Lei <ming.lei@redhat.com> writes:
> > On Thu, May 21, 2020 at 10:13:59AM +0200, Thomas Gleixner wrote:
> >> Ming Lei <ming.lei@redhat.com> writes:
> >> > On Thu, May 21, 2020 at 12:14:18AM +0200, Thomas Gleixner wrote:
> >> > - otherwise, the kthread just retries and retries to allocate & release,
> >> > and sooner or later, its time slice is consumed, and migrated out, and the
> >> > cpu hotplug handler will get chance to run and move on, then the cpu is
> >> > shutdown.
> >> 
> >> 1) This is based on the assumption that the kthread is in the SCHED_OTHER
> >>    scheduling class. Is that really a valid assumption?
> >
> > Given it is unlikely path, we can add msleep() before retrying when INACTIVE bit
> > is observed by current thread, and this way can avoid spinning and should work
> > for other schedulers.
> 
> That should work, but pretty is something else
> 
> >> 
> >> 2) What happens in the following scenario:
> >> 
> >>    unplug
> >> 
> >>      mq_offline
> >>        set_ctx_inactive()
> >>        drain_io()
> >>        
> >>    io_kthread()
> >>        try_queue()
> >>        wait_on_ctx()
> >> 
> >>    Can this happen and if so what will wake up that thread?
> >
> > drain_io() releases all tag of this hctx, then wait_on_ctx() will be waken up
> > after any tag is released.
> 
> drain_io() is already done ...
> 
> So looking at that thread function:
> 
> static int io_sq_thread(void *data)
> {
> 	struct io_ring_ctx *ctx = data;
> 
>         while (...) {
>               ....
> 	      to_submit = io_sqring_entries(ctx);
> 
> --> preemption
> 
> hotplug runs
>    mq_offline()
>       set_ctx_inactive();
>       drain_io();
>       finished();
> 
> --> thread runs again
> 
>       mutex_lock(&ctx->uring_lock);
>       ret = io_submit_sqes(ctx, to_submit, NULL, -1, true);
>       mutex_unlock(&ctx->uring_lock);
> 
>       ....
> 
>       if (!to_submit || ret == -EBUSY)
>           ...
>       	  wait_on_ctx();
> 
> Can this happen or did drain_io() already take care of the 'to_submit'
> items and the call to io_submit_sqes() turns into a zero action ?
> 
> If the above happens then nothing will wake it up because the context
> draining is done and finished.

As Jens replied, you mixed the ctx from io uring and blk-mq, both are in
two worlds.

Any wait in this percpu kthread should just wait for generic resource,
not directly related with blk-mq's inactive hctx. Once this thread is
migrated to other online cpu, it will move on.


Thanks,
Ming


^ permalink raw reply	[flat|nested] 47+ messages in thread

end of thread, back to index

Thread overview: 47+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-05-18  6:39 blk-mq: improvement CPU hotplug (simplified version) v2 Christoph Hellwig
2020-05-18  6:39 ` [PATCH 1/9] blk-mq: split out a __blk_mq_get_driver_tag helper Christoph Hellwig
2020-05-18  6:39 ` [PATCH 2/9] blk-mq: remove the bio argument to ->prepare_request Christoph Hellwig
2020-05-18  6:39 ` [PATCH 3/9] blk-mq: simplify the blk_mq_get_request calling convention Christoph Hellwig
2020-05-18  6:39 ` [PATCH 4/9] blk-mq: merge blk_mq_rq_ctx_init into __blk_mq_alloc_request Christoph Hellwig
2020-05-18  6:39 ` [PATCH 5/9] blk-mq: don't set data->ctx and data->hctx in blk_mq_alloc_request_hctx Christoph Hellwig
2020-05-18  8:32   ` Thomas Gleixner
2020-05-18  9:31     ` Ming Lei
2020-05-18 10:42       ` Thomas Gleixner
2020-05-18 11:54         ` Ming Lei
2020-05-18 13:16           ` Christoph Hellwig
2020-05-18 14:11             ` Ming Lei
2020-05-18 16:56               ` Christoph Hellwig
2020-05-18 18:38                 ` Thomas Gleixner
2020-05-18 18:45                   ` Christoph Hellwig
2020-05-18 18:59                     ` Thomas Gleixner
2020-05-19  1:54                 ` Ming Lei
2020-05-19 15:30                   ` Christoph Hellwig
2020-05-20  1:18                     ` Ming Lei
2020-05-20  3:04                       ` Ming Lei
2020-05-20  8:03                         ` io_uring vs CPU hotplug, was " Christoph Hellwig
2020-05-20 14:45                           ` Jens Axboe
2020-05-20 15:20                             ` Jens Axboe
2020-05-20 15:31                               ` Christoph Hellwig
2020-05-20 19:41                               ` Thomas Gleixner
2020-05-20 20:18                                 ` Jens Axboe
2020-05-20 22:14                                   ` Thomas Gleixner
2020-05-20 22:40                                     ` Jens Axboe
2020-05-21  2:27                                     ` Ming Lei
2020-05-21  8:13                                       ` Thomas Gleixner
2020-05-21  9:23                                         ` Ming Lei
2020-05-21 18:39                                           ` Thomas Gleixner
2020-05-21 18:45                                             ` Jens Axboe
2020-05-21 20:00                                               ` Thomas Gleixner
2020-05-22  1:57                                             ` Ming Lei
2020-05-18 18:47             ` Thomas Gleixner
2020-05-18 13:18           ` Thomas Gleixner
2020-05-18  6:39 ` [PATCH 6/9] blk-mq: don't set data->ctx and data->hctx in __blk_mq_alloc_request Christoph Hellwig
2020-05-18  6:39 ` [PATCH 7/9] blk-mq: disable preemption during allocating request tag Christoph Hellwig
2020-05-18  6:39 ` [PATCH 8/9] blk-mq: add blk_mq_all_tag_iter Christoph Hellwig
2020-05-18  6:39 ` [PATCH 9/9] blk-mq: drain I/O when all CPUs in a hctx are offline Christoph Hellwig
2020-05-18  8:42   ` John Garry
2020-05-18  9:21     ` Ming Lei
2020-05-18 11:49 ` blk-mq: improvement CPU hotplug (simplified version) v2 John Garry
2020-05-19 15:30   ` Christoph Hellwig
2020-05-19 17:17     ` John Garry
2020-05-20 14:35     ` John Garry

Linux-Block Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-block/0 linux-block/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-block linux-block/ https://lore.kernel.org/linux-block \
		linux-block@vger.kernel.org
	public-inbox-index linux-block

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-block


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git