* [PATCH 0/2] blk-mq: support physical CPU hotplug
@ 2018-01-12  2:53 Ming Lei
  2018-01-12  2:53 ` [PATCH 1/2] genirq/affinity: assign vectors to all possible CPUs Ming Lei
                   ` (3 more replies)
  0 siblings, 4 replies; 33+ messages in thread
From: Ming Lei @ 2018-01-12  2:53 UTC (permalink / raw)
  To: Jens Axboe, linux-block, Christoph Hellwig
  Cc: Christian Borntraeger, Stefan Haberland, Thomas Gleixner,
	linux-kernel, Ming Lei

Hi,

These two patches support physical CPU hotplug, so that blk-mq can scale
well when a new physical CPU is added or removed; this use case is common
in the VM world.

Also this patchset fixes the following warning reported by Christian
Borntraeger:

	https://marc.info/?l=linux-block&m=151092973417143&w=2

Christoph Hellwig (2):
  genirq/affinity: assign vectors to all possible CPUs
  blk-mq: simplify queue mapping & schedule with each possible CPU

 block/blk-mq.c        | 19 ++++++++-----------
 kernel/irq/affinity.c | 30 +++++++++++++++---------------
 2 files changed, 23 insertions(+), 26 deletions(-)

-- 
2.9.5


* [PATCH 1/2] genirq/affinity: assign vectors to all possible CPUs
  2018-01-12  2:53 [PATCH 0/2] blk-mq: support physical CPU hotplug Ming Lei
@ 2018-01-12  2:53 ` Ming Lei
  2018-01-12 19:35   ` Thomas Gleixner
  2018-01-12  2:53 ` [PATCH 2/2] blk-mq: simplify queue mapping & schedule with each possible CPU Ming Lei
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 33+ messages in thread
From: Ming Lei @ 2018-01-12  2:53 UTC (permalink / raw)
  To: Jens Axboe, linux-block, Christoph Hellwig
  Cc: Christian Borntraeger, Stefan Haberland, Thomas Gleixner,
	linux-kernel, Christoph Hellwig

From: Christoph Hellwig <hch@lst.de>

Currently we assign managed interrupt vectors to all present CPUs.  This
works fine for systems where we only online/offline CPUs.  But in the case
of systems that support physical CPU hotplug (or the virtualized version
of it) this means the additional CPUs declared in the ACPI tables or on
the command line are not catered for.  To fix this we'd either need to
introduce new hotplug CPU states just for this case, or we can start
assigning vectors to possible but not present CPUs.

Reported-by: Christian Borntraeger <borntraeger@de.ibm.com>
Tested-by: Christian Borntraeger <borntraeger@de.ibm.com>
Tested-by: Stefan Haberland <sth@linux.vnet.ibm.com>
Cc: linux-kernel@vger.kernel.org
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 kernel/irq/affinity.c | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index e12d35108225..a37a3b4b6342 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -39,7 +39,7 @@ static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
 	}
 }
 
-static cpumask_var_t *alloc_node_to_present_cpumask(void)
+static cpumask_var_t *alloc_node_to_possible_cpumask(void)
 {
 	cpumask_var_t *masks;
 	int node;
@@ -62,7 +62,7 @@ static cpumask_var_t *alloc_node_to_present_cpumask(void)
 	return NULL;
 }
 
-static void free_node_to_present_cpumask(cpumask_var_t *masks)
+static void free_node_to_possible_cpumask(cpumask_var_t *masks)
 {
 	int node;
 
@@ -71,22 +71,22 @@ static void free_node_to_present_cpumask(cpumask_var_t *masks)
 	kfree(masks);
 }
 
-static void build_node_to_present_cpumask(cpumask_var_t *masks)
+static void build_node_to_possible_cpumask(cpumask_var_t *masks)
 {
 	int cpu;
 
-	for_each_present_cpu(cpu)
+	for_each_possible_cpu(cpu)
 		cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);
 }
 
-static int get_nodes_in_cpumask(cpumask_var_t *node_to_present_cpumask,
+static int get_nodes_in_cpumask(cpumask_var_t *node_to_possible_cpumask,
 				const struct cpumask *mask, nodemask_t *nodemsk)
 {
 	int n, nodes = 0;
 
 	/* Calculate the number of nodes in the supplied affinity mask */
 	for_each_node(n) {
-		if (cpumask_intersects(mask, node_to_present_cpumask[n])) {
+		if (cpumask_intersects(mask, node_to_possible_cpumask[n])) {
 			node_set(n, *nodemsk);
 			nodes++;
 		}
@@ -109,7 +109,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 	int last_affv = affv + affd->pre_vectors;
 	nodemask_t nodemsk = NODE_MASK_NONE;
 	struct cpumask *masks;
-	cpumask_var_t nmsk, *node_to_present_cpumask;
+	cpumask_var_t nmsk, *node_to_possible_cpumask;
 
 	/*
 	 * If there aren't any vectors left after applying the pre/post
@@ -125,8 +125,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 	if (!masks)
 		goto out;
 
-	node_to_present_cpumask = alloc_node_to_present_cpumask();
-	if (!node_to_present_cpumask)
+	node_to_possible_cpumask = alloc_node_to_possible_cpumask();
+	if (!node_to_possible_cpumask)
 		goto out;
 
 	/* Fill out vectors at the beginning that don't need affinity */
@@ -135,8 +135,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 
 	/* Stabilize the cpumasks */
 	get_online_cpus();
-	build_node_to_present_cpumask(node_to_present_cpumask);
-	nodes = get_nodes_in_cpumask(node_to_present_cpumask, cpu_present_mask,
+	build_node_to_possible_cpumask(node_to_possible_cpumask);
+	nodes = get_nodes_in_cpumask(node_to_possible_cpumask, cpu_possible_mask,
 				     &nodemsk);
 
 	/*
@@ -146,7 +146,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 	if (affv <= nodes) {
 		for_each_node_mask(n, nodemsk) {
 			cpumask_copy(masks + curvec,
-				     node_to_present_cpumask[n]);
+				     node_to_possible_cpumask[n]);
 			if (++curvec == last_affv)
 				break;
 		}
@@ -160,7 +160,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 		vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes;
 
 		/* Get the cpus on this node which are in the mask */
-		cpumask_and(nmsk, cpu_present_mask, node_to_present_cpumask[n]);
+		cpumask_and(nmsk, cpu_possible_mask, node_to_possible_cpumask[n]);
 
 		/* Calculate the number of cpus per vector */
 		ncpus = cpumask_weight(nmsk);
@@ -192,7 +192,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 	/* Fill out vectors at the end that don't need affinity */
 	for (; curvec < nvecs; curvec++)
 		cpumask_copy(masks + curvec, irq_default_affinity);
-	free_node_to_present_cpumask(node_to_present_cpumask);
+	free_node_to_possible_cpumask(node_to_possible_cpumask);
 out:
 	free_cpumask_var(nmsk);
 	return masks;
@@ -214,7 +214,7 @@ int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity
 		return 0;
 
 	get_online_cpus();
-	ret = min_t(int, cpumask_weight(cpu_present_mask), vecs) + resv;
+	ret = min_t(int, cpumask_weight(cpu_possible_mask), vecs) + resv;
 	put_online_cpus();
 	return ret;
 }
-- 
2.9.5


* [PATCH 2/2] blk-mq: simplify queue mapping & schedule with each possible CPU
  2018-01-12  2:53 [PATCH 0/2] blk-mq: support physical CPU hotplug Ming Lei
  2018-01-12  2:53 ` [PATCH 1/2] genirq/affinity: assign vectors to all possible CPUs Ming Lei
@ 2018-01-12  2:53 ` Ming Lei
  2018-01-16 10:00   ` Stefan Haberland
  2018-01-16 10:12   ` jianchao.wang
  2018-01-12  8:12 ` [PATCH 0/2] blk-mq: support physical CPU hotplug Christian Borntraeger
  2018-01-12 18:02 ` Jens Axboe
  3 siblings, 2 replies; 33+ messages in thread
From: Ming Lei @ 2018-01-12  2:53 UTC (permalink / raw)
  To: Jens Axboe, linux-block, Christoph Hellwig
  Cc: Christian Borntraeger, Stefan Haberland, Thomas Gleixner,
	linux-kernel, Christoph Hellwig, Ming Lei

From: Christoph Hellwig <hch@lst.de>

The previous patch assigns interrupt vectors to all possible CPUs, so
now an hctx can be mapped to possible CPUs. This patch applies that fact
to simplify queue mapping & scheduling so that we don't need to handle
CPU hotplug specially for physical CPU plug & unplug. With this
simplification, we work well across physical CPU plug & unplug, which
is a normal use case for VMs at least.

Make sure we allocate blk_mq_ctx structures for all possible CPUs, and
set hctx->numa_node for the possible CPUs that are mapped to this hctx. And
only choose online CPUs for scheduling.

Reported-by: Christian Borntraeger <borntraeger@de.ibm.com>
Tested-by: Christian Borntraeger <borntraeger@de.ibm.com>
Tested-by: Stefan Haberland <sth@linux.vnet.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
(merged the three patches into one because any single one may not work on
its own, and fixed selecting online CPUs for the scheduler)
Signed-off-by: Ming Lei <ming.lei@redhat.com>
---
 block/blk-mq.c | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 8000ba6db07d..ef9beca2d117 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -440,7 +440,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 		blk_queue_exit(q);
 		return ERR_PTR(-EXDEV);
 	}
-	cpu = cpumask_first(alloc_data.hctx->cpumask);
+	cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask);
 	alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
 
 	rq = blk_mq_get_request(q, NULL, op, &alloc_data);
@@ -1323,9 +1323,10 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 	if (--hctx->next_cpu_batch <= 0) {
 		int next_cpu;
 
-		next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
+		next_cpu = cpumask_next_and(hctx->next_cpu, hctx->cpumask,
+				cpu_online_mask);
 		if (next_cpu >= nr_cpu_ids)
-			next_cpu = cpumask_first(hctx->cpumask);
+			next_cpu = cpumask_first_and(hctx->cpumask,cpu_online_mask);
 
 		hctx->next_cpu = next_cpu;
 		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
@@ -2219,16 +2220,11 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
 		INIT_LIST_HEAD(&__ctx->rq_list);
 		__ctx->queue = q;
 
-		/* If the cpu isn't present, the cpu is mapped to first hctx */
-		if (!cpu_present(i))
-			continue;
-
-		hctx = blk_mq_map_queue(q, i);
-
 		/*
 		 * Set local node, IFF we have more than one hw queue. If
 		 * not, we remain on the home node of the device
 		 */
+		hctx = blk_mq_map_queue(q, i);
 		if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
 			hctx->numa_node = local_memory_node(cpu_to_node(i));
 	}
@@ -2285,7 +2281,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 	 *
 	 * If the cpu isn't present, the cpu is mapped to first hctx.
 	 */
-	for_each_present_cpu(i) {
+	for_each_possible_cpu(i) {
 		hctx_idx = q->mq_map[i];
 		/* unmapped hw queue can be remapped after CPU topo changed */
 		if (!set->tags[hctx_idx] &&
@@ -2339,7 +2335,8 @@ static void blk_mq_map_swqueue(struct request_queue *q)
 		/*
 		 * Initialize batch roundrobin counts
 		 */
-		hctx->next_cpu = cpumask_first(hctx->cpumask);
+		hctx->next_cpu = cpumask_first_and(hctx->cpumask,
+				cpu_online_mask);
 		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
 	}
 }
-- 
2.9.5


* Re: [PATCH 0/2] blk-mq: support physical CPU hotplug
  2018-01-12  2:53 [PATCH 0/2] blk-mq: support physical CPU hotplug Ming Lei
  2018-01-12  2:53 ` [PATCH 1/2] genirq/affinity: assign vectors to all possible CPUs Ming Lei
  2018-01-12  2:53 ` [PATCH 2/2] blk-mq: simplify queue mapping & schedule with each possible CPU Ming Lei
@ 2018-01-12  8:12 ` Christian Borntraeger
  2018-01-12 10:47     ` Johannes Thumshirn
  2018-01-12 18:02 ` Jens Axboe
  3 siblings, 1 reply; 33+ messages in thread
From: Christian Borntraeger @ 2018-01-12  8:12 UTC (permalink / raw)
  To: Ming Lei, Jens Axboe, linux-block, Christoph Hellwig
  Cc: Stefan Haberland, Thomas Gleixner, linux-kernel

I think we also need cc stable for this. The bug was introduced with 
 
commit 4b855ad37194f7bdbb200ce7a1c7051fecb56a08 upstream.
("blk-mq: Create hctx for each present CPU")

and that was even backported into 4.12 stable.
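
For reference, the stable tagging being discussed would look something like
the following in the patch trailer (the short hash comes from the commit
cited above; the version annotation is illustrative):

	Fixes: 4b855ad37194 ("blk-mq: Create hctx for each present CPU")
	Cc: <stable@vger.kernel.org> # 4.12+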



On 01/12/2018 03:53 AM, Ming Lei wrote:
> Hi,
> 
> These two patches support physical CPU hotplug, so that blk-mq can scale
> well when a new physical CPU is added or removed; this use case is common
> in the VM world.
> 
> Also this patchset fixes the following warning reported by Christian
> Borntraeger:
> 
> 	https://marc.info/?l=linux-block&m=151092973417143&w=2
> 
> Christoph Hellwig (2):
>   genirq/affinity: assign vectors to all possible CPUs
>   blk-mq: simplify queue mapping & schedule with each possible CPU
> 
>  block/blk-mq.c        | 19 ++++++++-----------
>  kernel/irq/affinity.c | 30 +++++++++++++++---------------
>  2 files changed, 23 insertions(+), 26 deletions(-)
> 


* Re: [PATCH 0/2] blk-mq: support physical CPU hotplug
  2018-01-12  8:12 ` [PATCH 0/2] blk-mq: support physical CPU hotplug Christian Borntraeger
@ 2018-01-12 10:47     ` Johannes Thumshirn
  0 siblings, 0 replies; 33+ messages in thread
From: Johannes Thumshirn @ 2018-01-12 10:47 UTC (permalink / raw)
  To: Christian Borntraeger
  Cc: Ming Lei, Jens Axboe, linux-block, Christoph Hellwig,
	Stefan Haberland, Thomas Gleixner, linux-kernel

On Fri, Jan 12, 2018 at 09:12:24AM +0100, Christian Borntraeger wrote:
> I think we also need cc stable for this. The bug was introduced with 
>  
> commit 4b855ad37194f7bdbb200ce7a1c7051fecb56a08 upstream.
> ("blk-mq: Create hctx for each present CPU")
> 
> and that was even backported into 4.12 stable.

Yes, and Ming (or Jens?), can you please add a Fixes tag as well? It helps
with identifying patches which fix backports.

Thanks,
	Johannes
-- 
Johannes Thumshirn                                          Storage
jthumshirn@suse.de                                +49 911 74053 689
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)
Key fingerprint = EC38 9CAB C2C4 F25D 8600 D0D0 0393 969D 2D76 0850

* Re: [PATCH 0/2] blk-mq: support physical CPU hotplug
  2018-01-12  2:53 [PATCH 0/2] blk-mq: support physical CPU hotplug Ming Lei
                   ` (2 preceding siblings ...)
  2018-01-12  8:12 ` [PATCH 0/2] blk-mq: support physical CPU hotplug Christian Borntraeger
@ 2018-01-12 18:02 ` Jens Axboe
  3 siblings, 0 replies; 33+ messages in thread
From: Jens Axboe @ 2018-01-12 18:02 UTC (permalink / raw)
  To: Ming Lei, linux-block, Christoph Hellwig
  Cc: Christian Borntraeger, Stefan Haberland, Thomas Gleixner, linux-kernel

On 1/11/18 7:53 PM, Ming Lei wrote:
> Hi,
> 
> These two patches support physical CPU hotplug, so that blk-mq can scale
> well when a new physical CPU is added or removed; this use case is common
> in the VM world.
> 
> Also this patchset fixes the following warning reported by Christian
> Borntraeger:
> 
> 	https://marc.info/?l=linux-block&m=151092973417143&w=2
> 
> Christoph Hellwig (2):
>   genirq/affinity: assign vectors to all possible CPUs
>   blk-mq: simplify queue mapping & schedule with each possible CPU

Applied, thanks Ming.

-- 
Jens Axboe


* Re: [PATCH 1/2] genirq/affinity: assign vectors to all possible CPUs
  2018-01-12  2:53 ` [PATCH 1/2] genirq/affinity: assign vectors to all possible CPUs Ming Lei
@ 2018-01-12 19:35   ` Thomas Gleixner
  0 siblings, 0 replies; 33+ messages in thread
From: Thomas Gleixner @ 2018-01-12 19:35 UTC (permalink / raw)
  To: Ming Lei
  Cc: Jens Axboe, linux-block, Christoph Hellwig,
	Christian Borntraeger, Stefan Haberland, linux-kernel,
	Christoph Hellwig

On Fri, 12 Jan 2018, Ming Lei wrote:

> From: Christoph Hellwig <hch@lst.de>
> 
> Currently we assign managed interrupt vectors to all present CPUs.  This
> works fine for systems where we only online/offline CPUs.  But in the case
> of systems that support physical CPU hotplug (or the virtualized version
> of it) this means the additional CPUs declared in the ACPI tables or on
> the command line are not catered for.  To fix this we'd either need to
> introduce new hotplug CPU states just for this case, or we can start
> assigning vectors to possible but not present CPUs.
> 
> Reported-by: Christian Borntraeger <borntraeger@de.ibm.com>
> Tested-by: Christian Borntraeger <borntraeger@de.ibm.com>
> Tested-by: Stefan Haberland <sth@linux.vnet.ibm.com>
> Cc: linux-kernel@vger.kernel.org
> Cc: Thomas Gleixner <tglx@linutronix.de>

FWIW, Acked-by: Thomas Gleixner <tglx@linutronix.de>


* Re: [PATCH 2/2] blk-mq: simplify queue mapping & schedule with each possible CPU
  2018-01-12  2:53 ` [PATCH 2/2] blk-mq: simplify queue mapping & schedule with each possible CPU Ming Lei
@ 2018-01-16 10:00   ` Stefan Haberland
  2018-01-16 10:12   ` jianchao.wang
  1 sibling, 0 replies; 33+ messages in thread
From: Stefan Haberland @ 2018-01-16 10:00 UTC (permalink / raw)
  To: Ming Lei, Christoph Hellwig, Christoph Hellwig
  Cc: Jens Axboe, linux-block, Christian Borntraeger, Thomas Gleixner,
	linux-kernel

Ming or Christoph,

would you mind sending this patch to stable #4.12? Or is the Fixes tag
enough to get this fixed in all related releases?

Regards,
Stefan


* Re: [PATCH 2/2] blk-mq: simplify queue mapping & schedule with each possible CPU
  2018-01-12  2:53 ` [PATCH 2/2] blk-mq: simplify queue mapping & schedule with each possible CPU Ming Lei
  2018-01-16 10:00   ` Stefan Haberland
@ 2018-01-16 10:12   ` jianchao.wang
  2018-01-16 12:10     ` Ming Lei
  1 sibling, 1 reply; 33+ messages in thread
From: jianchao.wang @ 2018-01-16 10:12 UTC (permalink / raw)
  To: Ming Lei, Jens Axboe, linux-block, Christoph Hellwig
  Cc: Christian Borntraeger, Stefan Haberland, Thomas Gleixner,
	linux-kernel, Christoph Hellwig

Hi Ming

On 01/12/2018 10:53 AM, Ming Lei wrote:
> From: Christoph Hellwig <hch@lst.de>
> 
> The previous patch assigns interrupt vectors to all possible CPUs, so
> now an hctx can be mapped to possible CPUs. This patch applies that fact
> to simplify queue mapping & scheduling so that we don't need to handle
> CPU hotplug specially for physical CPU plug & unplug. With this
> simplification, we work well across physical CPU plug & unplug, which
> is a normal use case for VMs at least.
> 
> Make sure we allocate blk_mq_ctx structures for all possible CPUs, and
> set hctx->numa_node for the possible CPUs that are mapped to this hctx. And
> only choose online CPUs for scheduling.
> 
> Reported-by: Christian Borntraeger <borntraeger@de.ibm.com>
> Tested-by: Christian Borntraeger <borntraeger@de.ibm.com>
> Tested-by: Stefan Haberland <sth@linux.vnet.ibm.com>
> Cc: Thomas Gleixner <tglx@linutronix.de>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> (merged the three patches into one because any single one may not work on
> its own, and fixed selecting online CPUs for the scheduler)
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> ---
>  block/blk-mq.c | 19 ++++++++-----------
>  1 file changed, 8 insertions(+), 11 deletions(-)
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 8000ba6db07d..ef9beca2d117 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -440,7 +440,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
>  		blk_queue_exit(q);
>  		return ERR_PTR(-EXDEV);
>  	}
> -	cpu = cpumask_first(alloc_data.hctx->cpumask);
> +	cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask);
>  	alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
>  
>  	rq = blk_mq_get_request(q, NULL, op, &alloc_data);
> @@ -1323,9 +1323,10 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
>  	if (--hctx->next_cpu_batch <= 0) {
>  		int next_cpu;
>  
> -		next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
> +		next_cpu = cpumask_next_and(hctx->next_cpu, hctx->cpumask,
> +				cpu_online_mask);
>  		if (next_cpu >= nr_cpu_ids)
> -			next_cpu = cpumask_first(hctx->cpumask);
> +			next_cpu = cpumask_first_and(hctx->cpumask,cpu_online_mask);

the next_cpu here could be >= nr_cpu_ids when the none of on hctx->cpumask is online.
This could be reproduced on NVMe with a patch that could hold some rqs on ctx->rq_list,
meanwhile a script online and offline the cpus. Then a panic occurred in __queue_work().
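
A minimal sketch of the failure mode described above, assuming the hunk
quoted earlier (the comments are mine, not from the kernel source):

	/*
	 * If every CPU in hctx->cpumask is offline, the AND of the two masks
	 * is empty and cpumask_first_and() returns nr_cpu_ids, which is not
	 * a valid CPU id.
	 */
	int next_cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask);

	if (next_cpu >= nr_cpu_ids) {
		/*
		 * Handing an invalid id to kblockd_schedule_delayed_work_on()
		 * makes __queue_work() pick up a bogus per-cpu pool pointer,
		 * which is the panic described above.
		 */
	}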

Maybe use cpu_possible_mask here; the workers in the pool of an offlined CPU
have been unbound, so it should be OK to queue work on them.

Thanks
Jianchao 


>  		hctx->next_cpu = next_cpu;
>  		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
> @@ -2219,16 +2220,11 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
>  		INIT_LIST_HEAD(&__ctx->rq_list);
>  		__ctx->queue = q;
>  
> -		/* If the cpu isn't present, the cpu is mapped to first hctx */
> -		if (!cpu_present(i))
> -			continue;
> -
> -		hctx = blk_mq_map_queue(q, i);
> -
>  		/*
>  		 * Set local node, IFF we have more than one hw queue. If
>  		 * not, we remain on the home node of the device
>  		 */
> +		hctx = blk_mq_map_queue(q, i);
>  		if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
>  			hctx->numa_node = local_memory_node(cpu_to_node(i));
>  	}
> @@ -2285,7 +2281,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
>  	 *
>  	 * If the cpu isn't present, the cpu is mapped to first hctx.
>  	 */
> -	for_each_present_cpu(i) {
> +	for_each_possible_cpu(i) {
>  		hctx_idx = q->mq_map[i];
>  		/* unmapped hw queue can be remapped after CPU topo changed */
>  		if (!set->tags[hctx_idx] &&
> @@ -2339,7 +2335,8 @@ static void blk_mq_map_swqueue(struct request_queue *q)
>  		/*
>  		 * Initialize batch roundrobin counts
>  		 */
> -		hctx->next_cpu = cpumask_first(hctx->cpumask);
> +		hctx->next_cpu = cpumask_first_and(hctx->cpumask,
> +				cpu_online_mask);
>  		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
>  	}
>  }
> 


* Re: [PATCH 2/2] blk-mq: simplify queue mapping & schedule with each possible CPU
  2018-01-16 10:12   ` jianchao.wang
@ 2018-01-16 12:10     ` Ming Lei
  2018-01-16 14:31       ` jianchao.wang
  0 siblings, 1 reply; 33+ messages in thread
From: Ming Lei @ 2018-01-16 12:10 UTC (permalink / raw)
  To: jianchao.wang
  Cc: Jens Axboe, linux-block, Christoph Hellwig,
	Christian Borntraeger, Stefan Haberland, Thomas Gleixner,
	linux-kernel, Christoph Hellwig

Hi Jianchao,

On Tue, Jan 16, 2018 at 06:12:09PM +0800, jianchao.wang wrote:
> Hi Ming
> 
> On 01/12/2018 10:53 AM, Ming Lei wrote:
> > From: Christoph Hellwig <hch@lst.de>
> > 
> > The previous patch assigns interrupt vectors to all possible CPUs, so
> > now an hctx can be mapped to possible CPUs. This patch applies that fact
> > to simplify queue mapping & scheduling so that we don't need to handle
> > CPU hotplug specially for physical CPU plug & unplug. With this
> > simplification, we work well across physical CPU plug & unplug, which
> > is a normal use case for VMs at least.
> > 
> > Make sure we allocate blk_mq_ctx structures for all possible CPUs, and
> > set hctx->numa_node for the possible CPUs that are mapped to this hctx. And
> > only choose online CPUs for scheduling.
> > 
> > Reported-by: Christian Borntraeger <borntraeger@de.ibm.com>
> > Tested-by: Christian Borntraeger <borntraeger@de.ibm.com>
> > Tested-by: Stefan Haberland <sth@linux.vnet.ibm.com>
> > Cc: Thomas Gleixner <tglx@linutronix.de>
> > Signed-off-by: Christoph Hellwig <hch@lst.de>
> > (merged the three patches into one because any single one may not work on
> > its own, and fixed selecting online CPUs for the scheduler)
> > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> > ---
> >  block/blk-mq.c | 19 ++++++++-----------
> >  1 file changed, 8 insertions(+), 11 deletions(-)
> > 
> > diff --git a/block/blk-mq.c b/block/blk-mq.c
> > index 8000ba6db07d..ef9beca2d117 100644
> > --- a/block/blk-mq.c
> > +++ b/block/blk-mq.c
> > @@ -440,7 +440,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
> >  		blk_queue_exit(q);
> >  		return ERR_PTR(-EXDEV);
> >  	}
> > -	cpu = cpumask_first(alloc_data.hctx->cpumask);
> > +	cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask);
> >  	alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
> >  
> >  	rq = blk_mq_get_request(q, NULL, op, &alloc_data);
> > @@ -1323,9 +1323,10 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
> >  	if (--hctx->next_cpu_batch <= 0) {
> >  		int next_cpu;
> >  
> > -		next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
> > +		next_cpu = cpumask_next_and(hctx->next_cpu, hctx->cpumask,
> > +				cpu_online_mask);
> >  		if (next_cpu >= nr_cpu_ids)
> > -			next_cpu = cpumask_first(hctx->cpumask);
> > +			next_cpu = cpumask_first_and(hctx->cpumask,cpu_online_mask);
> 
> The next_cpu here could be >= nr_cpu_ids when none of the CPUs in hctx->cpumask is online.

That is supposed not to happen, because a storage device (blk-mq hw queue)
generally follows a C/S model, which means a queue only becomes active when
there is an online CPU mapped to it.

But that won't be true for non-block-IO queues, such as HPSA's queues [1]
and network controller RX queues.

[1] https://marc.info/?l=linux-kernel&m=151601867018444&w=2

One thing I am still not sure about (though generic IRQ affinity is supposed
to deal with it) is: if the CPU goes offline just after the IO is submitted,
where does the IRQ controller deliver the interrupt of this hw queue?

> This can be reproduced on NVMe with a patch that holds some rqs on ctx->rq_list
> while a script onlines and offlines the CPUs. A panic then occurs in __queue_work().

That shouldn't happen: when a CPU goes offline, the rqs in ctx->rq_list
are dispatched directly; please see blk_mq_hctx_notify_dead().
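
For context, here is roughly what blk_mq_hctx_notify_dead() looked like in
this era (a sketch from memory, not quoted from this thread): it steals the
dead CPU's pending requests and reruns the hw queue, and that final
blk_mq_run_hw_queue() call is what ends up in blk_mq_hctx_next_cpu() on the
dead-CPU path discussed later in this thread.

	static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
	{
		struct blk_mq_hw_ctx *hctx;
		struct blk_mq_ctx *ctx;
		LIST_HEAD(tmp);

		hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
		ctx = __blk_mq_get_ctx(hctx->queue, cpu);

		/* steal any requests still sitting on the dead CPU's sw queue */
		spin_lock(&ctx->lock);
		if (!list_empty(&ctx->rq_list)) {
			list_splice_init(&ctx->rq_list, &tmp);
			blk_mq_hctx_clear_pending(hctx, ctx);
		}
		spin_unlock(&ctx->lock);

		if (list_empty(&tmp))
			return 0;

		/* move them to the hw queue's dispatch list and kick the queue */
		spin_lock(&hctx->lock);
		list_splice_tail_init(&tmp, &hctx->dispatch);
		spin_unlock(&hctx->lock);

		blk_mq_run_hw_queue(hctx, true);
		return 0;
	}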

> 
> Maybe use cpu_possible_mask here; the workers in the pool of an offlined CPU
> have been unbound, so it should be OK to queue work on them.

That was the original version of this patch, and both Christian and Stefan
reported that the system couldn't boot from DASD that way [2]; I changed it
to AND with cpu_online_mask, and then their systems booted fine.

[2] https://marc.info/?l=linux-kernel&m=151256312722285&w=2

-- 
Ming


* Re: [PATCH 2/2] blk-mq: simplify queue mapping & schedule with each possible CPU
  2018-01-16 12:10     ` Ming Lei
@ 2018-01-16 14:31       ` jianchao.wang
  2018-01-16 15:11         ` jianchao.wang
  2018-01-16 15:32         ` Ming Lei
  0 siblings, 2 replies; 33+ messages in thread
From: jianchao.wang @ 2018-01-16 14:31 UTC (permalink / raw)
  To: Ming Lei
  Cc: Jens Axboe, linux-block, Christoph Hellwig,
	Christian Borntraeger, Stefan Haberland, Thomas Gleixner,
	linux-kernel, Christoph Hellwig

Hi Ming Lei

On 01/16/2018 08:10 PM, Ming Lei wrote:
>>> -		next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
>>> +		next_cpu = cpumask_next_and(hctx->next_cpu, hctx->cpumask,
>>> +				cpu_online_mask);
>>>  		if (next_cpu >= nr_cpu_ids)
>>> -			next_cpu = cpumask_first(hctx->cpumask);
>>> +			next_cpu = cpumask_first_and(hctx->cpumask,cpu_online_mask);
>> The next_cpu here could be >= nr_cpu_ids when none of the CPUs in hctx->cpumask is online.
> That is supposed not to happen, because a storage device (blk-mq hw queue)
> generally follows a C/S model, which means a queue only becomes active when
> there is an online CPU mapped to it.
> 
> But that won't be true for non-block-IO queues, such as HPSA's queues [1]
> and network controller RX queues.
> 
> [1] https://marc.info/?l=linux-kernel&m=151601867018444&w=2
> 
> One thing I am still not sure about (though generic IRQ affinity is supposed
> to deal with it) is: if the CPU goes offline just after the IO is submitted,
> where does the IRQ controller deliver the interrupt of this hw queue?
> 
>> This can be reproduced on NVMe with a patch that holds some rqs on ctx->rq_list
>> while a script onlines and offlines the CPUs. A panic then occurs in __queue_work().
> That shouldn't happen: when a CPU goes offline, the rqs in ctx->rq_list
> are dispatched directly; please see blk_mq_hctx_notify_dead().

Yes, I know. blk_mq_hctx_notify_dead() will be invoked after the cpu has been set offline.
Please refer to the following diagram.

CPU A                      CPU T
                 kick  
  _cpu_down()     ->       cpuhp_thread_fun (cpuhpT kthread)
                               AP_ACTIVE           (clear cpu_active_mask)
                                 |
                                 v
                               AP_WORKQUEUE_ONLINE (unbind workers)
                                 |
                                 v
                               TEARDOWN_CPU        (stop_machine)
                                    ,                   | execute
                                     \_ _ _ _ _ _       v
                                        preempt  V  take_cpu_down ( migration kthread)
                                                    set_cpu_online(smp_processor_id(), false) (__cpu_disable)  ------> Here !!!
                                                    TEARDOWN_CPU
                                                        |
             cpuhpT kthead is    |                      v
             migrated away       ,                    AP_SCHED_STARTING (migrate_tasks)
                 _ _ _ _ _ _ _ _/                       |
                V                                       v
              CPU X                                   AP_OFFLINE
                                                        
                                                        |
                                                        ,
                                             _ _ _ _ _ /
                                            V
                                      do_idle (idle task)
 <_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ cpuhp_report_idle_dead
                         complete st->done_down
           __cpu_die (cpuhpT kthread, teardown_cpu) 

 AP_OFFLINE
   |
   v
 BRINGUP_CPU
   |
   v
 BLK_MQ_DEAD    -------> Here !!!
   |
   v
 OFFLINE

The cpu has already been cleared from cpu_online_mask when blk_mq_hctx_notify_dead is invoked.
If the device is an NVMe device that has only one cpu mapped to the hctx,
cpumask_first_and(hctx->cpumask, cpu_online_mask) will return a bad value.

I even got a backtrace showing that the panic occurred in blk_mq_hctx_notify_dead -> kblockd_schedule_delayed_work_on -> __queue_work.
Kdump doesn't work well on my machine, so I cannot share the backtrace here, which is really sad.
I even added a BUG_ON as follows, and it could be triggered.
>>>>
	if (next_cpu >= nr_cpu_ids)
		next_cpu = cpumask_first_and(hctx->cpumask,cpu_online_mask);
	BUG_ON(next_cpu >= nr_cpu_ids);
>>>>

Thanks
Jianchao
> 
>> Maybe use cpu_possible_mask here; the workers in the pool of an offlined CPU
>> have been unbound, so it should be OK to queue work on them.


* Re: [PATCH 2/2] blk-mq: simplify queue mapping & schedule with each possible CPU
  2018-01-16 14:31       ` jianchao.wang
@ 2018-01-16 15:11         ` jianchao.wang
  2018-01-16 15:32         ` Ming Lei
  1 sibling, 0 replies; 33+ messages in thread
From: jianchao.wang @ 2018-01-16 15:11 UTC (permalink / raw)
  To: Ming Lei
  Cc: Jens Axboe, linux-block, Christoph Hellwig,
	Christian Borntraeger, Stefan Haberland, Thomas Gleixner,
	linux-kernel, Christoph Hellwig

[-- Attachment #1: Type: text/plain, Size: 4852 bytes --]

Hi Ming

Thanks for your kind response and guidance.

On 01/16/2018 10:31 PM, jianchao.wang wrote:
> Hi Ming Lei
> 
> On 01/16/2018 08:10 PM, Ming Lei wrote:
>>>> -		next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
>>>> +		next_cpu = cpumask_next_and(hctx->next_cpu, hctx->cpumask,
>>>> +				cpu_online_mask);
>>>>  		if (next_cpu >= nr_cpu_ids)
>>>> -			next_cpu = cpumask_first(hctx->cpumask);
>>>> +			next_cpu = cpumask_first_and(hctx->cpumask,cpu_online_mask);
>>> The next_cpu here could be >= nr_cpu_ids when none of the CPUs in hctx->cpumask is online.
>> That is supposed not to happen, because a storage device (blk-mq hw queue)
>> generally follows a C/S model, which means a queue only becomes active when
>> there is an online CPU mapped to it.
>>
>> But that won't be true for non-block-IO queues, such as HPSA's queues [1]
>> and network controller RX queues.
>>
>> [1] https://marc.info/?l=linux-kernel&m=151601867018444&w=2
>>
>> One thing I am still not sure about (though generic IRQ affinity is supposed
>> to deal with it) is: if the CPU goes offline just after the IO is submitted,
>> where does the IRQ controller deliver the interrupt of this hw queue?
>>
>>> This can be reproduced on NVMe with a patch that holds some rqs on ctx->rq_list
>>> while a script onlines and offlines the CPUs. A panic then occurs in __queue_work().
>> That shouldn't happen: when a CPU goes offline, the rqs in ctx->rq_list
>> are dispatched directly; please see blk_mq_hctx_notify_dead().
> 
> Yes, I know. blk_mq_hctx_notify_dead() will be invoked after the cpu has been set offline.
> Please refer to the following diagram.
> 
> CPU A                      CPU T
>                  kick  
>   _cpu_down()     ->       cpuhp_thread_fun (cpuhpT kthread)
>                                AP_ACTIVE           (clear cpu_active_mask)
>                                  |
>                                  v
>                                AP_WORKQUEUE_ONLINE (unbind workers)
>                                  |
>                                  v
>                                TEARDOWN_CPU        (stop_machine)
>                                     ,                   | execute
>                                      \_ _ _ _ _ _       v
>                                         preempt  V  take_cpu_down ( migration kthread)
>                                                     set_cpu_online(smp_processor_id(), false) (__cpu_disable)  ------> Here !!!
>                                                     TEARDOWN_CPU
>                                                         |
>              cpuhpT kthead is    |                      v
>              migrated away       ,                    AP_SCHED_STARTING (migrate_tasks)
>                  _ _ _ _ _ _ _ _/                       |
>                 V                                       v
>               CPU X                                   AP_OFFLINE
>                                                         
>                                                         |
>                                                         ,
>                                              _ _ _ _ _ /
>                                             V
>                                       do_idle (idle task)
>  <_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ cpuhp_report_idle_dead
>                          complete st->done_down
>            __cpu_die (cpuhpT kthread, teardown_cpu) 
> 
>  AP_OFFLINE
>    |
>    v
>  BRINGUP_CPU
>    |
>    v
>  BLK_MQ_DEAD    -------> Here !!!
>    |
>    v
>  OFFLINE
> 
> The cpu has already been cleared from cpu_online_mask when blk_mq_hctx_notify_dead is invoked.
> If the device is an NVMe device that has only one cpu mapped to the hctx,
> cpumask_first_and(hctx->cpumask, cpu_online_mask) will return a bad value.
> 
> I even got a backtrace showing that the panic occurred in blk_mq_hctx_notify_dead -> kblockd_schedule_delayed_work_on -> __queue_work.
> Kdump doesn't work well on my machine, so I cannot share the backtrace here, which is really sad.
> I even added a BUG_ON as follows, and it could be triggered.
>>>>>
> 	if (next_cpu >= nr_cpu_ids)
> 		next_cpu = cpumask_first_and(hctx->cpumask,cpu_online_mask);
> 	BUG_ON(next_cpu >= nr_cpu_ids);
>>>>>
> 
I found two pictures taken when I did the test and share them here.
Please refer to them.
PANIC.jpg shows the panic scenario.
BUG_ON.jpg shows the scenario where the BUG_ON added above was triggered.

Thanks
Jianchao

>>
>>> Maybe use cpu_possible_mask here; the workers in the pool of an offlined CPU
>>> have been unbound, so it should be OK to queue work on them.
> 

[-- Attachment #2: BUG_ON.jpg --]
[-- Type: image/jpeg, Size: 176796 bytes --]

[-- Attachment #3: Panic.jpg --]
[-- Type: image/jpeg, Size: 176502 bytes --]


* Re: [PATCH 2/2] blk-mq: simplify queue mapping & schedule with each possible CPU
  2018-01-16 14:31       ` jianchao.wang
  2018-01-16 15:11         ` jianchao.wang
@ 2018-01-16 15:32         ` Ming Lei
  2018-01-17  2:56           ` jianchao.wang
  1 sibling, 1 reply; 33+ messages in thread
From: Ming Lei @ 2018-01-16 15:32 UTC (permalink / raw)
  To: jianchao.wang
  Cc: Jens Axboe, linux-block, Christoph Hellwig,
	Christian Borntraeger, Stefan Haberland, Thomas Gleixner,
	linux-kernel, Christoph Hellwig

On Tue, Jan 16, 2018 at 10:31:42PM +0800, jianchao.wang wrote:
> Hi Ming Lei
> 
> On 01/16/2018 08:10 PM, Ming Lei wrote:
> >>> -		next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
> >>> +		next_cpu = cpumask_next_and(hctx->next_cpu, hctx->cpumask,
> >>> +				cpu_online_mask);
> >>>  		if (next_cpu >= nr_cpu_ids)
> >>> -			next_cpu = cpumask_first(hctx->cpumask);
> >>> +			next_cpu = cpumask_first_and(hctx->cpumask,cpu_online_mask);
> >> The next_cpu here could be >= nr_cpu_ids when none of the CPUs in hctx->cpumask is online.
> > That is supposed not to happen, because a storage device (blk-mq hw queue)
> > generally follows a C/S model, which means a queue only becomes active when
> > there is an online CPU mapped to it.
> > 
> > But that won't be true for non-block-IO queues, such as HPSA's queues [1]
> > and network controller RX queues.
> > 
> > [1] https://marc.info/?l=linux-kernel&m=151601867018444&w=2
> > 
> > One thing I am still not sure about (though generic IRQ affinity is supposed
> > to deal with it) is: if the CPU goes offline just after the IO is submitted,
> > where does the IRQ controller deliver the interrupt of this hw queue?
> > 
> >> This can be reproduced on NVMe with a patch that holds some rqs on ctx->rq_list
> >> while a script onlines and offlines the CPUs. A panic then occurs in __queue_work().
> > That shouldn't happen: when a CPU goes offline, the rqs in ctx->rq_list
> > are dispatched directly; please see blk_mq_hctx_notify_dead().
> 
> Yes, I know. blk_mq_hctx_notify_dead() will be invoked after the cpu has been set offline.
> Please refer to the following diagram.
> 
> CPU A                      CPU T
>                  kick  
>   _cpu_down()     ->       cpuhp_thread_fun (cpuhpT kthread)
>                                AP_ACTIVE           (clear cpu_active_mask)
>                                  |
>                                  v
>                                AP_WORKQUEUE_ONLINE (unbind workers)
>                                  |
>                                  v
>                                TEARDOWN_CPU        (stop_machine)
>                                     ,                   | execute
>                                      \_ _ _ _ _ _       v
>                                         preempt  V  take_cpu_down ( migration kthread)
>                                                     set_cpu_online(smp_processor_id(), false) (__cpu_disable)  ------> Here !!!
>                                                     TEARDOWN_CPU
>                                                         |
>              cpuhpT kthead is    |                      v
>              migrated away       ,                    AP_SCHED_STARTING (migrate_tasks)
>                  _ _ _ _ _ _ _ _/                       |
>                 V                                       v
>               CPU X                                   AP_OFFLINE
>                                                         
>                                                         |
>                                                         ,
>                                              _ _ _ _ _ /
>                                             V
>                                       do_idle (idle task)
>  <_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ cpuhp_report_idle_dead
>                          complete st->done_down
>            __cpu_die (cpuhpT kthread, teardown_cpu) 
> 
>  AP_OFFLINE
>    |
>    v
>  BRINGUP_CPU
>    |
>    v
>  BLK_MQ_DEAD    -------> Here !!!
>    |
>    v
>  OFFLINE
> 
> The cpu has already been cleared from cpu_online_mask when blk_mq_hctx_notify_dead is invoked.
> If the device is an NVMe device that has only one cpu mapped to the hctx,
> cpumask_first_and(hctx->cpumask, cpu_online_mask) will return a bad value.

Hi Jianchao,

OK, I got it, and it should be the only corner case, in which all CPUs
mapped to this hctx become offline; I believe the following patch should
address this case. Could you give it a test?

---
diff --git a/block/blk-mq.c b/block/blk-mq.c
index c376d1b6309a..23f0f3ddffcf 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1416,21 +1416,44 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
  */
 static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 {
+	bool tried = false;
+
 	if (hctx->queue->nr_hw_queues == 1)
 		return WORK_CPU_UNBOUND;
 
 	if (--hctx->next_cpu_batch <= 0) {
 		int next_cpu;
+select_cpu:
 
 		next_cpu = cpumask_next_and(hctx->next_cpu, hctx->cpumask,
 				cpu_online_mask);
 		if (next_cpu >= nr_cpu_ids)
 			next_cpu = cpumask_first_and(hctx->cpumask,cpu_online_mask);
 
-		hctx->next_cpu = next_cpu;
+		/*
+		 * No online CPU can be found here when running from
+		 * blk_mq_hctx_notify_dead(), so make sure hctx->next_cpu
+		 * is set correctly.
+		 */
+		if (next_cpu >= nr_cpu_ids)
+			hctx->next_cpu = cpumask_first_and(hctx->cpumask,
+					cpu_possible_mask);
+		else
+			hctx->next_cpu = next_cpu;
 		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
 	}
 
+	/*
+	 * Do unbound schedule if we can't find a online CPU for this hctx,
+	 * and it should happen only if hctx->next_cpu is becoming DEAD.
+	 */
+	if (!cpu_online(hctx->next_cpu)) {
+		if (!tried) {
+			tried = true;
+			goto select_cpu;
+		}
+		return WORK_CPU_UNBOUND;
+	}
 	return hctx->next_cpu;
 }
 

Thanks,
Ming


* Re: [PATCH 2/2] blk-mq: simplify queue mapping & schedule with each possible CPU
  2018-01-16 15:32         ` Ming Lei
@ 2018-01-17  2:56           ` jianchao.wang
  2018-01-17  3:52             ` Ming Lei
  0 siblings, 1 reply; 33+ messages in thread
From: jianchao.wang @ 2018-01-17  2:56 UTC (permalink / raw)
  To: Ming Lei
  Cc: Jens Axboe, linux-block, Christoph Hellwig,
	Christian Borntraeger, Stefan Haberland, Thomas Gleixner,
	linux-kernel, Christoph Hellwig

Hi Ming

Thanks for your patch and kind response.

On 01/16/2018 11:32 PM, Ming Lei wrote:
> OK, I got it, and it should be the only corner case, in which all CPUs
> mapped to this hctx become offline; I believe the following patch should
> address this case. Could you give it a test?
> 
> ---
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index c376d1b6309a..23f0f3ddffcf 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -1416,21 +1416,44 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
>   */
>  static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
>  {
> +	bool tried = false;
> +
>  	if (hctx->queue->nr_hw_queues == 1)
>  		return WORK_CPU_UNBOUND;
>  
>  	if (--hctx->next_cpu_batch <= 0) {
>  		int next_cpu;
> +select_cpu:
>  
>  		next_cpu = cpumask_next_and(hctx->next_cpu, hctx->cpumask,
>  				cpu_online_mask);
>  		if (next_cpu >= nr_cpu_ids)
>  			next_cpu = cpumask_first_and(hctx->cpumask,cpu_online_mask);
>  
> -		hctx->next_cpu = next_cpu;
> +		/*
> +		 * No online CPU can be found here when running from
> +		 * blk_mq_hctx_notify_dead(), so make sure hctx->next_cpu
> +		 * is set correctly.
> +		 */
> +		if (next_cpu >= nr_cpu_ids)
> +			hctx->next_cpu = cpumask_first_and(hctx->cpumask,
> +					cpu_possible_mask);
> +		else
> +			hctx->next_cpu = next_cpu;
>  		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
>  	}
>  
> +	/*
> +	 * Do unbound schedule if we can't find a online CPU for this hctx,
> +	 * and it should happen only if hctx->next_cpu is becoming DEAD.
> +	 */
> +	if (!cpu_online(hctx->next_cpu)) {
> +		if (!tried) {
> +			tried = true;
> +			goto select_cpu;
> +		}
> +		return WORK_CPU_UNBOUND;
> +	}
>  	return hctx->next_cpu;
>  }

I have tested this patch. The panic was gone, but I got the following:

[  231.674464] WARNING: CPU: 0 PID: 263 at /home/will/u04/source_code/linux-block/block/blk-mq.c:1315 __blk_mq_run_hw_queue+0x92/0xa0
[  231.674466] Modules linked in: .....
[  231.674494] CPU: 0 PID: 263 Comm: kworker/2:1H Not tainted 4.15.0-rc7+ #12
[  231.674495] Hardware name: LENOVO 10MLS0E339/3106, BIOS M1AKT22A 06/27/2017
[  231.674496] Workqueue: kblockd blk_mq_run_work_fn
[  231.674498] RIP: 0010:__blk_mq_run_hw_queue+0x92/0xa0
[  231.674499] RSP: 0018:ffffa9c801fcfe60 EFLAGS: 00010202
[  231.674500] RAX: 0000000000000001 RBX: ffff9c7c90231400 RCX: 0000000000000000
[  231.674500] RDX: ffff9c7c9255b0f8 RSI: 0000000000000000 RDI: ffff9c7c90231400
[  231.674500] RBP: ffff9c7ca2ca2140 R08: 0000000000000000 R09: 0000000000000000
[  231.674501] R10: 00000000000003cb R11: 0000000000000000 R12: ffff9c7ca2ca8200
[  231.674501] R13: 0000000000000000 R14: 0ffff9c7ca2ca820 R15: ffff9c7c3df25240
[  231.674502] FS:  0000000000000000(0000) GS:ffff9c7ca2c00000(0000) knlGS:0000000000000000
[  231.674502] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  231.674503] CR2: 0000000001727008 CR3: 0000000336409003 CR4: 00000000003606f0
[  231.674504] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  231.674504] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  231.674504] Call Trace:
[  231.674509]  process_one_work+0x14b/0x420
[  231.674510]  worker_thread+0x47/0x420
[  231.674512]  kthread+0xf5/0x130
[  231.674514]  ? process_one_work+0x420/0x420
[  231.674515]  ? kthread_associate_blkcg+0xe0/0xe0
[  231.674517]  ? do_group_exit+0x3a/0xa0
[  231.674518]  ret_from_fork+0x24/0x30
[  231.674520] Code: ff 48 89 df e8 e0 6b 00 00 8b 74 24 04 48 89 df e8 a4 fc ff ff 48 8b 44 24 08 65 48 33 04 25 28 00 00 00 75 0e 48 83 c4 10 5b c3 <0f> ff eb b7 0f ff eb c1 e8 e1 02 c6 ff 90 0f 1f 44 00 00 48 8b 
[  231.674537] ---[ end trace cc2de957e0e0fed4 ]---

It is here.
__blk_mq_run_hw_queue()
....
    WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
        cpu_online(hctx->next_cpu));
....

To eliminate this risk totally, we could have blk_mq_hctx_next_cpu return the cpu even if it is offlined, and change the cpu_online check above to cpu_active.
The kworkers of the per-cpu pool must have been migrated back by the time the cpu is set active.
But there seem to be issues with DASD, as your previous comment noted:
>>>>
That was the original version of this patch, and both Christian and Stefan
reported that the system couldn't boot from DASD that way [2]; I changed it
to AND with cpu_online_mask, and then their systems booted fine.
>>>>
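
A sketch of the cpu_active variant suggested above (hypothetical and
untested; it simply swaps the predicate in the WARN_ON quoted earlier):

	/*
	 * Once a CPU drops out of cpu_active_mask its per-cpu kworkers are
	 * unbound and queued work can run anywhere, so only warn when the
	 * target CPU is still active (its workers still bound to it).
	 */
	WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
		cpu_active(hctx->next_cpu));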

On the other hand, there is also a risk in

@@ -440,7 +440,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 		blk_queue_exit(q);
 		return ERR_PTR(-EXDEV);
 	}
-	cpu = cpumask_first(alloc_data.hctx->cpumask);
+	cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask);
 	alloc_data.ctx = __blk_mq_get_ctx(q, cpu);

What if the cpus in alloc_data.hctx->cpumask are all offlined?
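
One defensive option for that case (a sketch, not part of the posted patch)
would be to fail the allocation when the intersection is empty, reusing the
error path already present in this function:

	cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask);
	if (cpu >= nr_cpu_ids) {
		/* all CPUs mapped to this hctx are offline: give up */
		blk_queue_exit(q);
		return ERR_PTR(-EXDEV);
	}
	alloc_data.ctx = __blk_mq_get_ctx(q, cpu);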

Thanks
Jianchao


* Re: [PATCH 2/2] blk-mq: simplify queue mapping & schedule with each possible CPU
  2018-01-17  2:56           ` jianchao.wang
@ 2018-01-17  3:52             ` Ming Lei
  2018-01-17  5:24               ` jianchao.wang
  0 siblings, 1 reply; 33+ messages in thread
From: Ming Lei @ 2018-01-17  3:52 UTC (permalink / raw)
  To: jianchao.wang
  Cc: Jens Axboe, linux-block, Christoph Hellwig,
	Christian Borntraeger, Stefan Haberland, Thomas Gleixner,
	linux-kernel, Christoph Hellwig

Hi Jianchao,

On Wed, Jan 17, 2018 at 10:56:13AM +0800, jianchao.wang wrote:
> Hi Ming
> 
> Thanks for your patch and kind response.

You are welcome!

> 
> On 01/16/2018 11:32 PM, Ming Lei wrote:
> > OK, I got it, and it should be the only corner case, in which all CPUs
> > mapped to this hctx become offline; I believe the following patch should
> > address this case. Could you give it a test?
> > 
> > ---
> > diff --git a/block/blk-mq.c b/block/blk-mq.c
> > index c376d1b6309a..23f0f3ddffcf 100644
> > --- a/block/blk-mq.c
> > +++ b/block/blk-mq.c
> > @@ -1416,21 +1416,44 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
> >   */
> >  static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
> >  {
> > +	bool tried = false;
> > +
> >  	if (hctx->queue->nr_hw_queues == 1)
> >  		return WORK_CPU_UNBOUND;
> >  
> >  	if (--hctx->next_cpu_batch <= 0) {
> >  		int next_cpu;
> > +select_cpu:
> >  
> >  		next_cpu = cpumask_next_and(hctx->next_cpu, hctx->cpumask,
> >  				cpu_online_mask);
> >  		if (next_cpu >= nr_cpu_ids)
> >  			next_cpu = cpumask_first_and(hctx->cpumask,cpu_online_mask);
> >  
> > -		hctx->next_cpu = next_cpu;
> > +		/*
> > +		 * No online CPU can be found here when running from
> > +		 * blk_mq_hctx_notify_dead(), so make sure hctx->next_cpu
> > +		 * is set correctly.
> > +		 */
> > +		if (next_cpu >= nr_cpu_ids)
> > +			hctx->next_cpu = cpumask_first_and(hctx->cpumask,
> > +					cpu_possible_mask);
> > +		else
> > +			hctx->next_cpu = next_cpu;
> >  		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
> >  	}
> >  
> > +	/*
> > +	 * Do unbound schedule if we can't find a online CPU for this hctx,
> > +	 * and it should happen only if hctx->next_cpu is becoming DEAD.
> > +	 */
> > +	if (!cpu_online(hctx->next_cpu)) {
> > +		if (!tried) {
> > +			tried = true;
> > +			goto select_cpu;
> > +		}
> > +		return WORK_CPU_UNBOUND;
> > +	}
> >  	return hctx->next_cpu;
> >  }
> 
> I have tested this patch. The panic was gone, but I got the following:
> 
> [  231.674464] WARNING: CPU: 0 PID: 263 at /home/will/u04/source_code/linux-block/block/blk-mq.c:1315 __blk_mq_run_hw_queue+0x92/0xa0
>

......

> It is here.
> __blk_mq_run_hw_queue()
> ....
>     WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
>         cpu_online(hctx->next_cpu));

I think this warning is triggered after the CPU in hctx->next_cpu becomes
online again, and it should be dealt with by the following trick. This
patch is against the previous one; please test it and see if the warning
goes away.

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 23f0f3ddffcf..0620ccb65e4e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1452,6 +1452,9 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
 			tried = true;
 			goto select_cpu;
 		}
+
+		/* handle after this CPU of hctx->next_cpu becomes online again */
+		hctx->next_cpu_batch = 1;
 		return WORK_CPU_UNBOUND;
 	}
 	return hctx->next_cpu;

> ....
> 
> To eliminate this risk totally, we could have blk_mq_hctx_next_cpu return the cpu even if it is offlined, and change the cpu_online check above to cpu_active.
> The kworkers of the per-cpu pool must have been migrated back by the time the cpu is set active.
> But there seem to be issues with DASD, as your previous comment noted.

Yes, we can't break DASD.

> >>>>
> That was the original version of this patch, and both Christian and Stefan
> reported that the system couldn't boot from DASD that way [2]; I changed it
> to AND with cpu_online_mask, and then their systems booted fine.
> >>>>
> 
> On the other hand, there is also a risk in
> 
> @@ -440,7 +440,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
>  		blk_queue_exit(q);
>  		return ERR_PTR(-EXDEV);
>  	}
> -	cpu = cpumask_first(alloc_data.hctx->cpumask);
> +	cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask);
>  	alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
> 
> What if the cpus in alloc_data.hctx->cpumask are all offlined?

This one is crazy, and is used by NVMe only. It should be fine if
the passed 'hctx_idx' was retrieved on the current running CPU, the
way blk_mq_map_queue() does it. But if not, bad things may happen.
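
For reference, blk_mq_map_queue() around this version was essentially a
per-CPU table lookup (sketched from memory, not quoted from this thread):

	static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
							     int cpu)
	{
		return q->queue_hw_ctx[q->mq_map[cpu]];
	}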

Thanks,
Ming


* Re: [PATCH 2/2] blk-mq: simplify queue mapping & schedule with each possible CPU
  2018-01-17  3:52             ` Ming Lei
@ 2018-01-17  5:24               ` jianchao.wang
  2018-01-17  6:22                   ` Ming Lei
  0 siblings, 1 reply; 33+ messages in thread
From: jianchao.wang @ 2018-01-17  5:24 UTC (permalink / raw)
  To: Ming Lei
  Cc: Jens Axboe, linux-block, Christoph Hellwig,
	Christian Borntraeger, Stefan Haberland, Thomas Gleixner,
	linux-kernel, Christoph Hellwig

Hi Ming

Thanks for your kind response.

On 01/17/2018 11:52 AM, Ming Lei wrote:
>> It is here.
>> __blk_mq_run_hw_queue()
>> ....
>>     WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
>>         cpu_online(hctx->next_cpu));
> I think this warning is triggered after the CPU in hctx->next_cpu becomes
> online again, and it should be dealt with by the following trick. This
> patch is against the previous one; please test it and see if the warning
> goes away.
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 23f0f3ddffcf..0620ccb65e4e 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -1452,6 +1452,9 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
>  			tried = true;
>  			goto select_cpu;
>  		}
> +
> +		/* handle after this CPU of hctx->next_cpu becomes online again */
> +		hctx->next_cpu_batch = 1;
>  		return WORK_CPU_UNBOUND;
>  	}
>  	return hctx->next_cpu;
> 

The WARNING can still be triggered.

[  282.194532] WARNING: CPU: 0 PID: 222 at /home/will/u04/source_code/linux-block/block/blk-mq.c:1315 __blk_mq_run_hw_queue+0x92/0xa0
[  282.194534] Modules linked in: ....
[  282.194561] CPU: 0 PID: 222 Comm: kworker/2:1H Tainted: G        W        4.15.0-rc7+ #17
[  282.194562] Hardware name: LENOVO 10MLS0E339/3106, BIOS M1AKT22A 06/27/2017
[  282.194563] Workqueue: kblockd blk_mq_run_work_fn
[  282.194565] RIP: 0010:__blk_mq_run_hw_queue+0x92/0xa0
[  282.194565] RSP: 0018:ffff96c50223be60 EFLAGS: 00010202
[  282.194566] RAX: 0000000000000001 RBX: ffff8a7110233800 RCX: 0000000000000000
[  282.194567] RDX: ffff8a711255f2d0 RSI: 0000000000000000 RDI: ffff8a7110233800
[  282.194567] RBP: ffff8a7122ca2140 R08: 0000000000000000 R09: 0000000000000000
[  282.194568] R10: 0000000000000263 R11: 0000000000000000 R12: ffff8a7122ca8200
[  282.194568] R13: 0000000000000000 R14: 0ffff8a7122ca820 R15: ffff8a70bcef0780
[  282.194569] FS:  0000000000000000(0000) GS:ffff8a7122c00000(0000) knlGS:0000000000000000
[  282.194569] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  282.194570] CR2: 0000555e14d89d60 CR3: 000000003c809002 CR4: 00000000003606f0
[  282.194571] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  282.194571] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  282.194571] Call Trace:
[  282.194576]  process_one_work+0x14b/0x420
[  282.194577]  worker_thread+0x47/0x420
[  282.194579]  kthread+0xf5/0x130
[  282.194581]  ? process_one_work+0x420/0x420
[  282.194581]  ? kthread_associate_blkcg+0xe0/0xe0
[  282.194583]  ret_from_fork+0x24/0x30
[  282.194585] Code: ff 48 89 df e8 e0 6b 00 00 8b 74 24 04 48 89 df e8 a4 fc ff ff 48 8b 44 24 08 65 48 33 04 25 28 00 00 00 75 0e 48 83 c4 10 5b c3 <0f> ff eb b7 0f ff eb c1 e8 e1 02 c6 ff 90 0f 1f 44 00 00 48 8b 
[  282.194601] ---[ end trace c104f0a63d3df31b ]---

>> ....
>>
>> To eliminate this risk totally, we could blk_mq_hctx_next_cpu return the cpu  even if the cpu is offlined and modify the cpu_online above to cpu_active.
>> The kworkers of the per-cpu pool must have be migrated back when the cpu is set active.
>> But there seems to be issues in DASD as your previous comment.
> Yes, we can't break DASD.
> 
>> That is the original version of this patch, and both Christian and Stefan
>> reported that system can't boot from DASD in this way[2], and I changed
>> to AND with cpu_online_mask, then their system can boot well
>> On the other hand, there is also risk in 
>>
>> @@ -440,7 +440,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
>>  		blk_queue_exit(q);
>>  		return ERR_PTR(-EXDEV);
>>  	}
>> -	cpu = cpumask_first(alloc_data.hctx->cpumask);
>> +	cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask);
>>  	alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
>>
>> what if the cpus in alloc_data.hctx->cpumask are all offlined ?
> This one is crazy, and is used by NVMe only, it should be fine if
> the passed 'hctx_idx' is retrieved by the current running CPU, such
> as the way of blk_mq_map_queue(). But if not, bad thing may happen.
Even if it is retrieved by the currently running CPU, the task could still be migrated away when that CPU is offlined.

Thanks
Jianchao

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] blk-mq: simplify queue mapping & schedule with each possisble CPU
  2018-01-17  5:24               ` jianchao.wang
@ 2018-01-17  6:22                   ` Ming Lei
  0 siblings, 0 replies; 33+ messages in thread
From: Ming Lei @ 2018-01-17  6:22 UTC (permalink / raw)
  To: jianchao.wang
  Cc: Jens Axboe, linux-block, Christoph Hellwig,
	Christian Borntraeger, Stefan Haberland, Thomas Gleixner,
	linux-kernel, Christoph Hellwig, James Smart, Keith Busch,
	Sagi Grimberg, linux-nvme

Hi Jianchao,

On Wed, Jan 17, 2018 at 01:24:23PM +0800, jianchao.wang wrote:
> Hi ming
> 
> Thanks for your kindly response.
> 
> On 01/17/2018 11:52 AM, Ming Lei wrote:
> >> It is here.
> >> __blk_mq_run_hw_queue()
> >> ....
> >>     WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
> >>         cpu_online(hctx->next_cpu));
> > I think this warning is triggered after the CPU of hctx->next_cpu becomes
> > online again, and it should have been dealt with by the following trick,
> > and this patch is against the previous one, please test it and see if
> > the warning can be fixed.
> > 
> > diff --git a/block/blk-mq.c b/block/blk-mq.c
> > index 23f0f3ddffcf..0620ccb65e4e 100644
> > --- a/block/blk-mq.c
> > +++ b/block/blk-mq.c
> > @@ -1452,6 +1452,9 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
> >  			tried = true;
> >  			goto select_cpu;
> >  		}
> > +
> > +		/* handle after this CPU of hctx->next_cpu becomes online again */
> > +		hctx->next_cpu_batch = 1;
> >  		return WORK_CPU_UNBOUND;
> >  	}
> >  	return hctx->next_cpu;
> > 
> 
> The WARNING still could be triggered.
> 
> [  282.194532] WARNING: CPU: 0 PID: 222 at /home/will/u04/source_code/linux-block/block/blk-mq.c:1315 __blk_mq_run_hw_queue+0x92/0xa0
> [  282.194534] Modules linked in: ....
> [  282.194561] CPU: 0 PID: 222 Comm: kworker/2:1H Tainted: G        W        4.15.0-rc7+ #17
> 

This warning can't be removed completely; for example, the CPU chosen
by blk_mq_hctx_next_cpu(hctx) can be brought online again just after the
following call returns and before __blk_mq_run_hw_queue() is scheduled
to run.

	kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work, msecs_to_jiffies(msecs))

Just curious, how did you trigger this issue? Is it triggered in a CPU
hotplug stress test, or in a normal use case?

> >> ....
> >>
> >> To eliminate this risk totally, we could blk_mq_hctx_next_cpu return the cpu  even if the cpu is offlined and modify the cpu_online above to cpu_active.
> >> The kworkers of the per-cpu pool must have be migrated back when the cpu is set active.
> >> But there seems to be issues in DASD as your previous comment.
> > Yes, we can't break DASD.
> > 
> >> That is the original version of this patch, and both Christian and Stefan
> >> reported that system can't boot from DASD in this way[2], and I changed
> >> to AND with cpu_online_mask, then their system can boot well
> >> On the other hand, there is also risk in 
> >>
> >> @@ -440,7 +440,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
> >>  		blk_queue_exit(q);
> >>  		return ERR_PTR(-EXDEV);
> >>  	}
> >> -	cpu = cpumask_first(alloc_data.hctx->cpumask);
> >> +	cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask);
> >>  	alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
> >>
> >> what if the cpus in alloc_data.hctx->cpumask are all offlined ?
> > This one is crazy, and is used by NVMe only, it should be fine if
> > the passed 'hctx_idx' is retrieved by the current running CPU, such
> > as the way of blk_mq_map_queue(). But if not, bad thing may happen.
> Even if retrieved by current running cpu, the task still could be migrated away when the cpu is offlined.
> 

I just checked the NVMe code; it looks like only nvmf_connect_io_queue()
passes a specific 'qid' and calls blk_mq_alloc_request_hctx(). For the
others, NVME_QID_ANY is passed, and blk_mq_alloc_request() is called in
nvme_alloc_request().
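
From memory, the dispatch in nvme_alloc_request() looks roughly like this
(paraphrased, not a verbatim copy of the source):

	if (qid == NVME_QID_ANY)
		req = blk_mq_alloc_request(q, op, flags);
	else
		req = blk_mq_alloc_request_hctx(q, op, flags,
				qid ? qid - 1 : 0);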

CCing the NVMe list and folks there.

Hello James, Keith, Christoph and Sagi,

It looks like nvmf_connect_io_queue() is run in the following fragile way:

	for (i = 1; i < ctrl->ctrl.queue_count; i++) {
		...
		nvmf_connect_io_queue(&ctrl->ctrl, i);
		...
	}

The hardware queue index is passed to nvmf_connect_io_queue() directly,
and finally blk_mq_alloc_request_hctx() is called to allocate a request
for that queue; but all CPUs mapped to this hw queue may have gone offline
by the time blk_mq_alloc_request_hctx() runs, so what is the supposed way
to handle this situation?

Is it fine to simply return failure from blk_mq_alloc_request_hctx()
in this situation? But the CPU may come online again later...

Thanks,
Ming

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] blk-mq: simplify queue mapping & schedule with each possisble CPU
  2018-01-17  6:22                   ` Ming Lei
@ 2018-01-17  8:09                     ` jianchao.wang
  -1 siblings, 0 replies; 33+ messages in thread
From: jianchao.wang @ 2018-01-17  8:09 UTC (permalink / raw)
  To: Ming Lei
  Cc: linux-block, Keith Busch, Sagi Grimberg, Christoph Hellwig,
	Stefan Haberland, linux-kernel, linux-nvme, James Smart,
	Jens Axboe, Christian Borntraeger, Thomas Gleixner,
	Christoph Hellwig

Hi Ming,

Thanks for your kind response.

On 01/17/2018 02:22 PM, Ming Lei wrote:
> This warning can't be removed completely, for example, the CPU figured
> in blk_mq_hctx_next_cpu(hctx) can be put on again just after the
> following call returns and before __blk_mq_run_hw_queue() is scheduled
> to run.
> 
> 	kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work, msecs_to_jiffies(msecs))
We could check cpu_active() in __blk_mq_run_hw_queue() to narrow the window.
There is a big gap between cpu_online and cpu_active; rebind_workers() also runs between them.
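
That is, just as a sketch of the idea (untested):

	WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
		cpu_active(hctx->next_cpu));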

> 
> Just be curious how you trigger this issue? And is it triggered in CPU
> hotplug stress test? Or in a normal use case?

In fact, this came out of my own investigation into whether the .queue_rq of one
hardware queue could be executed on a CPU it is not mapped to; I finally found
this hole with CPU hotplug. I did the test on an NVMe device, which has a
1-to-1 mapping between CPUs and hctxs:
 - a special patch that can hold some requests on ctx->rq_list through .get_budget
 - a script that issues IOs with fio
 - a script that onlines/offlines the CPUs continuously
At first there was just the warning above; then, after this patch was introduced, the panic came up.

Thanks
Jianchao

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] blk-mq: simplify queue mapping & schedule with each possisble CPU
  2018-01-17  8:09                     ` jianchao.wang
@ 2018-01-17  9:57                       ` Ming Lei
  -1 siblings, 0 replies; 33+ messages in thread
From: Ming Lei @ 2018-01-17  9:57 UTC (permalink / raw)
  To: jianchao.wang
  Cc: linux-block, Keith Busch, Sagi Grimberg, Christoph Hellwig,
	Stefan Haberland, linux-kernel, linux-nvme, James Smart,
	Jens Axboe, Christian Borntraeger, Thomas Gleixner,
	Christoph Hellwig

Hi Jianchao,

On Wed, Jan 17, 2018 at 04:09:11PM +0800, jianchao.wang wrote:
> Hi ming 
> 
> Thanks for your kindly response.
> 
> On 01/17/2018 02:22 PM, Ming Lei wrote:
> > This warning can't be removed completely, for example, the CPU figured
> > in blk_mq_hctx_next_cpu(hctx) can be put on again just after the
> > following call returns and before __blk_mq_run_hw_queue() is scheduled
> > to run.
> > 
> > 	kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work, msecs_to_jiffies(msecs))
> We could use cpu_active in __blk_mq_run_hw_queue() to narrow the window.
> There is a big gap between cpu_online and cpu_active. rebind_workers is also between them.

This warning is harmless, and I guess you can't reproduce it without the
help of your special patch, :-) so the window shouldn't be a big deal.

But the delay (msecs_to_jiffies(msecs)) passed to
kblockd_mod_delayed_work_on() can be a problem, because during that period:

1) hctx->next_cpu can go from offline to online before __blk_mq_run_hw_queue()
is run; your warning is triggered, but it is harmless.

2) hctx->next_cpu can go from online to offline before __blk_mq_run_hw_queue()
is run; there is no warning, but once the IO is submitted to hardware and
completes, how does the HBA/hw queue notify a CPU, given that the CPUs
assigned to this hw queue (irq vector) are offline? blk-mq's timeout
handler may cover that, but it looks too tricky.

> 
> > 
> > Just be curious how you trigger this issue? And is it triggered in CPU
> > hotplug stress test? Or in a normal use case?
> 
> In fact, this is my own investigation about whether the .queue_rq to one hardware queue could be executed on
> the cpu where it is not mapped. Finally, found this hole when cpu hotplug.
> I did the test on NVMe device which has 1-to-1 mapping between cpu and hctx.
>  - A special patch that could hold some requests on ctx->rq_list though .get_budget
>  - A script issues IOs with fio
>  - A script online/offline the cpus continuously

Thanks for sharing your reproduction approach.

Without a handler for CPU hotplug, it isn't easy to avoid the warning
completely in __blk_mq_run_hw_queue().

> At first, just the warning above. Then after this patch was introduced, panic came up.

We have to fix the panic, so I will post the patch you tested in this thread.

Thanks,
Ming

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] blk-mq: simplify queue mapping & schedule with each possisble CPU
  2018-01-17  9:57                       ` Ming Lei
@ 2018-01-17 10:07                         ` Christian Borntraeger
  -1 siblings, 0 replies; 33+ messages in thread
From: Christian Borntraeger @ 2018-01-17 10:07 UTC (permalink / raw)
  To: Ming Lei, jianchao.wang
  Cc: linux-block, Keith Busch, Sagi Grimberg, Christoph Hellwig,
	Stefan Haberland, linux-kernel, linux-nvme, James Smart,
	Jens Axboe, Thomas Gleixner, Christoph Hellwig



On 01/17/2018 10:57 AM, Ming Lei wrote:
> Hi Jianchao,
> 
> On Wed, Jan 17, 2018 at 04:09:11PM +0800, jianchao.wang wrote:
>> Hi ming 
>>
>> Thanks for your kindly response.
>>
>> On 01/17/2018 02:22 PM, Ming Lei wrote:
>>> This warning can't be removed completely, for example, the CPU figured
>>> in blk_mq_hctx_next_cpu(hctx) can be put on again just after the
>>> following call returns and before __blk_mq_run_hw_queue() is scheduled
>>> to run.
>>>
>>> 	kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work, msecs_to_jiffies(msecs))
>> We could use cpu_active in __blk_mq_run_hw_queue() to narrow the window.
>> There is a big gap between cpu_online and cpu_active. rebind_workers is also between them.
> 
> This warning is harmless, also you can't reproduce it without help of your
> special patch, I guess, :-) So the window shouldn't be a big deal. 

FWIW, every WARN_ON is problematic, since there are people running with panic_on_warn.
If a condition can legitimately happen, we should not use WARN_ON but something else.

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] blk-mq: simplify queue mapping & schedule with each possisble CPU
  2018-01-17 10:07                         ` Christian Borntraeger
@ 2018-01-17 10:14                           ` Christian Borntraeger
  -1 siblings, 0 replies; 33+ messages in thread
From: Christian Borntraeger @ 2018-01-17 10:14 UTC (permalink / raw)
  To: Ming Lei, jianchao.wang
  Cc: linux-block, Keith Busch, Sagi Grimberg, Christoph Hellwig,
	Stefan Haberland, linux-kernel, linux-nvme, James Smart,
	Jens Axboe, Thomas Gleixner, Christoph Hellwig



On 01/17/2018 11:07 AM, Christian Borntraeger wrote:
> 
> 
> On 01/17/2018 10:57 AM, Ming Lei wrote:
>> Hi Jianchao,
>>
>> On Wed, Jan 17, 2018 at 04:09:11PM +0800, jianchao.wang wrote:
>>> Hi ming 
>>>
>>> Thanks for your kindly response.
>>>
>>> On 01/17/2018 02:22 PM, Ming Lei wrote:
>>>> This warning can't be removed completely, for example, the CPU figured
>>>> in blk_mq_hctx_next_cpu(hctx) can be put on again just after the
>>>> following call returns and before __blk_mq_run_hw_queue() is scheduled
>>>> to run.
>>>>
>>>> 	kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work, msecs_to_jiffies(msecs))
>>> We could use cpu_active in __blk_mq_run_hw_queue() to narrow the window.
>>> There is a big gap between cpu_online and cpu_active. rebind_workers is also between them.
>>
>> This warning is harmless, also you can't reproduce it without help of your
>> special patch, I guess, :-) So the window shouldn't be a big deal. 
> 
> FWIW, every WARN_ON is problematic since there are people running with panic_on_warn.

To make it clearer: every WARN_ON that can fire in real life without actually being
an error is problematic.

> If a condition can happen we should not use WARN_ON but something else.
> 

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] blk-mq: simplify queue mapping & schedule with each possisble CPU
  2018-01-17 10:07                         ` Christian Borntraeger
@ 2018-01-17 10:17                           ` Ming Lei
  -1 siblings, 0 replies; 33+ messages in thread
From: Ming Lei @ 2018-01-17 10:17 UTC (permalink / raw)
  To: Christian Borntraeger
  Cc: jianchao.wang, linux-block, Keith Busch, Sagi Grimberg,
	Christoph Hellwig, Stefan Haberland, linux-kernel, linux-nvme,
	James Smart, Jens Axboe, Thomas Gleixner, Christoph Hellwig

On Wed, Jan 17, 2018 at 11:07:48AM +0100, Christian Borntraeger wrote:
> 
> 
> On 01/17/2018 10:57 AM, Ming Lei wrote:
> > Hi Jianchao,
> > 
> > On Wed, Jan 17, 2018 at 04:09:11PM +0800, jianchao.wang wrote:
> >> Hi ming 
> >>
> >> Thanks for your kindly response.
> >>
> >> On 01/17/2018 02:22 PM, Ming Lei wrote:
> >>> This warning can't be removed completely, for example, the CPU figured
> >>> in blk_mq_hctx_next_cpu(hctx) can be put on again just after the
> >>> following call returns and before __blk_mq_run_hw_queue() is scheduled
> >>> to run.
> >>>
> >>> 	kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work, msecs_to_jiffies(msecs))
> >> We could use cpu_active in __blk_mq_run_hw_queue() to narrow the window.
> >> There is a big gap between cpu_online and cpu_active. rebind_workers is also between them.
> > 
> > This warning is harmless, also you can't reproduce it without help of your
> > special patch, I guess, :-) So the window shouldn't be a big deal. 
> 
> FWIW, every WARN_ON is problematic since there are people running with panic_on_warn.
> If a condition can happen we should not use WARN_ON but something else.

Agreed, a printk() should be fine, IMO.
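
For example, something along these lines (only a sketch; the message text
is made up):

	if (!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
	    cpu_online(hctx->next_cpu))
		printk(KERN_WARNING "blk-mq: hctx %u run on a CPU not in its mask\n",
		       hctx->queue_num);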


-- 
Ming

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] blk-mq: simplify queue mapping & schedule with each possisble CPU
  2018-01-17  9:57                       ` Ming Lei
@ 2018-01-19  3:05                         ` jianchao.wang
  -1 siblings, 0 replies; 33+ messages in thread
From: jianchao.wang @ 2018-01-19  3:05 UTC (permalink / raw)
  To: Ming Lei
  Cc: Keith Busch, Sagi Grimberg, Christoph Hellwig, Jens Axboe,
	Stefan Haberland, linux-kernel, linux-nvme, James Smart,
	linux-block, Christian Borntraeger, Thomas Gleixner,
	Christoph Hellwig

Hi Ming,

Sorry for the delay in reporting this.

On 01/17/2018 05:57 PM, Ming Lei wrote:
> 2) hctx->next_cpu can become offline from online before __blk_mq_run_hw_queue
> is run, there isn't warning, but once the IO is submitted to hardware,
> after it is completed, how does the HBA/hw queue notify CPU since CPUs
> assigned to this hw queue(irq vector) are offline? blk-mq's timeout
> handler may cover that, but looks too tricky.

In theory, the irq affinity will be migrated to another CPU; this is done by
fixup_irqs() in the context of stop_machine.
However, in my test I found this log:

[  267.161043] do_IRQ: 7.33 No irq handler for vector

33 is the vector used by the nvme cq.
The irq seems to get lost, and sometimes an IO hang occurred.
It doesn't happen every time; I think maybe due to the nvme_process_cq in nvme_queue_rq.

I added a dump_stack() after the error log and got the following:
[  267.161043] do_IRQ: 7.33 No irq handler for vector migration/7
[  267.161045] CPU: 7 PID: 52 Comm: migration/7 Not tainted 4.15.0-rc7+ #27
[  267.161045] Hardware name: LENOVO 10MLS0E339/3106, BIOS M1AKT22A 06/27/2017
[  267.161046] Call Trace:
[  267.161047]  <IRQ>
[  267.161052]  dump_stack+0x7c/0xb5
[  267.161054]  do_IRQ+0xb9/0xf0
[  267.161056]  common_interrupt+0xa2/0xa2
[  267.161057]  </IRQ>
[  267.161059] RIP: 0010:multi_cpu_stop+0xb0/0x120
[  267.161060] RSP: 0018:ffffbb6c81af7e70 EFLAGS: 00000202 ORIG_RAX: ffffffffffffffde
[  267.161061] RAX: 0000000000000001 RBX: 0000000000000004 RCX: 0000000000000000
[  267.161062] RDX: 0000000000000006 RSI: ffffffff898c4591 RDI: 0000000000000202
[  267.161063] RBP: ffffbb6c826e7c88 R08: ffff991abc1256bc R09: 0000000000000005
[  267.161063] R10: ffffbb6c81af7db8 R11: ffffffff89c91d20 R12: 0000000000000001
[  267.161064] R13: ffffbb6c826e7cac R14: 0000000000000003 R15: 0000000000000000
[  267.161067]  ? cpu_stop_queue_work+0x90/0x90
[  267.161068]  cpu_stopper_thread+0x83/0x100
[  267.161070]  smpboot_thread_fn+0x161/0x220
[  267.161072]  kthread+0xf5/0x130
[  267.161073]  ? sort_range+0x20/0x20
[  267.161074]  ? kthread_associate_blkcg+0xe0/0xe0
[  267.161076]  ret_from_fork+0x24/0x30

The irq arrived just after interrupts were re-enabled in multi_cpu_stop().

0xffffffff8112d655 is in multi_cpu_stop (/home/will/u04/source_code/linux-block/kernel/stop_machine.c:223).
218				 */
219				touch_nmi_watchdog();
220			}
221		} while (curstate != MULTI_STOP_EXIT);
222	
223		local_irq_restore(flags);
224		return err;
225	}


Thanks
Jianchao

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/2] blk-mq: simplify queue mapping & schedule with each possisble CPU
  2018-01-19  3:05                         ` jianchao.wang
@ 2018-01-26  9:31                           ` Ming Lei
  -1 siblings, 0 replies; 33+ messages in thread
From: Ming Lei @ 2018-01-26  9:31 UTC (permalink / raw)
  To: jianchao.wang
  Cc: Keith Busch, Sagi Grimberg, Christoph Hellwig, Jens Axboe,
	Stefan Haberland, linux-kernel, linux-nvme, James Smart,
	linux-block, Christian Borntraeger, Thomas Gleixner,
	Christoph Hellwig

Hi Jianchao,

On Fri, Jan 19, 2018 at 11:05:35AM +0800, jianchao.wang wrote:
> Hi ming
> 
> Sorry for delayed report this.
> 
> On 01/17/2018 05:57 PM, Ming Lei wrote:
> > 2) hctx->next_cpu can become offline from online before __blk_mq_run_hw_queue
> > is run, there isn't warning, but once the IO is submitted to hardware,
> > after it is completed, how does the HBA/hw queue notify CPU since CPUs
> > assigned to this hw queue(irq vector) are offline? blk-mq's timeout
> > handler may cover that, but looks too tricky.
> 
> In theory, the irq affinity will be migrated to other cpu. This is done by

Yes, but the other CPU has to belong to this irq's affinity, and if all
CPUs in the irq's affinity are DEAD, this irq vector will be shut down;
then, if there is (or will be) in-flight IO, the completions for those
IOs won't be delivered to any CPU. For now it seems we depend on the
queue's timeout handler to handle them.

> fixup_irqs() in the context of stop_machine.


> However, in my test, I found this log:
> 
> [  267.161043] do_IRQ: 7.33 No irq handler for vector
> 
> The 33 is the vector used by nvme cq.
> The irq seems to be missed and sometimes IO hang occurred.

As I mentioned above, it shouldn't be strange to see this in a CPU
offline/online stress test.


-- 
Ming

^ permalink raw reply	[flat|nested] 33+ messages in thread

end of thread, other threads:[~2018-01-26  9:31 UTC | newest]

Thread overview: 33+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-01-12  2:53 [PATCH 0/2] blk-mq: support physical CPU hotplug Ming Lei
2018-01-12  2:53 ` [PATCH 1/2] genirq/affinity: assign vectors to all possible CPUs Ming Lei
2018-01-12 19:35   ` Thomas Gleixner
2018-01-12  2:53 ` [PATCH 2/2] blk-mq: simplify queue mapping & schedule with each possisble CPU Ming Lei
2018-01-16 10:00   ` Stefan Haberland
2018-01-16 10:12   ` jianchao.wang
2018-01-16 12:10     ` Ming Lei
2018-01-16 14:31       ` jianchao.wang
2018-01-16 15:11         ` jianchao.wang
2018-01-16 15:32         ` Ming Lei
2018-01-17  2:56           ` jianchao.wang
2018-01-17  3:52             ` Ming Lei
2018-01-17  5:24               ` jianchao.wang
2018-01-17  6:22                 ` Ming Lei
2018-01-17  8:09                   ` jianchao.wang
2018-01-17  9:57                     ` Ming Lei
2018-01-17 10:07                       ` Christian Borntraeger
2018-01-17 10:14                         ` Christian Borntraeger
2018-01-17 10:17                         ` Ming Lei
2018-01-19  3:05                       ` jianchao.wang
2018-01-26  9:31                         ` Ming Lei
2018-01-12  8:12 ` [PATCH 0/2] blk-mq: support physical CPU hotplug Christian Borntraeger
2018-01-12 10:47   ` Johannes Thumshirn
2018-01-12 10:47     ` Johannes Thumshirn
2018-01-12 18:02 ` Jens Axboe
