linux-kernel.vger.kernel.org archive mirror
* [PATCH] blk-mq: update hctx->cpumask at cpu-hotplug
@ 2019-06-24 15:24 Wenbin Zeng
  2019-06-25  1:30 ` Dongli Zhang
  2019-06-25  1:55 ` [PATCH] blk-mq: update hctx->cpumask at cpu-hotplug Ming Lei
  0 siblings, 2 replies; 8+ messages in thread
From: Wenbin Zeng @ 2019-06-24 15:24 UTC (permalink / raw)
  To: axboe, keith.busch, hare, ming.lei, osandov, sagi, bvanassche
  Cc: linux-block, linux-kernel, Wenbin Zeng

Currently hctx->cpumask is not updated when new cpus are hot-plugged.
Because kblockd_mod_delayed_work_on() often gets called with
WORK_CPU_UNBOUND, the blk_mq_run_work_fn work item may run on a
newly-plugged cpu, and __blk_mq_run_hw_queue() then reports excessive
"run queue from wrong CPU" messages because
cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) returns false.

This patch adds a cpu-hotplug handler to blk-mq that updates
hctx->cpumask at cpu-hotplug time.

Signed-off-by: Wenbin Zeng <wenbinzeng@tencent.com>
---
 block/blk-mq.c         | 29 +++++++++++++++++++++++++++++
 include/linux/blk-mq.h |  1 +
 2 files changed, 30 insertions(+)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index ce0f5f4..2e465fc 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -39,6 +39,8 @@
 #include "blk-mq-sched.h"
 #include "blk-rq-qos.h"
 
+static enum cpuhp_state cpuhp_blk_mq_online;
+
 static void blk_mq_poll_stats_start(struct request_queue *q);
 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
 
@@ -2215,6 +2217,21 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 	return -ENOMEM;
 }
 
+static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
+{
+	struct blk_mq_hw_ctx *hctx;
+
+	hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_online);
+
+	if (!cpumask_test_cpu(cpu, hctx->cpumask)) {
+		mutex_lock(&hctx->queue->sysfs_lock);
+		cpumask_set_cpu(cpu, hctx->cpumask);
+		mutex_unlock(&hctx->queue->sysfs_lock);
+	}
+
+	return 0;
+}
+
 /*
  * 'cpu' is going away. splice any existing rq_list entries from this
  * software queue to the hw queue dispatch list, and ensure that it
@@ -2251,6 +2268,9 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
 
 static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
 {
+	if (cpuhp_blk_mq_online > 0)
+		cpuhp_state_remove_instance_nocalls(cpuhp_blk_mq_online,
+						    &hctx->cpuhp_online);
 	cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
 					    &hctx->cpuhp_dead);
 }
@@ -2310,6 +2330,9 @@ static int blk_mq_init_hctx(struct request_queue *q,
 {
 	hctx->queue_num = hctx_idx;
 
+	if (cpuhp_blk_mq_online > 0)
+		cpuhp_state_add_instance_nocalls(cpuhp_blk_mq_online,
+						 &hctx->cpuhp_online);
 	cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
 
 	hctx->tags = set->tags[hctx_idx];
@@ -3544,6 +3567,12 @@ unsigned int blk_mq_rq_cpu(struct request *rq)
 
 static int __init blk_mq_init(void)
 {
+	cpuhp_blk_mq_online = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
+					"block/mq:online",
+					blk_mq_hctx_notify_online, NULL);
+	if (cpuhp_blk_mq_online <= 0)
+		pr_warn("blk_mq_init: failed to setup cpu online callback\n");
+
 	cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
 				blk_mq_hctx_notify_dead);
 	return 0;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 15d1aa5..5241659 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -58,6 +58,7 @@ struct blk_mq_hw_ctx {
 
 	atomic_t		nr_active;
 
+	struct hlist_node	cpuhp_online;
 	struct hlist_node	cpuhp_dead;
 	struct kobject		kobj;
 
-- 
1.8.3.1


* Re: [PATCH] blk-mq: update hctx->cpumask at cpu-hotplug
  2019-06-24 15:24 [PATCH] blk-mq: update hctx->cpumask at cpu-hotplug Wenbin Zeng
@ 2019-06-25  1:30 ` Dongli Zhang
  2019-06-25  2:21   ` [PATCH] blk-mq: update hctx->cpumask at cpu-hotplug(Internet mail) wenbinzeng(曾文斌)
  2019-06-25  1:55 ` [PATCH] blk-mq: update hctx->cpumask at cpu-hotplug Ming Lei
  1 sibling, 1 reply; 8+ messages in thread
From: Dongli Zhang @ 2019-06-25  1:30 UTC (permalink / raw)
  To: Wenbin Zeng
  Cc: axboe, keith.busch, hare, ming.lei, osandov, sagi, bvanassche,
	linux-block, linux-kernel, Wenbin Zeng

Hi Wenbin,

On 6/24/19 11:24 PM, Wenbin Zeng wrote:
> Currently hctx->cpumask is not updated when hot-plugging new cpus,
> as there are many chances kblockd_mod_delayed_work_on() getting
> called with WORK_CPU_UNBOUND, workqueue blk_mq_run_work_fn may run
> on the newly-plugged cpus, consequently __blk_mq_run_hw_queue()
> reporting excessive "run queue from wrong CPU" messages because
> cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) returns false.
> 
> This patch added a cpu-hotplug handler into blk-mq, updating
> hctx->cpumask at cpu-hotplug.
> 
> Signed-off-by: Wenbin Zeng <wenbinzeng@tencent.com>
> ---
>  block/blk-mq.c         | 29 +++++++++++++++++++++++++++++
>  include/linux/blk-mq.h |  1 +
>  2 files changed, 30 insertions(+)
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index ce0f5f4..2e465fc 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -39,6 +39,8 @@
>  #include "blk-mq-sched.h"
>  #include "blk-rq-qos.h"
>  
> +static enum cpuhp_state cpuhp_blk_mq_online;
> +
>  static void blk_mq_poll_stats_start(struct request_queue *q);
>  static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
>  
> @@ -2215,6 +2217,21 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
>  	return -ENOMEM;
>  }
>  
> +static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
> +{
> +	struct blk_mq_hw_ctx *hctx;
> +
> +	hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_online);
> +
> +	if (!cpumask_test_cpu(cpu, hctx->cpumask)) {
> +		mutex_lock(&hctx->queue->sysfs_lock);
> +		cpumask_set_cpu(cpu, hctx->cpumask);
> +		mutex_unlock(&hctx->queue->sysfs_lock);
> +	}
> +
> +	return 0;
> +}
> +

Since this callback is registered per hctx, it is called once for each
hctx when a cpu comes online.

Take a 4-queue nvme device as an example (setting aside other block
devices such as loop). Suppose cpu=2 (out of 0, 1, 2 and 3) is offline.
When we online cpu=2:

blk_mq_hctx_notify_online() called: cpu=2 and blk_mq_hw_ctx->queue_num=3
blk_mq_hctx_notify_online() called: cpu=2 and blk_mq_hw_ctx->queue_num=2
blk_mq_hctx_notify_online() called: cpu=2 and blk_mq_hw_ctx->queue_num=1
blk_mq_hctx_notify_online() called: cpu=2 and blk_mq_hw_ctx->queue_num=0

There is no need to set cpu 2 for blk_mq_hw_ctx->queue_num=[3, 1, 0];
I am afraid this patch would erroneously set the cpumask for those
hctxs.
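
For illustration only, here is a rough, untested sketch of how the callback
could skip hctxs that the cpu is not actually mapped to, assuming the
HCTX_TYPE_DEFAULT queue map is the authoritative mapping (the _sketch name
is made up for this example):

static int blk_mq_hctx_notify_online_sketch(unsigned int cpu,
                                            struct hlist_node *node)
{
        struct blk_mq_hw_ctx *hctx =
                hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_online);
        struct blk_mq_tag_set *set = hctx->queue->tag_set;

        /* Only touch the hctx that this cpu is mapped to. */
        if (set->map[HCTX_TYPE_DEFAULT].mq_map[cpu] != hctx->queue_num)
                return 0;

        if (!cpumask_test_cpu(cpu, hctx->cpumask)) {
                mutex_lock(&hctx->queue->sysfs_lock);
                cpumask_set_cpu(cpu, hctx->cpumask);
                mutex_unlock(&hctx->queue->sysfs_lock);
        }

        return 0;
}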

I previously submitted the patch below, which explains the same issue
for the cpu-removal case; unfortunately it has not been merged yet.

https://patchwork.kernel.org/patch/10889307/


Another thing: during initialization, hctx->cpumask should already be
set, even when the cpu is offline. Would you please explain the case in
which hctx->cpumask is not set correctly, e.g., how to reproduce it with
a kvm guest running scsi/virtio/nvme/loop?

Dongli Zhang

* Re: [PATCH] blk-mq: update hctx->cpumask at cpu-hotplug
  2019-06-24 15:24 [PATCH] blk-mq: update hctx->cpumask at cpu-hotplug Wenbin Zeng
  2019-06-25  1:30 ` Dongli Zhang
@ 2019-06-25  1:55 ` Ming Lei
  2019-06-25  2:14   ` [PATCH] blk-mq: update hctx->cpumask at cpu-hotplug(Internet mail) wenbinzeng(曾文斌)
  1 sibling, 1 reply; 8+ messages in thread
From: Ming Lei @ 2019-06-25  1:55 UTC (permalink / raw)
  To: Wenbin Zeng
  Cc: axboe, keith.busch, hare, osandov, sagi, bvanassche, linux-block,
	linux-kernel, Wenbin Zeng

On Mon, Jun 24, 2019 at 11:24:07PM +0800, Wenbin Zeng wrote:
> Currently hctx->cpumask is not updated when hot-plugging new cpus,
> as there are many chances kblockd_mod_delayed_work_on() getting
> called with WORK_CPU_UNBOUND, workqueue blk_mq_run_work_fn may run

There are only two cases in which WORK_CPU_UNBOUND is applied:

1) single hw queue

2) multiple hw queue, and all CPUs in this hctx become offline

For 1), all CPUs can be found in hctx->cpumask.
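
Roughly, the cpu selection behind this can be pictured with the following
simplified sketch of blk_mq_hctx_next_cpu() (paraphrased, not the exact
upstream code; the _sketch name is made up):

static int blk_mq_hctx_next_cpu_sketch(struct blk_mq_hw_ctx *hctx)
{
        int cpu;

        /* Case 1: single hw queue, unbound work is fine. */
        if (hctx->queue->nr_hw_queues == 1)
                return WORK_CPU_UNBOUND;

        /* Otherwise pick the next online cpu belonging to this hctx. */
        cpu = cpumask_next_and(hctx->next_cpu, hctx->cpumask,
                               cpu_online_mask);
        if (cpu < nr_cpu_ids) {
                hctx->next_cpu = cpu;
                return cpu;
        }

        /* Case 2: every cpu mapped to this hctx is offline. */
        return WORK_CPU_UNBOUND;
}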

> on the newly-plugged cpus, consequently __blk_mq_run_hw_queue()
> reporting excessive "run queue from wrong CPU" messages because
> cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) returns false.

The message means a CPU hotplug race has been triggered.
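
For reference, the message comes from a check along these lines in
__blk_mq_run_hw_queue() (paraphrased sketch, not the exact upstream code):

        if (!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
            cpu_online(hctx->next_cpu)) {
                printk(KERN_WARNING "run queue from wrong CPU %d, hctx %s\n",
                       raw_smp_processor_id(),
                       cpumask_empty(hctx->cpumask) ? "inactive" : "active");
                dump_stack();
        }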

Yeah, there is a big problem in blk_mq_hctx_notify_dead(): it is called
after one CPU is already dead, but this hw queue is still run to
dispatch requests, and by then all CPUs in this hctx might be offline.

We have some discussion before on this issue:

https://lore.kernel.org/linux-block/CACVXFVN729SgFQGUgmu1iN7P6Mv5+puE78STz8hj9J5bS828Ng@mail.gmail.com/

> 
> This patch added a cpu-hotplug handler into blk-mq, updating
> hctx->cpumask at cpu-hotplug.

This way isn't correct; hctx->cpumask should be kept in sync with the
queue mapping.

Thanks,
Ming

* RE: [PATCH] blk-mq: update hctx->cpumask at cpu-hotplug(Internet mail)
  2019-06-25  1:55 ` [PATCH] blk-mq: update hctx->cpumask at cpu-hotplug Ming Lei
@ 2019-06-25  2:14   ` wenbinzeng(曾文斌)
  2019-06-25  2:27     ` Ming Lei
  0 siblings, 1 reply; 8+ messages in thread
From: wenbinzeng(曾文斌) @ 2019-06-25  2:14 UTC (permalink / raw)
  To: Ming Lei, Wenbin Zeng
  Cc: axboe, keith.busch, hare, osandov, sagi, bvanassche, linux-block,
	linux-kernel

Hi Ming,

> -----Original Message-----
> From: Ming Lei <ming.lei@redhat.com>
> Sent: Tuesday, June 25, 2019 9:55 AM
> To: Wenbin Zeng <wenbin.zeng@gmail.com>
> Cc: axboe@kernel.dk; keith.busch@intel.com; hare@suse.com; osandov@fb.com;
> sagi@grimberg.me; bvanassche@acm.org; linux-block@vger.kernel.org;
> linux-kernel@vger.kernel.org; wenbinzeng(曾文斌) <wenbinzeng@tencent.com>
> Subject: Re: [PATCH] blk-mq: update hctx->cpumask at cpu-hotplug(Internet mail)
> 
> On Mon, Jun 24, 2019 at 11:24:07PM +0800, Wenbin Zeng wrote:
> > Currently hctx->cpumask is not updated when hot-plugging new cpus,
> > as there are many chances kblockd_mod_delayed_work_on() getting
> > called with WORK_CPU_UNBOUND, workqueue blk_mq_run_work_fn may run
> 
> There are only two cases in which WORK_CPU_UNBOUND is applied:
> 
> 1) single hw queue
> 
> 2) multiple hw queue, and all CPUs in this hctx become offline
> 
> For 1), all CPUs can be found in hctx->cpumask.
> 
> > on the newly-plugged cpus, consequently __blk_mq_run_hw_queue()
> > reporting excessive "run queue from wrong CPU" messages because
> > cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) returns false.
> 
> The message means CPU hotplug race is triggered.
> 
> Yeah, there is big problem in blk_mq_hctx_notify_dead() which is called
> after one CPU is dead, but still run this hw queue to dispatch request,
> and all CPUs in this hctx might become offline.
> 
> We have some discussion before on this issue:
> 
> https://lore.kernel.org/linux-block/CACVXFVN729SgFQGUgmu1iN7P6Mv5+puE78STz8hj
> 9J5bS828Ng@mail.gmail.com/
> 

There is another scenario: you can reproduce it by hot-plugging cpus into a kvm guest via the qemu monitor (I believe 'virsh setvcpus --live' can do the same thing), for example:
(qemu) cpu-add 1
(qemu) cpu-add 2
(qemu) cpu-add 3

In such a scenario, cpus 1, 2 and 3 are not visible at boot, and hctx->cpumask does not get synced when these cpus are added.
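
For reference, the per-hctx cpu mapping can be inspected from inside the
guest via the standard blk-mq sysfs/debugfs files, e.g. (the device name
vda is just an example):

# cat /sys/block/vda/mq/0/cpu_list
# cat /sys/kernel/debug/block/vda/hctx0/cpus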

> >
> > This patch added a cpu-hotplug handler into blk-mq, updating
> > hctx->cpumask at cpu-hotplug.
> 
> This way isn't correct, hctx->cpumask should be kept as sync with
> queue mapping.

Please advise what I should do to deal with the above situation. Thanks a lot.

> 
> Thanks,
> Ming


* RE: [PATCH] blk-mq: update hctx->cpumask at cpu-hotplug(Internet mail)
  2019-06-25  1:30 ` Dongli Zhang
@ 2019-06-25  2:21   ` wenbinzeng(曾文斌)
  0 siblings, 0 replies; 8+ messages in thread
From: wenbinzeng(曾文斌) @ 2019-06-25  2:21 UTC (permalink / raw)
  To: Dongli Zhang, Wenbin Zeng
  Cc: axboe, keith.busch, hare, ming.lei, osandov, sagi, bvanassche,
	linux-block, linux-kernel

Hi Dongli,

> -----Original Message-----
> From: Dongli Zhang <dongli.zhang@oracle.com>
> Sent: Tuesday, June 25, 2019 9:30 AM
> To: Wenbin Zeng <wenbin.zeng@gmail.com>
> Cc: axboe@kernel.dk; keith.busch@intel.com; hare@suse.com; ming.lei@redhat.com;
> osandov@fb.com; sagi@grimberg.me; bvanassche@acm.org;
> linux-block@vger.kernel.org; linux-kernel@vger.kernel.org; wenbinzeng(曾文斌)
> <wenbinzeng@tencent.com>
> Subject: Re: [PATCH] blk-mq: update hctx->cpumask at cpu-hotplug(Internet mail)
> 
> Hi Wenbin,
> 
> On 6/24/19 11:24 PM, Wenbin Zeng wrote:
> > Currently hctx->cpumask is not updated when hot-plugging new cpus,
> > as there are many chances kblockd_mod_delayed_work_on() getting
> > called with WORK_CPU_UNBOUND, workqueue blk_mq_run_work_fn may run
> > on the newly-plugged cpus, consequently __blk_mq_run_hw_queue()
> > reporting excessive "run queue from wrong CPU" messages because
> > cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) returns false.
> >
> > This patch added a cpu-hotplug handler into blk-mq, updating
> > hctx->cpumask at cpu-hotplug.
> >
> > Signed-off-by: Wenbin Zeng <wenbinzeng@tencent.com>
> > ---
> >  block/blk-mq.c         | 29 +++++++++++++++++++++++++++++
> >  include/linux/blk-mq.h |  1 +
> >  2 files changed, 30 insertions(+)
> >
> > diff --git a/block/blk-mq.c b/block/blk-mq.c
> > index ce0f5f4..2e465fc 100644
> > --- a/block/blk-mq.c
> > +++ b/block/blk-mq.c
> > @@ -39,6 +39,8 @@
> >  #include "blk-mq-sched.h"
> >  #include "blk-rq-qos.h"
> >
> > +static enum cpuhp_state cpuhp_blk_mq_online;
> > +
> >  static void blk_mq_poll_stats_start(struct request_queue *q);
> >  static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
> >
> > @@ -2215,6 +2217,21 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct
> blk_mq_tags *tags,
> >  	return -ENOMEM;
> >  }
> >
> > +static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
> > +{
> > +	struct blk_mq_hw_ctx *hctx;
> > +
> > +	hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_online);
> > +
> > +	if (!cpumask_test_cpu(cpu, hctx->cpumask)) {
> > +		mutex_lock(&hctx->queue->sysfs_lock);
> > +		cpumask_set_cpu(cpu, hctx->cpumask);
> > +		mutex_unlock(&hctx->queue->sysfs_lock);
> > +	}
> > +
> > +	return 0;
> > +}
> > +
> 
> As this callback is registered for each hctx, when a cpu is online, it is called
> for each hctx.
> 
> Just taking a 4-queue nvme as example (regardless about other block like loop).
> Suppose cpu=2 (out of 0, 1, 2 and 3) is offline. When we online cpu=2,
> 
> blk_mq_hctx_notify_online() called: cpu=2 and blk_mq_hw_ctx->queue_num=3
> blk_mq_hctx_notify_online() called: cpu=2 and blk_mq_hw_ctx->queue_num=2
> blk_mq_hctx_notify_online() called: cpu=2 and blk_mq_hw_ctx->queue_num=1
> blk_mq_hctx_notify_online() called: cpu=2 and blk_mq_hw_ctx->queue_num=0
> 
> There is no need to set cpu 2 for blk_mq_hw_ctx->queue_num=[3, 1, 0]. I am
> afraid this patch would erroneously set cpumask for blk_mq_hw_ctx->queue_num=[3,
> 1, 0].
> 
> I used to submit the below patch explaining above for removing a cpu and it is
> unfortunately not merged yet.
> 
> https://patchwork.kernel.org/patch/10889307/
> 
> 
> Another thing is during initialization, the hctx->cpumask should already been
> set and even the cpu is offline. Would you please explain the case hctx->cpumask
> is not set correctly, e.g., how to reproduce with a kvm guest running
> scsi/virtio/nvme/loop?

My scenario is:
A kvm guest is started with a single cpu, so during initialization only one cpu is visible to the kernel.
After boot, I hot-add some cpus via the qemu monitor (I believe 'virsh setvcpus --live' can do the same thing), for example:
(qemu) cpu-add 1
(qemu) cpu-add 2
(qemu) cpu-add 3

In such a scenario, hctx->cpumask does not get updated when these cpus are added.

> 
> Dongli Zhang


* Re: [PATCH] blk-mq: update hctx->cpumask at cpu-hotplug(Internet mail)
  2019-06-25  2:14   ` [PATCH] blk-mq: update hctx->cpumask at cpu-hotplug(Internet mail) wenbinzeng(曾文斌)
@ 2019-06-25  2:27     ` Ming Lei
  2019-06-25  2:51       ` Dongli Zhang
  2019-06-25  3:28       ` wenbinzeng(曾文斌)
  0 siblings, 2 replies; 8+ messages in thread
From: Ming Lei @ 2019-06-25  2:27 UTC (permalink / raw)
  To: wenbinzeng(曾文斌)
  Cc: Wenbin Zeng, axboe, keith.busch, hare, osandov, sagi, bvanassche,
	linux-block, linux-kernel

On Tue, Jun 25, 2019 at 02:14:46AM +0000, wenbinzeng(曾文斌) wrote:
> Hi Ming,
> 
> > -----Original Message-----
> > From: Ming Lei <ming.lei@redhat.com>
> > Sent: Tuesday, June 25, 2019 9:55 AM
> > To: Wenbin Zeng <wenbin.zeng@gmail.com>
> > Cc: axboe@kernel.dk; keith.busch@intel.com; hare@suse.com; osandov@fb.com;
> > sagi@grimberg.me; bvanassche@acm.org; linux-block@vger.kernel.org;
> > linux-kernel@vger.kernel.org; wenbinzeng(曾文斌) <wenbinzeng@tencent.com>
> > Subject: Re: [PATCH] blk-mq: update hctx->cpumask at cpu-hotplug(Internet mail)
> > 
> > On Mon, Jun 24, 2019 at 11:24:07PM +0800, Wenbin Zeng wrote:
> > > Currently hctx->cpumask is not updated when hot-plugging new cpus,
> > > as there are many chances kblockd_mod_delayed_work_on() getting
> > > called with WORK_CPU_UNBOUND, workqueue blk_mq_run_work_fn may run
> > 
> > There are only two cases in which WORK_CPU_UNBOUND is applied:
> > 
> > 1) single hw queue
> > 
> > 2) multiple hw queue, and all CPUs in this hctx become offline
> > 
> > For 1), all CPUs can be found in hctx->cpumask.
> > 
> > > on the newly-plugged cpus, consequently __blk_mq_run_hw_queue()
> > > reporting excessive "run queue from wrong CPU" messages because
> > > cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) returns false.
> > 
> > The message means CPU hotplug race is triggered.
> > 
> > Yeah, there is big problem in blk_mq_hctx_notify_dead() which is called
> > after one CPU is dead, but still run this hw queue to dispatch request,
> > and all CPUs in this hctx might become offline.
> > 
> > We have some discussion before on this issue:
> > 
> > https://lore.kernel.org/linux-block/CACVXFVN729SgFQGUgmu1iN7P6Mv5+puE78STz8hj
> > 9J5bS828Ng@mail.gmail.com/
> > 
> 
> There is another scenario, you can reproduce it by hot-plugging cpus to kvm guests via qemu monitor (I believe virsh setvcpus --live can do the same thing), for example:
> (qemu) cpu-add 1
> (qemu) cpu-add 2
> (qemu) cpu-add 3
> 
> In such scenario, cpu 1, 2 and 3 are not visible at boot, hctx->cpumask doesn't get synced when these cpus are added.

That is CPU cold-plug, and we are supposed to support it.

The newly added CPUs should already be visible to the hctx, since we
spread queues among all possible CPUs; please see blk_mq_map_queues()
and irq_build_affinity_masks(). This is effectively a static allocation
of CPU resources.

Otherwise, you might be using an old kernel, or there is a bug somewhere.
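
Conceptually, blk_mq_map_queues() does something like the following
simplified sketch (the real code also groups cpu siblings; this is not
the exact upstream code and the _sketch name is made up):

static void map_queues_sketch(struct blk_mq_queue_map *qmap)
{
        unsigned int cpu;

        /*
         * Every *possible* cpu, online or not, gets a hw queue assigned
         * when the map is built, so a later cpu hot-add does not need to
         * change hctx->cpumask.
         */
        for_each_possible_cpu(cpu)
                qmap->mq_map[cpu] = cpu % qmap->nr_queues;
}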

> 
> > >
> > > This patch added a cpu-hotplug handler into blk-mq, updating
> > > hctx->cpumask at cpu-hotplug.
> > 
> > This way isn't correct, hctx->cpumask should be kept as sync with
> > queue mapping.
> 
> Please advise what should I do to deal with the above situation? Thanks a lot.

As I shared in my last email, there is one approach under discussion
that seems doable.

Thanks,
Ming

* Re: [PATCH] blk-mq: update hctx->cpumask at cpu-hotplug(Internet mail)
  2019-06-25  2:27     ` Ming Lei
@ 2019-06-25  2:51       ` Dongli Zhang
  2019-06-25  3:28       ` wenbinzeng(曾文斌)
  1 sibling, 0 replies; 8+ messages in thread
From: Dongli Zhang @ 2019-06-25  2:51 UTC (permalink / raw)
  To: wenbinzeng(曾文斌)
  Cc: Ming Lei, Wenbin Zeng, axboe, keith.busch, hare, osandov, sagi,
	bvanassche, linux-block, linux-kernel



On 6/25/19 10:27 AM, Ming Lei wrote:
> On Tue, Jun 25, 2019 at 02:14:46AM +0000, wenbinzeng(曾文斌) wrote:
>> Hi Ming,
>>
>>> -----Original Message-----
>>> From: Ming Lei <ming.lei@redhat.com>
>>> Sent: Tuesday, June 25, 2019 9:55 AM
>>> To: Wenbin Zeng <wenbin.zeng@gmail.com>
>>> Cc: axboe@kernel.dk; keith.busch@intel.com; hare@suse.com; osandov@fb.com;
>>> sagi@grimberg.me; bvanassche@acm.org; linux-block@vger.kernel.org;
>>> linux-kernel@vger.kernel.org; wenbinzeng(曾文斌) <wenbinzeng@tencent.com>
>>> Subject: Re: [PATCH] blk-mq: update hctx->cpumask at cpu-hotplug(Internet mail)
>>>
>>> On Mon, Jun 24, 2019 at 11:24:07PM +0800, Wenbin Zeng wrote:
>>>> Currently hctx->cpumask is not updated when hot-plugging new cpus,
>>>> as there are many chances kblockd_mod_delayed_work_on() getting
>>>> called with WORK_CPU_UNBOUND, workqueue blk_mq_run_work_fn may run
>>>
>>> There are only two cases in which WORK_CPU_UNBOUND is applied:
>>>
>>> 1) single hw queue
>>>
>>> 2) multiple hw queue, and all CPUs in this hctx become offline
>>>
>>> For 1), all CPUs can be found in hctx->cpumask.
>>>
>>>> on the newly-plugged cpus, consequently __blk_mq_run_hw_queue()
>>>> reporting excessive "run queue from wrong CPU" messages because
>>>> cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) returns false.
>>>
>>> The message means CPU hotplug race is triggered.
>>>
>>> Yeah, there is big problem in blk_mq_hctx_notify_dead() which is called
>>> after one CPU is dead, but still run this hw queue to dispatch request,
>>> and all CPUs in this hctx might become offline.
>>>
>>> We have some discussion before on this issue:
>>>
>>> https://lore.kernel.org/linux-block/CACVXFVN729SgFQGUgmu1iN7P6Mv5+puE78STz8hj
>>> 9J5bS828Ng@mail.gmail.com/
>>>
>>
>> There is another scenario, you can reproduce it by hot-plugging cpus to kvm guests via qemu monitor (I believe virsh setvcpus --live can do the same thing), for example:
>> (qemu) cpu-add 1
>> (qemu) cpu-add 2
>> (qemu) cpu-add 3
>>
>> In such scenario, cpu 1, 2 and 3 are not visible at boot, hctx->cpumask doesn't get synced when these cpus are added.

Here is how I played with it using the most recent qemu and linux.

Boot the VM with 1 out of 4 vcpus online:

# qemu-system-x86_64 -hda disk.img \
-smp 1,maxcpus=4 \
-m 4096M -enable-kvm \
-device nvme,drive=lightnvme,serial=deadbeaf1 \
-drive file=nvme.img,if=none,id=lightnvme \
-vnc :0 \
-kernel /.../mainline-linux/arch/x86_64/boot/bzImage \
-append "root=/dev/sda1 init=/sbin/init text" \
-monitor stdio -net nic -net user,hostfwd=tcp::5022-:22


As Ming mentioned, after boot:

# cat /proc/cpuinfo  | grep processor
processor	: 0

# cat /sys/block/nvme0n1/mq/0/cpu_list
0
# cat /sys/block/nvme0n1/mq/1/cpu_list
1
# cat /sys/block/nvme0n1/mq/2/cpu_list
2
# cat /sys/block/nvme0n1/mq/3/cpu_list
3

# cat /proc/interrupts | grep nvme
 24:         11   PCI-MSI 65536-edge      nvme0q0
 25:         78   PCI-MSI 65537-edge      nvme0q1
 26:          0   PCI-MSI 65538-edge      nvme0q2
 27:          0   PCI-MSI 65539-edge      nvme0q3
 28:          0   PCI-MSI 65540-edge      nvme0q4

I hotplug a cpu with "device_add
qemu64-x86_64-cpu,id=core1,socket-id=1,core-id=0,thread-id=0" in the qemu monitor.
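
Note that after device_add the guest may still need to online the new vcpu
through the standard sysfs interface (unless udev does that automatically):

# echo 1 > /sys/devices/system/cpu/cpu1/online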

Dongli Zhang

> 
> It is CPU cold-plug, we suppose to support it.
> 
> The new added CPUs should be visible to hctx, since we spread queues
> among all possible CPUs(), please see blk_mq_map_queues() and
> irq_build_affinity_masks(), which is like static allocation on CPU
> resources.
> 
> Otherwise, you might use an old kernel or there is bug somewhere.
> 
>>
>>>>
>>>> This patch added a cpu-hotplug handler into blk-mq, updating
>>>> hctx->cpumask at cpu-hotplug.
>>>
>>> This way isn't correct, hctx->cpumask should be kept as sync with
>>> queue mapping.
>>
>> Please advise what should I do to deal with the above situation? Thanks a lot.
> 
> As I shared in last email, there is one approach discussed, which seems
> doable.
> 
> Thanks,
> Ming
> 

* RE: [PATCH] blk-mq: update hctx->cpumask at cpu-hotplug(Internet mail)
  2019-06-25  2:27     ` Ming Lei
  2019-06-25  2:51       ` Dongli Zhang
@ 2019-06-25  3:28       ` wenbinzeng(曾文斌)
  1 sibling, 0 replies; 8+ messages in thread
From: wenbinzeng(曾文斌) @ 2019-06-25  3:28 UTC (permalink / raw)
  To: Ming Lei
  Cc: Wenbin Zeng, axboe, keith.busch, hare, osandov, sagi, bvanassche,
	linux-block, linux-kernel

Hi Ming,

> -----Original Message-----
> From: Ming Lei <ming.lei@redhat.com>
> Sent: Tuesday, June 25, 2019 10:27 AM
> To: wenbinzeng(曾文斌) <wenbinzeng@tencent.com>
> Cc: Wenbin Zeng <wenbin.zeng@gmail.com>; axboe@kernel.dk; keith.busch@intel.com;
> hare@suse.com; osandov@fb.com; sagi@grimberg.me; bvanassche@acm.org;
> linux-block@vger.kernel.org; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH] blk-mq: update hctx->cpumask at cpu-hotplug(Internet mail)
> 
> On Tue, Jun 25, 2019 at 02:14:46AM +0000, wenbinzeng(曾文斌) wrote:
> > Hi Ming,
> >
> > > -----Original Message-----
> > > From: Ming Lei <ming.lei@redhat.com>
> > > Sent: Tuesday, June 25, 2019 9:55 AM
> > > To: Wenbin Zeng <wenbin.zeng@gmail.com>
> > > Cc: axboe@kernel.dk; keith.busch@intel.com; hare@suse.com;
> > > osandov@fb.com; sagi@grimberg.me; bvanassche@acm.org;
> > > linux-block@vger.kernel.org; linux-kernel@vger.kernel.org;
> > > wenbinzeng(曾文斌) <wenbinzeng@tencent.com>
> > > Subject: Re: [PATCH] blk-mq: update hctx->cpumask at
> > > cpu-hotplug(Internet mail)
> > >
> > > On Mon, Jun 24, 2019 at 11:24:07PM +0800, Wenbin Zeng wrote:
> > > > Currently hctx->cpumask is not updated when hot-plugging new cpus,
> > > > as there are many chances kblockd_mod_delayed_work_on() getting
> > > > called with WORK_CPU_UNBOUND, workqueue blk_mq_run_work_fn may run
> > >
> > > There are only two cases in which WORK_CPU_UNBOUND is applied:
> > >
> > > 1) single hw queue
> > >
> > > 2) multiple hw queue, and all CPUs in this hctx become offline
> > >
> > > For 1), all CPUs can be found in hctx->cpumask.
> > >
> > > > on the newly-plugged cpus, consequently __blk_mq_run_hw_queue()
> > > > reporting excessive "run queue from wrong CPU" messages because
> > > > cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) returns false.
> > >
> > > The message means CPU hotplug race is triggered.
> > >
> > > Yeah, there is big problem in blk_mq_hctx_notify_dead() which is
> > > called after one CPU is dead, but still run this hw queue to
> > > dispatch request, and all CPUs in this hctx might become offline.
> > >
> > > We have some discussion before on this issue:
> > >
> > > https://lore.kernel.org/linux-block/CACVXFVN729SgFQGUgmu1iN7P6Mv5+pu
> > > E78STz8hj
> > > 9J5bS828Ng@mail.gmail.com/
> > >
> >
> > There is another scenario, you can reproduce it by hot-plugging cpus to kvm guests
> via qemu monitor (I believe virsh setvcpus --live can do the same thing), for example:
> > (qemu) cpu-add 1
> > (qemu) cpu-add 2
> > (qemu) cpu-add 3
> >
> > In such scenario, cpu 1, 2 and 3 are not visible at boot, hctx->cpumask doesn't
> get synced when these cpus are added.
> 
> It is CPU cold-plug, we suppose to support it.
> 
> The new added CPUs should be visible to hctx, since we spread queues among all
> possible CPUs(), please see blk_mq_map_queues() and irq_build_affinity_masks(),
> which is like static allocation on CPU resources.
> 
> Otherwise, you might use an old kernel or there is bug somewhere.

It turns out that I was using an old kernel, version 4.14. I tested the latest version and it works well, as you said. Thank you very much.

> 
> >
> > > >
> > > > This patch added a cpu-hotplug handler into blk-mq, updating
> > > > hctx->cpumask at cpu-hotplug.
> > >
> > > This way isn't correct, hctx->cpumask should be kept as sync with
> > > queue mapping.
> >
> > Please advise what should I do to deal with the above situation? Thanks a lot.
> 
> As I shared in last email, there is one approach discussed, which seems doable.
> 
> Thanks,
> Ming

