From: Daniel Wagner <dwagner@suse.de>
To: Hannes Reinecke <hare@suse.de>
Cc: Sagi Grimberg <sagi@grimberg.me>, Ming Lei <ming.lei@redhat.com>,
Jens Axboe <axboe@kernel.dk>,
linux-block@vger.kernel.org, linux-nvme@lists.infradead.org,
Christoph Hellwig <hch@lst.de>, Wen Xiong <wenxiong@us.ibm.com>,
John Garry <john.garry@huawei.com>
Subject: Re: [PATCH 0/2] blk-mq: fix blk_mq_alloc_request_hctx
Date: Fri, 2 Jul 2021 11:47:56 +0200
Message-ID: <20210702094756.34zkja2yc6am3gyc@beryllium.lan>
In-Reply-To: <89081624-fedd-aa94-1ba2-9a137708a1f1@suse.de>
On Wed, Jun 30, 2021 at 09:46:35PM +0200, Hannes Reinecke wrote:
> Actually, Daniel and I came up with a slightly different idea: use a cpu
> hotplug notifier.
> Thing is, blk-mq already has a cpu hotplug notifier, which should ensure
> that no I/O is pending during cpu hotplug.
> If we now add an nvme cpu hotplug notifier which essentially kicks off a
> reset once all cpus in a hctx are offline, the reset logic will rearrange
> the queues to match the current cpu layout.
> And when the cpus are onlined again, we'll do another reset.
>
> Daniel is currently preparing a patch; let's see how it goes.
FTR, I can't say it was a huge success. I observed one real problem with
this idea and one 'maybe' problem. First, the real problem: we can't use
nvme_reset_ctrl_sync() in the cpuhp path, as it blocks in various places
on percpu_rwsem_wait operations [1]. Not really a surprise, I'd say.
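(For reference, roughly what the two variants do in drivers/nvme/host/core.c
- paraphrased, so take it as a sketch: nvme_reset_ctrl() only queues
reset_work, while the _sync variant additionally flushes it, and that flush
is what blocks in the cpuhp path.)

int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
{
	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
		return -EBUSY;
	if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
		return -EBUSY;
	return 0;
}

int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
{
	int ret;

	ret = nvme_reset_ctrl(ctrl);
	if (!ret) {
		/* this flush is where the cpuhp path gets stuck */
		flush_work(&ctrl->reset_work);
		if (ctrl->state != NVME_CTRL_LIVE)
			ret = -ENETRESET;
	}
	return ret;
}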
The 'maybe' problem: this time I ran the test with nvme_reset_ctrl()
instead, and it worked for a while (a couple of hours of FC port toggling
combined with stress cpu hotplugging). Eventually, though, the system
stopped responding and died. Unfortunately, I don't have a core dump, so
it's not entirely clear what happened.
BTW, I ran the same test setup with Ming's patches on top of my two. That
system was still working fine after 4 hours of stress testing.
[1] https://paste.opensuse.org/view/raw/62287389
For fun, here is the notifier patch (note this version still calls
nvme_reset_ctrl_sync()):
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 66973bb56305..444756ba8dbe 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -20,6 +20,7 @@
 #include <linux/ptrace.h>
 #include <linux/nvme_ioctl.h>
 #include <linux/pm_qos.h>
+#include <linux/cpuhotplug.h>
 #include <asm/unaligned.h>
 
 #include "nvme.h"
@@ -4245,6 +4246,8 @@ EXPORT_SYMBOL_GPL(nvme_start_ctrl);
 
 void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
 {
+	cpuhp_state_remove_instance_nocalls(CPUHP_AP_NVME_QUEUE_ONLINE,
+					    &ctrl->cpuhp_node);
 	nvme_hwmon_exit(ctrl);
 	nvme_fault_inject_fini(&ctrl->fault_inject);
 	dev_pm_qos_hide_latency_tolerance(ctrl->device);
@@ -4357,6 +4360,13 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 	if (ret)
 		goto out_free_name;
 
+	ret = cpuhp_state_add_instance_nocalls(CPUHP_AP_NVME_QUEUE_ONLINE,
+					       &ctrl->cpuhp_node);
+	if (ret) {
+		pr_debug("failed to add cpuhp notifier: %d\n", ret);
+		goto out_del_device;
+	}
+
 	/*
 	 * Initialize latency tolerance controls. The sysfs files won't
 	 * be visible to userspace unless the device actually supports APST.
@@ -4369,6 +4379,8 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 	nvme_mpath_init_ctrl(ctrl);
 
 	return 0;
+out_del_device:
+	cdev_device_del(&ctrl->cdev, ctrl->device);
 out_free_name:
 	nvme_put_ctrl(ctrl);
 	kfree_const(ctrl->device->kobj.name);
@@ -4529,6 +4541,80 @@ static inline void _nvme_check_size(void)
 	BUILD_BUG_ON(sizeof(struct nvme_directive_cmd) != 64);
 }
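 
+/* Reset the controller when an onlining CPU maps to one of its hctxs. */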
+static int nvme_notify_online(unsigned int cpu, struct hlist_node *node)
+{
+	struct nvme_ctrl *ctrl = hlist_entry_safe(node,
+					struct nvme_ctrl, cpuhp_node);
+	struct nvme_ns *ns;
+	struct blk_mq_hw_ctx *hctx;
+	unsigned int i;
+	bool present = false;
+
+	pr_warn("NQN %s CPU %u online\n", ctrl->opts->subsysnqn, cpu);
+
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list) {
+		queue_for_each_hw_ctx(ns->disk->queue, hctx, i) {
+			if (cpumask_test_cpu(cpu, hctx->cpumask)) {
+				present = true;
+				break;
+			}
+		}
+		if (present)
+			break;
+	}
+	up_read(&ctrl->namespaces_rwsem);
+	if (present) {
+		pr_warn("trigger reset...\n");
+		nvme_reset_ctrl_sync(ctrl);
+		pr_warn("executed\n");
+	}
+
+	return 0;
+}
+
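+/* Return true if @cpu is the last online CPU in @hctx's cpumask. */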
+static inline bool nvme_last_cpu_in_hctx(unsigned int cpu,
+					 struct blk_mq_hw_ctx *hctx)
+{
+	if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu)
+		return false;
+	if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids)
+		return false;
+	return true;
+}
+
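+/*
+ * Reset the controller when the last online CPU in one of its hctxs
+ * goes offline.
+ */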
+static int nvme_notify_offline(unsigned int cpu, struct hlist_node *node)
+{
+	struct nvme_ctrl *ctrl = hlist_entry_safe(node,
+					struct nvme_ctrl, cpuhp_node);
+	struct nvme_ns *ns;
+	struct blk_mq_hw_ctx *hctx;
+	unsigned int i;
+	bool offline = false;
+
+	pr_warn("NQN %s CPU %u offline\n", ctrl->opts->subsysnqn, cpu);
+
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list) {
+		queue_for_each_hw_ctx(ns->disk->queue, hctx, i) {
+			pr_warn("hctx %p cpumask %*pbl state 0x%lx\n",
+				hctx, cpumask_pr_args(hctx->cpumask),
+				hctx->state);
+			if (nvme_last_cpu_in_hctx(cpu, hctx)) {
+				offline = true;
+				break;
+			}
+		}
+		if (offline)
+			break;
+	}
+	up_read(&ctrl->namespaces_rwsem);
+	if (offline) {
+		pr_warn("trigger reset...\n");
+		nvme_reset_ctrl_sync(ctrl);
+		pr_warn("executed\n");
+	}
+	return 0;
+}
+
 static int __init nvme_core_init(void)
 {
@@ -4580,8 +4666,17 @@ static int __init nvme_core_init(void)
 		goto unregister_generic_ns;
 	}
 
+	result = cpuhp_setup_state_multi(CPUHP_AP_NVME_QUEUE_ONLINE,
+					 "nvme:online",
+					 nvme_notify_online,
+					 nvme_notify_offline);
+	if (result < 0)
+		goto destroy_chr_class;
+
 	return 0;
+destroy_chr_class:
+	class_destroy(nvme_ns_chr_class);
 unregister_generic_ns:
 	unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS);
 destroy_subsys_class:
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 0015860ec12b..34676ff79d40 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -352,6 +352,8 @@ struct nvme_ctrl {
 	unsigned long discard_page_busy;
 
 	struct nvme_fault_inject fault_inject;
+
+	struct hlist_node cpuhp_node;
 };
 
 enum nvme_iopolicy {
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 4a62b3980642..5f6cf60bee1f 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -159,6 +159,7 @@ enum cpuhp_state {
 	CPUHP_AP_X86_VDSO_VMA_ONLINE,
 	CPUHP_AP_IRQ_AFFINITY_ONLINE,
 	CPUHP_AP_BLK_MQ_ONLINE,
+	CPUHP_AP_NVME_QUEUE_ONLINE,
 	CPUHP_AP_ARM_MVEBU_SYNC_CLOCKS,
 	CPUHP_AP_X86_INTEL_EPB_ONLINE,
 	CPUHP_AP_PERF_ONLINE,
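
The run that survived a few hours differed only in firing the reset
asynchronously from the two notifiers; roughly (I don't have the exact
diff anymore):

	if (offline) {
		pr_warn("trigger reset...\n");
		/* only queue reset_work; never flush it from the cpuhp callback */
		nvme_reset_ctrl(ctrl);
	}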