From: Daniel Wagner <dwagner@suse.de>
To: Hannes Reinecke <hare@suse.de>
Cc: Sagi Grimberg <sagi@grimberg.me>, Ming Lei <ming.lei@redhat.com>,
	Jens Axboe <axboe@kernel.dk>,
	linux-block@vger.kernel.org, linux-nvme@lists.infradead.org,
	Christoph Hellwig <hch@lst.de>, Wen Xiong <wenxiong@us.ibm.com>,
	John Garry <john.garry@huawei.com>
Subject: Re: [PATCH 0/2] blk-mq: fix blk_mq_alloc_request_hctx
Date: Fri, 2 Jul 2021 11:47:56 +0200
Message-ID: <20210702094756.34zkja2yc6am3gyc@beryllium.lan>
In-Reply-To: <89081624-fedd-aa94-1ba2-9a137708a1f1@suse.de>

On Wed, Jun 30, 2021 at 09:46:35PM +0200, Hannes Reinecke wrote:
> Actually, Daniel and I came up with a slightly different idea: use a cpu
> hotplug notifier.
> Thing is, blk-mq already has a cpu hotplug notifier, which should ensure
> that no I/O is pending during cpu hotplug.
> If we now add an nvme cpu hotplug notifier which essentially kicks off a
> reset once all cpus in a hctx are offline, the reset logic will rearrange
> the queues to match the current cpu layout.
> And when the cpus come back online we'll do another reset.
> 
> Daniel is currently preparing a patch; let's see how it goes.

FTR, I can't say it was a huge success. I observed one real problem with
this idea and one 'maybe' problem. First, we can't use nvme_reset_ctrl_sync()
in the cpuhp path, as it blocks in various places on
percpu_rwsem_wait operations [1]. Not really a surprise, I'd say.

The 'maybe' problem: I reran the test with nvme_reset_ctrl() instead, and
that worked for a while (a couple of hours of FC port toggling combined
with stress CPU hotplugging). Eventually, though, the system stopped
responding and died. Unfortunately, I don't have a core dump, so it's not
entirely clear what happened.
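
To make the difference concrete: the nvme_reset_ctrl() variant only changes
the tail of the hotplug callbacks, queueing the reset instead of waiting for
it. A rough, untested sketch (nvme_ctrl_needs_reset() is just a made-up
placeholder for the hctx scan done in the patch below):

static int nvme_notify_offline_async(unsigned int cpu, struct hlist_node *node)
{
	struct nvme_ctrl *ctrl = hlist_entry_safe(node,
			struct nvme_ctrl, cpuhp_node);

	/* Placeholder: same "was this the last online CPU in any hctx"
	 * scan as nvme_notify_offline() in the patch below.
	 */
	if (!ctrl || !nvme_ctrl_needs_reset(ctrl, cpu))
		return 0;

	/* nvme_reset_ctrl() only queues ctrl->reset_work, so the cpuhp
	 * thread does not block waiting for the reset to finish.
	 */
	return nvme_reset_ctrl(ctrl);
}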

BTW, I ran the same test setup with Ming's patches applied on top of my
two. That system was still working fine after 4 hours of the stress
testing.

[1] https://paste.opensuse.org/view/raw/62287389

Just for fun, here is the cpuhp notifier patch (the version below still
calls nvme_reset_ctrl_sync() in the callbacks):

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 66973bb56305..444756ba8dbe 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -20,6 +20,7 @@
 #include <linux/ptrace.h>
 #include <linux/nvme_ioctl.h>
 #include <linux/pm_qos.h>
+#include <linux/cpuhotplug.h>
 #include <asm/unaligned.h>
 
 #include "nvme.h"
@@ -4245,6 +4246,8 @@ EXPORT_SYMBOL_GPL(nvme_start_ctrl);
 
 void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
 {
+	cpuhp_state_remove_instance_nocalls(CPUHP_AP_NVME_QUEUE_ONLINE,
+					    &ctrl->cpuhp_node);
 	nvme_hwmon_exit(ctrl);
 	nvme_fault_inject_fini(&ctrl->fault_inject);
 	dev_pm_qos_hide_latency_tolerance(ctrl->device);
@@ -4357,6 +4360,13 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 	if (ret)
 		goto out_free_name;
 
+	ret = cpuhp_state_add_instance_nocalls(CPUHP_AP_NVME_QUEUE_ONLINE,
+					       &ctrl->cpuhp_node);
+	if (ret) {
+		pr_debug("failed to add cpuhp notifier: %d\n", ret);
+		goto out_del_device;
+	}
+
 	/*
 	 * Initialize latency tolerance controls.  The sysfs files won't
 	 * be visible to userspace unless the device actually supports APST.
@@ -4369,6 +4379,8 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 	nvme_mpath_init_ctrl(ctrl);
 
 	return 0;
+out_del_device:
+	cdev_device_del(&ctrl->cdev, ctrl->device);
 out_free_name:
 	nvme_put_ctrl(ctrl);
 	kfree_const(ctrl->device->kobj.name);
@@ -4529,6 +4541,80 @@ static inline void _nvme_check_size(void)
 	BUILD_BUG_ON(sizeof(struct nvme_directive_cmd) != 64);
 }
 
+static int nvme_notify_online(unsigned int cpu, struct hlist_node *node)
+{
+	struct nvme_ctrl *ctrl = hlist_entry_safe(node,
+			struct nvme_ctrl, cpuhp_node);
+	struct nvme_ns *ns;
+	struct blk_mq_hw_ctx *hctx;
+	unsigned int i;
+	bool present = false;
+
+	pr_warn("NQN %s CPU %u online\n", ctrl->opts->subsysnqn, cpu);
+
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list) {
+		queue_for_each_hw_ctx(ns->disk->queue, hctx, i) {
+			if (cpumask_test_cpu(cpu, hctx->cpumask)) {
+				present = true;
+				break;
+			}
+		}
+		if (present)
+			break;
+	}
+	up_read(&ctrl->namespaces_rwsem);
+	if (present) {
+		pr_warn("trigger reset... \n");
+		nvme_reset_ctrl_sync(ctrl);
+		pr_warn("executed\n");
+	}
+
+	return 0;
+}
+
+static inline bool nvme_last_cpu_in_hctx(unsigned int cpu,
+					struct blk_mq_hw_ctx *hctx)
+{
+	if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu)
+		return false;
+	if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids)
+		return false;
+	return true;
+}
+
+static int nvme_notify_offline(unsigned int cpu, struct hlist_node *node)
+{
+	struct nvme_ctrl *ctrl = hlist_entry_safe(node,
+			struct nvme_ctrl, cpuhp_node);
+	struct nvme_ns *ns;
+	struct blk_mq_hw_ctx *hctx;
+	unsigned int i;
+	bool offline = false;
+
+	pr_warn("NQN %s CPU %u offline\n", ctrl->opts->subsysnqn, cpu);
+
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list) {
+		queue_for_each_hw_ctx(ns->disk->queue, hctx, i) {
+			pr_warn("hctx %p cpumask %*pbl state 0x%lx\n",
+				hctx, cpumask_pr_args(hctx->cpumask), hctx->state);
+			if (nvme_last_cpu_in_hctx(cpu, hctx)) {
+				offline = true;
+				break;
+			}
+		}
+		if (offline)
+			break;
+	}
+	up_read(&ctrl->namespaces_rwsem);
+	if (offline) {
+		pr_warn("trigger reset... \n");
+		nvme_reset_ctrl_sync(ctrl);
+		pr_warn("executed\n");
+	}
+	return 0;
+}
 
 static int __init nvme_core_init(void)
 {
@@ -4580,8 +4666,17 @@ static int __init nvme_core_init(void)
 		goto unregister_generic_ns;
 	}
 
+	result = cpuhp_setup_state_multi(CPUHP_AP_NVME_QUEUE_ONLINE,
+					 "nvme:online",
+					 nvme_notify_online,
+					 nvme_notify_offline);
+	if (result < 0)
+		goto destroy_chr_class;
+
 	return 0;
 
+destroy_chr_class:
+	class_destroy(nvme_ns_chr_class);
 unregister_generic_ns:
 	unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS);
 destroy_subsys_class:
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 0015860ec12b..34676ff79d40 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -352,6 +352,8 @@ struct nvme_ctrl {
 	unsigned long discard_page_busy;
 
 	struct nvme_fault_inject fault_inject;
+
+	struct hlist_node cpuhp_node;
 };
 
 enum nvme_iopolicy {
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 4a62b3980642..5f6cf60bee1f 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -159,6 +159,7 @@ enum cpuhp_state {
 	CPUHP_AP_X86_VDSO_VMA_ONLINE,
 	CPUHP_AP_IRQ_AFFINITY_ONLINE,
 	CPUHP_AP_BLK_MQ_ONLINE,
+	CPUHP_AP_NVME_QUEUE_ONLINE,
 	CPUHP_AP_ARM_MVEBU_SYNC_CLOCKS,
 	CPUHP_AP_X86_INTEL_EPB_ONLINE,
 	CPUHP_AP_PERF_ONLINE,
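
As background on the API the patch builds on: the multi-instance cpuhp
interface is registered once per driver with cpuhp_setup_state_multi(), and
then one instance is added per controller. A generic, self-contained sketch
of that pattern (not NVMe specific; it uses the dynamic CPUHP_AP_ONLINE_DYN
state instead of adding a new enum cpuhp_state entry, and all names are
illustrative):

#include <linux/cpuhotplug.h>
#include <linux/list.h>
#include <linux/module.h>

struct my_dev {
	struct hlist_node cpuhp_node;	/* links this instance into the state */
};

static enum cpuhp_state my_online_state;
static struct my_dev my_dev;

static int my_cpu_online(unsigned int cpu, struct hlist_node *node)
{
	struct my_dev *dev = hlist_entry_safe(node, struct my_dev, cpuhp_node);

	pr_info("dev %p: cpu %u went online\n", dev, cpu);
	return 0;
}

static int my_cpu_offline(unsigned int cpu, struct hlist_node *node)
{
	struct my_dev *dev = hlist_entry_safe(node, struct my_dev, cpuhp_node);

	pr_info("dev %p: cpu %u going offline\n", dev, cpu);
	return 0;
}

static int __init my_init(void)
{
	int ret;

	/* Register the callbacks once; a dynamic state avoids touching
	 * enum cpuhp_state and returns the allocated state number.
	 */
	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, "my_driver:online",
				      my_cpu_online, my_cpu_offline);
	if (ret < 0)
		return ret;
	my_online_state = ret;

	/* Then add one instance per device (per nvme_ctrl in the patch). */
	ret = cpuhp_state_add_instance_nocalls(my_online_state,
					       &my_dev.cpuhp_node);
	if (ret)
		cpuhp_remove_multi_state(my_online_state);
	return ret;
}

static void __exit my_exit(void)
{
	cpuhp_state_remove_instance_nocalls(my_online_state,
					    &my_dev.cpuhp_node);
	cpuhp_remove_multi_state(my_online_state);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");

The _nocalls variants, also used in the patch above, add or remove the
instance without invoking the callbacks for CPUs that are already online.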

