linux-nvme.lists.infradead.org archive mirror
 help / color / mirror / Atom feed
Search results ordered by [date|relevance]  view[summary|nested|Atom feed]
thread overview below | download mbox.gz: |
* [PATCHv2] nvme: use srcu for iterating namespace list
@ 2024-05-23 17:20  3% Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-05-23 17:20 UTC (permalink / raw)
  To: linux-nvme, shinichiro.kawasaki, hch, sagi; +Cc: axboe, Keith Busch

From: Keith Busch <kbusch@kernel.org>

The nvme pci driver synchronizes with all the namespace queues during a
reset to ensure that there's no pending timeout work.

Meanwhile the timeout work potentially iterates those same namespaces to
freeze their queues.

Each of those namespace iterations use the same read lock. If a write
lock should somehow get between the synchronize and freeze steps, then
forward progress is deadlocked.

We had been relying on the nvme controller state machine to ensure the
reset work wouldn't conflict with timeout work. That guarantee may be a
bit fragile to rely on, so iterate the namespace lists without taking a
lock to fix potential circular locks, as reported by lockdep.

Link: https://lore.kernel.org/all/20220930001943.zdbvolc3gkekfmcv@shindev/
Reported-by: Shinichiro Kawasaki <shinichiro.kawasaki@wdc.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
v2:
  Improved changelog

 drivers/nvme/host/core.c      | 97 +++++++++++++++++++++--------------
 drivers/nvme/host/ioctl.c     | 12 ++---
 drivers/nvme/host/multipath.c | 21 ++++----
 drivers/nvme/host/nvme.h      |  3 +-
 4 files changed, 78 insertions(+), 55 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 7706df2373494..b7cd46f3cef69 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3684,9 +3684,10 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
 struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 {
 	struct nvme_ns *ns, *ret = NULL;
+	int srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list) {
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
 		if (ns->head->ns_id == nsid) {
 			if (!nvme_get_ns(ns))
 				continue;
@@ -3696,7 +3697,7 @@ struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 		if (ns->head->ns_id > nsid)
 			break;
 	}
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 	return ret;
 }
 EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU);
@@ -3710,7 +3711,7 @@ static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns)
 
 	list_for_each_entry_reverse(tmp, &ns->ctrl->namespaces, list) {
 		if (tmp->head->ns_id < ns->head->ns_id) {
-			list_add(&ns->list, &tmp->list);
+			list_add_rcu(&ns->list, &tmp->list);
 			return;
 		}
 	}
@@ -3776,17 +3777,18 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
 	if (nvme_update_ns_info(ns, info))
 		goto out_unlink_ns;
 
-	down_write(&ctrl->namespaces_rwsem);
+	mutex_lock(&ctrl->namespaces_lock);
 	/*
 	 * Ensure that no namespaces are added to the ctrl list after the queues
 	 * are frozen, thereby avoiding a deadlock between scan and reset.
 	 */
 	if (test_bit(NVME_CTRL_FROZEN, &ctrl->flags)) {
-		up_write(&ctrl->namespaces_rwsem);
+		mutex_unlock(&ctrl->namespaces_lock);
 		goto out_unlink_ns;
 	}
 	nvme_ns_add_to_ctrl_list(ns);
-	up_write(&ctrl->namespaces_rwsem);
+	mutex_unlock(&ctrl->namespaces_lock);
+	synchronize_srcu(&ctrl->srcu);
 	nvme_get_ctrl(ctrl);
 
 	if (device_add_disk(ctrl->device, ns->disk, nvme_ns_attr_groups))
@@ -3809,9 +3811,10 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
 
  out_cleanup_ns_from_list:
 	nvme_put_ctrl(ctrl);
-	down_write(&ctrl->namespaces_rwsem);
-	list_del_init(&ns->list);
-	up_write(&ctrl->namespaces_rwsem);
+	mutex_lock(&ctrl->namespaces_lock);
+	list_del_rcu(&ns->list);
+	mutex_unlock(&ctrl->namespaces_lock);
+	synchronize_srcu(&ctrl->srcu);
  out_unlink_ns:
 	mutex_lock(&ctrl->subsys->lock);
 	list_del_rcu(&ns->siblings);
@@ -3861,9 +3864,10 @@ static void nvme_ns_remove(struct nvme_ns *ns)
 		nvme_cdev_del(&ns->cdev, &ns->cdev_device);
 	del_gendisk(ns->disk);
 
-	down_write(&ns->ctrl->namespaces_rwsem);
-	list_del_init(&ns->list);
-	up_write(&ns->ctrl->namespaces_rwsem);
+	mutex_lock(&ns->ctrl->namespaces_lock);
+	list_del_rcu(&ns->list);
+	mutex_unlock(&ns->ctrl->namespaces_lock);
+	synchronize_srcu(&ns->ctrl->srcu);
 
 	if (last_path)
 		nvme_mpath_shutdown_disk(ns->head);
@@ -3953,16 +3957,17 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
 	struct nvme_ns *ns, *next;
 	LIST_HEAD(rm_list);
 
-	down_write(&ctrl->namespaces_rwsem);
+	mutex_lock(&ctrl->namespaces_lock);
 	list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
 		if (ns->head->ns_id > nsid)
-			list_move_tail(&ns->list, &rm_list);
+			list_splice_init_rcu(&ns->list, &rm_list,
+					     synchronize_rcu);
 	}
-	up_write(&ctrl->namespaces_rwsem);
+	mutex_unlock(&ctrl->namespaces_lock);
+	synchronize_srcu(&ctrl->srcu);
 
 	list_for_each_entry_safe(ns, next, &rm_list, list)
 		nvme_ns_remove(ns);
-
 }
 
 static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
@@ -4132,9 +4137,10 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
 	/* this is a no-op when called from the controller reset handler */
 	nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);
 
-	down_write(&ctrl->namespaces_rwsem);
-	list_splice_init(&ctrl->namespaces, &ns_list);
-	up_write(&ctrl->namespaces_rwsem);
+	mutex_lock(&ctrl->namespaces_lock);
+	list_splice_init_rcu(&ctrl->namespaces, &ns_list, synchronize_rcu);
+	mutex_unlock(&ctrl->namespaces_lock);
+	synchronize_srcu(&ctrl->srcu);
 
 	list_for_each_entry_safe(ns, next, &ns_list, list)
 		nvme_ns_remove(ns);
@@ -4582,6 +4588,7 @@ static void nvme_free_ctrl(struct device *dev)
 	key_put(ctrl->tls_key);
 	nvme_free_cels(ctrl);
 	nvme_mpath_uninit(ctrl);
+	cleanup_srcu_struct(&ctrl->srcu);
 	nvme_auth_stop(ctrl);
 	nvme_auth_free(ctrl);
 	__free_page(ctrl->discard_page);
@@ -4614,10 +4621,15 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 	ctrl->passthru_err_log_enabled = false;
 	clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
 	spin_lock_init(&ctrl->lock);
+	mutex_init(&ctrl->namespaces_lock);
+
+	ret = init_srcu_struct(&ctrl->srcu);
+	if (ret)
+		return ret;
+
 	mutex_init(&ctrl->scan_lock);
 	INIT_LIST_HEAD(&ctrl->namespaces);
 	xa_init(&ctrl->cels);
-	init_rwsem(&ctrl->namespaces_rwsem);
 	ctrl->dev = dev;
 	ctrl->ops = ops;
 	ctrl->quirks = quirks;
@@ -4697,6 +4709,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 out:
 	if (ctrl->discard_page)
 		__free_page(ctrl->discard_page);
+	cleanup_srcu_struct(&ctrl->srcu);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(nvme_init_ctrl);
@@ -4705,22 +4718,24 @@ EXPORT_SYMBOL_GPL(nvme_init_ctrl);
 void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	int srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
 		blk_mark_disk_dead(ns->disk);
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 }
 EXPORT_SYMBOL_GPL(nvme_mark_namespaces_dead);
 
 void nvme_unfreeze(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	int srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
 		blk_mq_unfreeze_queue(ns->queue);
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 	clear_bit(NVME_CTRL_FROZEN, &ctrl->flags);
 }
 EXPORT_SYMBOL_GPL(nvme_unfreeze);
@@ -4728,14 +4743,15 @@ EXPORT_SYMBOL_GPL(nvme_unfreeze);
 int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
 {
 	struct nvme_ns *ns;
+	int srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list) {
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
 		timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout);
 		if (timeout <= 0)
 			break;
 	}
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 	return timeout;
 }
 EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
@@ -4743,23 +4759,25 @@ EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
 void nvme_wait_freeze(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	int srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
 		blk_mq_freeze_queue_wait(ns->queue);
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 }
 EXPORT_SYMBOL_GPL(nvme_wait_freeze);
 
 void nvme_start_freeze(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	int srcu_idx;
 
 	set_bit(NVME_CTRL_FROZEN, &ctrl->flags);
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
 		blk_freeze_queue_start(ns->queue);
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 }
 EXPORT_SYMBOL_GPL(nvme_start_freeze);
 
@@ -4802,11 +4820,12 @@ EXPORT_SYMBOL_GPL(nvme_unquiesce_admin_queue);
 void nvme_sync_io_queues(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	int srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
 		blk_sync_queue(ns->queue);
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 }
 EXPORT_SYMBOL_GPL(nvme_sync_io_queues);
 
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index 499a8bb7cac7d..0f05058692b55 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -789,16 +789,16 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp,
 		bool open_for_write)
 {
 	struct nvme_ns *ns;
-	int ret;
+	int ret, srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
 	if (list_empty(&ctrl->namespaces)) {
 		ret = -ENOTTY;
 		goto out_unlock;
 	}
 
-	ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list);
-	if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) {
+	ns = list_first_or_null_rcu(&ctrl->namespaces, struct nvme_ns, list);
+	if (!ns) {
 		dev_warn(ctrl->device,
 			"NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n");
 		ret = -EINVAL;
@@ -808,14 +808,14 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp,
 	dev_warn(ctrl->device,
 		"using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n");
 	kref_get(&ns->kref);
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 
 	ret = nvme_user_cmd(ctrl, ns, argp, 0, open_for_write);
 	nvme_put_ns(ns);
 	return ret;
 
 out_unlock:
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 	return ret;
 }
 
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 1bee176fd850e..d8b6b4648eaff 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -151,16 +151,17 @@ void nvme_mpath_end_request(struct request *rq)
 void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	int srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list) {
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
 		if (!ns->head->disk)
 			continue;
 		kblockd_schedule_work(&ns->head->requeue_work);
 		if (nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE)
 			disk_uevent(ns->head->disk, KOBJ_CHANGE);
 	}
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 }
 
 static const char *nvme_ana_state_names[] = {
@@ -194,13 +195,14 @@ bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
 void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	int srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list) {
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
 		nvme_mpath_clear_current_path(ns);
 		kblockd_schedule_work(&ns->head->requeue_work);
 	}
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 }
 
 void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
@@ -681,6 +683,7 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
 	u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
 	unsigned *nr_change_groups = data;
 	struct nvme_ns *ns;
+	int srcu_idx;
 
 	dev_dbg(ctrl->device, "ANA group %d: %s.\n",
 			le32_to_cpu(desc->grpid),
@@ -692,8 +695,8 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
 	if (!nr_nsids)
 		return 0;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list) {
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
 		unsigned nsid;
 again:
 		nsid = le32_to_cpu(desc->nsids[n]);
@@ -706,7 +709,7 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
 		if (ns->head->ns_id > nsid)
 			goto again;
 	}
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 	return 0;
 }
 
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index fc31bd340a63a..a005941a8b67e 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -282,7 +282,8 @@ struct nvme_ctrl {
 	struct blk_mq_tag_set *tagset;
 	struct blk_mq_tag_set *admin_tagset;
 	struct list_head namespaces;
-	struct rw_semaphore namespaces_rwsem;
+	struct mutex namespaces_lock;
+	struct srcu_struct srcu;
 	struct device ctrl_device;
 	struct device *device;	/* char device */
 #ifdef CONFIG_NVME_HWMON
-- 
2.43.0



^ permalink raw reply related	[relevance 3%]

* Re: [PATCH] nvme: use srcu for namespace list reading
  2024-05-23 15:51  3% [PATCH] nvme: use srcu for namespace list reading Keith Busch
@ 2024-05-23 16:18  0% ` Jens Axboe
  0 siblings, 0 replies; 200+ results
From: Jens Axboe @ 2024-05-23 16:18 UTC (permalink / raw)
  To: Keith Busch, linux-nvme, shinichiro.kawasaki, hch, sagi; +Cc: Keith Busch

On 5/23/24 9:51 AM, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> Iterating the namespace lists without taking a lock fixes potential
> circular lockdep complaints.

Change itself looks fine to me from a quick glance, however this commit
message is lacking (quite a bit). Please include a link and some more
detail on this, and why the SRCU approach will fix it.

-- 
Jens Axboe




^ permalink raw reply	[relevance 0%]

* Re: [PATCH v5] nvme: multipath: Implemented new iopolicy "queue-depth"
  @ 2024-05-23 15:56  5%       ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-05-23 15:56 UTC (permalink / raw)
  To: John Meneghini
  Cc: Christoph Hellwig, sagi, emilne, linux-nvme, linux-kernel, jrani,
	randyj, hare

On Thu, May 23, 2024 at 09:12:21AM -0400, John Meneghini wrote:
> On 5/23/24 02:45, Christoph Hellwig wrote:> On Wed, May 22, 2024 at 11:32:02AM -0600, Keith Busch wrote:
> > > Christoph, Sagi, 6.10 merge window is still open and this has been
> > > iterating long before that. Any objection?
> > 
> > No, it's not.  The window for development closes with the release for
> > 6.9.  New features must be in before that.
> 
> So what's the next window for new features?  6.11?

The nvme 6.11 branch will be created after block creates a block-6.11
branch, which usually happens after a few -rc's are released in the
current cycle. So at least a few more weeks.


^ permalink raw reply	[relevance 5%]

* [PATCH] nvme: use srcu for namespace list reading
@ 2024-05-23 15:51  3% Keith Busch
  2024-05-23 16:18  0% ` Jens Axboe
  0 siblings, 1 reply; 200+ results
From: Keith Busch @ 2024-05-23 15:51 UTC (permalink / raw)
  To: linux-nvme, shinichiro.kawasaki, hch, sagi; +Cc: Keith Busch

From: Keith Busch <kbusch@kernel.org>

Iterating the namespace lists without taking a lock fixes potential
circular lockdep complaints.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/core.c      | 97 +++++++++++++++++++++--------------
 drivers/nvme/host/ioctl.c     | 12 ++---
 drivers/nvme/host/multipath.c | 21 ++++----
 drivers/nvme/host/nvme.h      |  3 +-
 4 files changed, 78 insertions(+), 55 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 7706df2373494..b7cd46f3cef69 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3684,9 +3684,10 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
 struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 {
 	struct nvme_ns *ns, *ret = NULL;
+	int srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list) {
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
 		if (ns->head->ns_id == nsid) {
 			if (!nvme_get_ns(ns))
 				continue;
@@ -3696,7 +3697,7 @@ struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 		if (ns->head->ns_id > nsid)
 			break;
 	}
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 	return ret;
 }
 EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU);
@@ -3710,7 +3711,7 @@ static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns)
 
 	list_for_each_entry_reverse(tmp, &ns->ctrl->namespaces, list) {
 		if (tmp->head->ns_id < ns->head->ns_id) {
-			list_add(&ns->list, &tmp->list);
+			list_add_rcu(&ns->list, &tmp->list);
 			return;
 		}
 	}
@@ -3776,17 +3777,18 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
 	if (nvme_update_ns_info(ns, info))
 		goto out_unlink_ns;
 
-	down_write(&ctrl->namespaces_rwsem);
+	mutex_lock(&ctrl->namespaces_lock);
 	/*
 	 * Ensure that no namespaces are added to the ctrl list after the queues
 	 * are frozen, thereby avoiding a deadlock between scan and reset.
 	 */
 	if (test_bit(NVME_CTRL_FROZEN, &ctrl->flags)) {
-		up_write(&ctrl->namespaces_rwsem);
+		mutex_unlock(&ctrl->namespaces_lock);
 		goto out_unlink_ns;
 	}
 	nvme_ns_add_to_ctrl_list(ns);
-	up_write(&ctrl->namespaces_rwsem);
+	mutex_unlock(&ctrl->namespaces_lock);
+	synchronize_srcu(&ctrl->srcu);
 	nvme_get_ctrl(ctrl);
 
 	if (device_add_disk(ctrl->device, ns->disk, nvme_ns_attr_groups))
@@ -3809,9 +3811,10 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
 
  out_cleanup_ns_from_list:
 	nvme_put_ctrl(ctrl);
-	down_write(&ctrl->namespaces_rwsem);
-	list_del_init(&ns->list);
-	up_write(&ctrl->namespaces_rwsem);
+	mutex_lock(&ctrl->namespaces_lock);
+	list_del_rcu(&ns->list);
+	mutex_unlock(&ctrl->namespaces_lock);
+	synchronize_srcu(&ctrl->srcu);
  out_unlink_ns:
 	mutex_lock(&ctrl->subsys->lock);
 	list_del_rcu(&ns->siblings);
@@ -3861,9 +3864,10 @@ static void nvme_ns_remove(struct nvme_ns *ns)
 		nvme_cdev_del(&ns->cdev, &ns->cdev_device);
 	del_gendisk(ns->disk);
 
-	down_write(&ns->ctrl->namespaces_rwsem);
-	list_del_init(&ns->list);
-	up_write(&ns->ctrl->namespaces_rwsem);
+	mutex_lock(&ns->ctrl->namespaces_lock);
+	list_del_rcu(&ns->list);
+	mutex_unlock(&ns->ctrl->namespaces_lock);
+	synchronize_srcu(&ns->ctrl->srcu);
 
 	if (last_path)
 		nvme_mpath_shutdown_disk(ns->head);
@@ -3953,16 +3957,17 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
 	struct nvme_ns *ns, *next;
 	LIST_HEAD(rm_list);
 
-	down_write(&ctrl->namespaces_rwsem);
+	mutex_lock(&ctrl->namespaces_lock);
 	list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
 		if (ns->head->ns_id > nsid)
-			list_move_tail(&ns->list, &rm_list);
+			list_splice_init_rcu(&ns->list, &rm_list,
+					     synchronize_rcu);
 	}
-	up_write(&ctrl->namespaces_rwsem);
+	mutex_unlock(&ctrl->namespaces_lock);
+	synchronize_srcu(&ctrl->srcu);
 
 	list_for_each_entry_safe(ns, next, &rm_list, list)
 		nvme_ns_remove(ns);
-
 }
 
 static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
@@ -4132,9 +4137,10 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
 	/* this is a no-op when called from the controller reset handler */
 	nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);
 
-	down_write(&ctrl->namespaces_rwsem);
-	list_splice_init(&ctrl->namespaces, &ns_list);
-	up_write(&ctrl->namespaces_rwsem);
+	mutex_lock(&ctrl->namespaces_lock);
+	list_splice_init_rcu(&ctrl->namespaces, &ns_list, synchronize_rcu);
+	mutex_unlock(&ctrl->namespaces_lock);
+	synchronize_srcu(&ctrl->srcu);
 
 	list_for_each_entry_safe(ns, next, &ns_list, list)
 		nvme_ns_remove(ns);
@@ -4582,6 +4588,7 @@ static void nvme_free_ctrl(struct device *dev)
 	key_put(ctrl->tls_key);
 	nvme_free_cels(ctrl);
 	nvme_mpath_uninit(ctrl);
+	cleanup_srcu_struct(&ctrl->srcu);
 	nvme_auth_stop(ctrl);
 	nvme_auth_free(ctrl);
 	__free_page(ctrl->discard_page);
@@ -4614,10 +4621,15 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 	ctrl->passthru_err_log_enabled = false;
 	clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
 	spin_lock_init(&ctrl->lock);
+	mutex_init(&ctrl->namespaces_lock);
+
+	ret = init_srcu_struct(&ctrl->srcu);
+	if (ret)
+		return ret;
+
 	mutex_init(&ctrl->scan_lock);
 	INIT_LIST_HEAD(&ctrl->namespaces);
 	xa_init(&ctrl->cels);
-	init_rwsem(&ctrl->namespaces_rwsem);
 	ctrl->dev = dev;
 	ctrl->ops = ops;
 	ctrl->quirks = quirks;
@@ -4697,6 +4709,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 out:
 	if (ctrl->discard_page)
 		__free_page(ctrl->discard_page);
+	cleanup_srcu_struct(&ctrl->srcu);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(nvme_init_ctrl);
@@ -4705,22 +4718,24 @@ EXPORT_SYMBOL_GPL(nvme_init_ctrl);
 void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	int srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
 		blk_mark_disk_dead(ns->disk);
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 }
 EXPORT_SYMBOL_GPL(nvme_mark_namespaces_dead);
 
 void nvme_unfreeze(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	int srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
 		blk_mq_unfreeze_queue(ns->queue);
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 	clear_bit(NVME_CTRL_FROZEN, &ctrl->flags);
 }
 EXPORT_SYMBOL_GPL(nvme_unfreeze);
@@ -4728,14 +4743,15 @@ EXPORT_SYMBOL_GPL(nvme_unfreeze);
 int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
 {
 	struct nvme_ns *ns;
+	int srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list) {
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
 		timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout);
 		if (timeout <= 0)
 			break;
 	}
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 	return timeout;
 }
 EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
@@ -4743,23 +4759,25 @@ EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
 void nvme_wait_freeze(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	int srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
 		blk_mq_freeze_queue_wait(ns->queue);
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 }
 EXPORT_SYMBOL_GPL(nvme_wait_freeze);
 
 void nvme_start_freeze(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	int srcu_idx;
 
 	set_bit(NVME_CTRL_FROZEN, &ctrl->flags);
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
 		blk_freeze_queue_start(ns->queue);
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 }
 EXPORT_SYMBOL_GPL(nvme_start_freeze);
 
@@ -4802,11 +4820,12 @@ EXPORT_SYMBOL_GPL(nvme_unquiesce_admin_queue);
 void nvme_sync_io_queues(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	int srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
 		blk_sync_queue(ns->queue);
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 }
 EXPORT_SYMBOL_GPL(nvme_sync_io_queues);
 
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index 499a8bb7cac7d..0f05058692b55 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -789,16 +789,16 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp,
 		bool open_for_write)
 {
 	struct nvme_ns *ns;
-	int ret;
+	int ret, srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
 	if (list_empty(&ctrl->namespaces)) {
 		ret = -ENOTTY;
 		goto out_unlock;
 	}
 
-	ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list);
-	if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) {
+	ns = list_first_or_null_rcu(&ctrl->namespaces, struct nvme_ns, list);
+	if (!ns) {
 		dev_warn(ctrl->device,
 			"NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n");
 		ret = -EINVAL;
@@ -808,14 +808,14 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp,
 	dev_warn(ctrl->device,
 		"using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n");
 	kref_get(&ns->kref);
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 
 	ret = nvme_user_cmd(ctrl, ns, argp, 0, open_for_write);
 	nvme_put_ns(ns);
 	return ret;
 
 out_unlock:
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 	return ret;
 }
 
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 1bee176fd850e..d8b6b4648eaff 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -151,16 +151,17 @@ void nvme_mpath_end_request(struct request *rq)
 void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	int srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list) {
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
 		if (!ns->head->disk)
 			continue;
 		kblockd_schedule_work(&ns->head->requeue_work);
 		if (nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE)
 			disk_uevent(ns->head->disk, KOBJ_CHANGE);
 	}
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 }
 
 static const char *nvme_ana_state_names[] = {
@@ -194,13 +195,14 @@ bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
 void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	int srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list) {
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
 		nvme_mpath_clear_current_path(ns);
 		kblockd_schedule_work(&ns->head->requeue_work);
 	}
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 }
 
 void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
@@ -681,6 +683,7 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
 	u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
 	unsigned *nr_change_groups = data;
 	struct nvme_ns *ns;
+	int srcu_idx;
 
 	dev_dbg(ctrl->device, "ANA group %d: %s.\n",
 			le32_to_cpu(desc->grpid),
@@ -692,8 +695,8 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
 	if (!nr_nsids)
 		return 0;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list) {
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
 		unsigned nsid;
 again:
 		nsid = le32_to_cpu(desc->nsids[n]);
@@ -706,7 +709,7 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
 		if (ns->head->ns_id > nsid)
 			goto again;
 	}
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 	return 0;
 }
 
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index fc31bd340a63a..a005941a8b67e 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -282,7 +282,8 @@ struct nvme_ctrl {
 	struct blk_mq_tag_set *tagset;
 	struct blk_mq_tag_set *admin_tagset;
 	struct list_head namespaces;
-	struct rw_semaphore namespaces_rwsem;
+	struct mutex namespaces_lock;
+	struct srcu_struct srcu;
 	struct device ctrl_device;
 	struct device *device;	/* char device */
 #ifdef CONFIG_NVME_HWMON
-- 
2.43.0



^ permalink raw reply related	[relevance 3%]

* Re: [PATCH] nvme-pci: silence a lockdep complaint
  @ 2024-05-23 15:02  4%                       ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-05-23 15:02 UTC (permalink / raw)
  To: Sagi Grimberg; +Cc: Christoph Hellwig, Shinichiro Kawasaki, linux-nvme

On Thu, May 23, 2024 at 04:45:37PM +0300, Sagi Grimberg wrote:
> 
> 
> On 23/05/2024 16:19, Christoph Hellwig wrote:
> > On Thu, May 23, 2024 at 04:02:43PM +0300, Sagi Grimberg wrote:
> > > I just want to have good testing before we touch these areas that
> > > are all done in ctrl reset and error recovery which historically manifest
> > > most of the issues...
> > > 
> > > and all this because lockdep complains on false-positives...
> > I still haven't seen any proof that it is a false-positive.
> > 
> 
> The explanation is that the nvme-pci timeout handler will only disable the
> device when either
> it is able to transition the ctrl state to RESETTING or if the ctrl state is
> terminal, and in both
> cases the reset_work is not running concurrently...
> 
> It was also proposed moving the call to nvme_disable_ctrl to a different
> context (returning BLK_RESET_TIMER from
> the timeout handler) such that blk_sync_queue() does not depend on it. But
> Keith said it won't solve the issue.
> 
> Keith spent more time thinking about this, so I'll defer to him. I just want
> it to stop bothering Shinichiro and
> others running blktests regularly...

Lockdep is complaining about the namespaces_rwsem taken from both
timeout_work and reset_work.

The only time namespaces_rwsem is taken from timeout work is if
timeout_work successfully transitions controller state to RESETTING. But
the controller is already in the RESETTING state because reset_work
wouldn't be running here if we weren't. So timeout_work never locks the
rwsem if reset_work is already attempting to "cancel" that work.

Nevermind the fact that both are using the read lock, so it's not even a
problem if they both attempt to lock it. The only potential problem
could be if someone requests the write lock inbetween. At nearly the
same time, pciehp has to transition the state to DEAD in order for the
special "terminal" condition to apply for the timeout_work to attempt
some kind of recovery, but the removal doesn't take the write lock until
after the reset_work is flushed, so how would the write lock sneak in?

A potential interaction with scan_work while the timeout, reset, and
hotplug run concurrently might be the only part I'm not 100% sure about,
but I haven't found any concerning path so far.


^ permalink raw reply	[relevance 4%]

* Re: [PATCH] nvme: fix multipath batched completion accounting
  2024-05-21 17:05  5% [PATCH] nvme: fix multipath batched completion accounting Keith Busch
  2024-05-22  0:55  0% ` Chaitanya Kulkarni
@ 2024-05-23  9:59  0% ` Hannes Reinecke
  1 sibling, 0 replies; 200+ results
From: Hannes Reinecke @ 2024-05-23  9:59 UTC (permalink / raw)
  To: Keith Busch, hch, sagi, linux-nvme; +Cc: axboe, Keith Busch

On 5/21/24 19:05, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> Batched completions were missing the io stats accounting and bio trace
> events. Move the common code to a helper and call it from the batched
> and non-batched functions.
> 
> Fixes: d4d957b53d91ee ("nvme-multipath: support io stats on the mpath device")
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
>   drivers/nvme/host/core.c | 15 ++++++++++-----
>   1 file changed, 10 insertions(+), 5 deletions(-)
> 
Reviewed-by: Hannes Reinecke <hare@suse.de>

Cheers,

Hannes




^ permalink raw reply	[relevance 0%]

* Re: [PATCH 0/1] nvme-pci: add quirks for Lexar NM790
  2024-05-22 17:42  5% ` Keith Busch
@ 2024-05-23  9:52  4%   ` Jason Nader
  0 siblings, 0 replies; 200+ results
From: Jason Nader @ 2024-05-23  9:52 UTC (permalink / raw)
  To: Keith Busch; +Cc: axboe, hch, sagi, linux-nvme

On Thu, 23 May 2024 at 02:42, Keith Busch <kbusch@kernel.org> wrote:
>
> On Thu, May 23, 2024 at 12:56:13AM +0900, Jason Nader wrote:
> namespace descriptors.
>
>   nvme id-ctrl /dev/nvme1 | grep subnqn

Output below:
```
subnqn    :
```


^ permalink raw reply	[relevance 4%]

* Re: [PATCH] nvme-multipath: fix io accounting on failover
  2024-05-22 14:18  5%   ` Keith Busch
@ 2024-05-23  7:00  0%     ` Nilay Shroff
  0 siblings, 0 replies; 200+ results
From: Nilay Shroff @ 2024-05-23  7:00 UTC (permalink / raw)
  To: Keith Busch; +Cc: Keith Busch, hch, sagi, linux-nvme, jmeneghi



On 5/22/24 19:48, Keith Busch wrote:
> On Wed, May 22, 2024 at 06:32:11PM +0530, Nilay Shroff wrote:
>>
>>
>> On 5/21/24 23:37, Keith Busch wrote:
>>> From: Keith Busch <kbusch@kernel.org>
>>>
>>> There are io stats accounting that needs to be handled, so don't call
>>> blk_mq_end_request() directly. Use the existing nvme_end_req() helper
>>> that already handles everything.
>>>
>> The changes look good however I have a question about why do we retry an IO
>> when that IO is cancelled? For instance, when a multipath IO request is cancelled 
>> (from nvme_cancel_request()) we re-queue the bio in nvme_failover_req().
>> Similarly, for non-multipath request, we do retry request in nvme_retry_req()
>> until retries for a request are maxed out by nvme_max_retries. So wouldn't it be 
>> appropriate to drop the cancelled request instead of retrying? 
>>
>> However, I do understand retrying a request on a different path when we got the 
>> request completion status specifying the path related error.
> 
> A cancelled request just means the host thinks the target failed to
> produce a response. It doesn't mean the host stopped caring about the
> command; the host still wants it to succeed, but determined corrective
> action is needed to reclaim and resubmit the command.
> 
Thank Keith, got it!

--Nilay


^ permalink raw reply	[relevance 0%]

* Re: [PATCH] nvme-pci: silence a lockdep complaint
  @ 2024-05-22 21:36  6%         ` Keith Busch
    0 siblings, 1 reply; 200+ results
From: Keith Busch @ 2024-05-22 21:36 UTC (permalink / raw)
  To: Sagi Grimberg; +Cc: Christoph Hellwig, Shinichiro Kawasaki, linux-nvme

On Wed, May 22, 2024 at 09:00:34PM +0300, Sagi Grimberg wrote:
> I don't understand lockdep well enough to do that. This warning
> exists for 6 months now. The conclusion that Keith came to (and I agreed)
> is that it can't happen, because nvme_timeout handler will not disable the
> device while the ctrl reset_work is running, the ctrl state machine will
> prevent that from happening.
> 
> There was an attempt to convert namespaces_rwsem a srcu, but my comment to
> that was that this is an intrusive change for a complaint that we think is a
> false-positive.

I'll play devil's advocate and suggest there might be some whacky
timeout_work, scan_work, reset_work, and pciehp irq that could result in
a read, write, wait, read deadlock. I don't think it can happen, but
it's harder to prove that.

The below patch is what I'm messing with. I have a CMIC PCI NVMe and
nvme native multipathing enabled, and that can recreate the lockdep
splat 100% of the time. It goes away with this patch. Non-multipath
devices are harder to hit the lockdep splat for a different reason.

From 00dd7a573d3dd14be748b46e040bb5eb0f5687b1 Mon Sep 17 00:00:00 2001
From: Keith Busch <kbusch@kernel.org>
Date: Tue, 21 May 2024 06:41:45 -0700
Subject: [PATCH] nvme: switch rwsem to srcu

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/core.c      | 94 +++++++++++++++++++++--------------
 drivers/nvme/host/ioctl.c     | 12 ++---
 drivers/nvme/host/multipath.c | 21 ++++----
 drivers/nvme/host/nvme.h      |  3 +-
 4 files changed, 76 insertions(+), 54 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 7706df2373494..480af74f1606e 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3684,9 +3684,10 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
 struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 {
 	struct nvme_ns *ns, *ret = NULL;
+	int srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list) {
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
 		if (ns->head->ns_id == nsid) {
 			if (!nvme_get_ns(ns))
 				continue;
@@ -3696,7 +3697,7 @@ struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 		if (ns->head->ns_id > nsid)
 			break;
 	}
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 	return ret;
 }
 EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU);
@@ -3710,7 +3711,7 @@ static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns)
 
 	list_for_each_entry_reverse(tmp, &ns->ctrl->namespaces, list) {
 		if (tmp->head->ns_id < ns->head->ns_id) {
-			list_add(&ns->list, &tmp->list);
+			list_add_rcu(&ns->list, &tmp->list);
 			return;
 		}
 	}
@@ -3776,17 +3777,17 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
 	if (nvme_update_ns_info(ns, info))
 		goto out_unlink_ns;
 
-	down_write(&ctrl->namespaces_rwsem);
+	mutex_lock(&ctrl->namespaces_lock);
 	/*
 	 * Ensure that no namespaces are added to the ctrl list after the queues
 	 * are frozen, thereby avoiding a deadlock between scan and reset.
 	 */
 	if (test_bit(NVME_CTRL_FROZEN, &ctrl->flags)) {
-		up_write(&ctrl->namespaces_rwsem);
+		mutex_unlock(&ctrl->namespaces_lock);
 		goto out_unlink_ns;
 	}
 	nvme_ns_add_to_ctrl_list(ns);
-	up_write(&ctrl->namespaces_rwsem);
+	mutex_unlock(&ctrl->namespaces_lock);
 	nvme_get_ctrl(ctrl);
 
 	if (device_add_disk(ctrl->device, ns->disk, nvme_ns_attr_groups))
@@ -3809,9 +3810,9 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
 
  out_cleanup_ns_from_list:
 	nvme_put_ctrl(ctrl);
-	down_write(&ctrl->namespaces_rwsem);
-	list_del_init(&ns->list);
-	up_write(&ctrl->namespaces_rwsem);
+	mutex_lock(&ctrl->namespaces_lock);
+	list_del_rcu(&ns->list);
+	mutex_unlock(&ctrl->namespaces_lock);
  out_unlink_ns:
 	mutex_lock(&ctrl->subsys->lock);
 	list_del_rcu(&ns->siblings);
@@ -3861,10 +3862,11 @@ static void nvme_ns_remove(struct nvme_ns *ns)
 		nvme_cdev_del(&ns->cdev, &ns->cdev_device);
 	del_gendisk(ns->disk);
 
-	down_write(&ns->ctrl->namespaces_rwsem);
-	list_del_init(&ns->list);
-	up_write(&ns->ctrl->namespaces_rwsem);
+	mutex_lock(&ns->ctrl->namespaces_lock);
+	list_del_rcu(&ns->list);
+	mutex_unlock(&ns->ctrl->namespaces_lock);
 
+	synchronize_srcu(&ns->ctrl->srcu);
 	if (last_path)
 		nvme_mpath_shutdown_disk(ns->head);
 	nvme_put_ns(ns);
@@ -3953,16 +3955,17 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
 	struct nvme_ns *ns, *next;
 	LIST_HEAD(rm_list);
 
-	down_write(&ctrl->namespaces_rwsem);
+	mutex_lock(&ctrl->namespaces_lock);
 	list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
 		if (ns->head->ns_id > nsid)
-			list_move_tail(&ns->list, &rm_list);
+			list_splice_init_rcu(&ns->list, &rm_list, synchronize_rcu);
 	}
-	up_write(&ctrl->namespaces_rwsem);
+	mutex_unlock(&ctrl->namespaces_lock);
 
 	list_for_each_entry_safe(ns, next, &rm_list, list)
 		nvme_ns_remove(ns);
 
+	synchronize_srcu(&ctrl->srcu);
 }
 
 static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
@@ -4132,12 +4135,14 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
 	/* this is a no-op when called from the controller reset handler */
 	nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);
 
-	down_write(&ctrl->namespaces_rwsem);
-	list_splice_init(&ctrl->namespaces, &ns_list);
-	up_write(&ctrl->namespaces_rwsem);
+	mutex_lock(&ctrl->namespaces_lock);
+	list_splice_init_rcu(&ctrl->namespaces, &ns_list, synchronize_rcu);
+	mutex_unlock(&ctrl->namespaces_lock);
 
 	list_for_each_entry_safe(ns, next, &ns_list, list)
 		nvme_ns_remove(ns);
+
+	synchronize_srcu(&ctrl->srcu);
 }
 EXPORT_SYMBOL_GPL(nvme_remove_namespaces);
 
@@ -4582,6 +4587,7 @@ static void nvme_free_ctrl(struct device *dev)
 	key_put(ctrl->tls_key);
 	nvme_free_cels(ctrl);
 	nvme_mpath_uninit(ctrl);
+	cleanup_srcu_struct(&ctrl->srcu);
 	nvme_auth_stop(ctrl);
 	nvme_auth_free(ctrl);
 	__free_page(ctrl->discard_page);
@@ -4614,10 +4620,15 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 	ctrl->passthru_err_log_enabled = false;
 	clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
 	spin_lock_init(&ctrl->lock);
+	mutex_init(&ctrl->namespaces_lock);
+
+	ret = init_srcu_struct(&ctrl->srcu);
+	if (ret)
+		return ret;
+
 	mutex_init(&ctrl->scan_lock);
 	INIT_LIST_HEAD(&ctrl->namespaces);
 	xa_init(&ctrl->cels);
-	init_rwsem(&ctrl->namespaces_rwsem);
 	ctrl->dev = dev;
 	ctrl->ops = ops;
 	ctrl->quirks = quirks;
@@ -4697,6 +4708,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 out:
 	if (ctrl->discard_page)
 		__free_page(ctrl->discard_page);
+	cleanup_srcu_struct(&ctrl->srcu);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(nvme_init_ctrl);
@@ -4705,22 +4717,24 @@ EXPORT_SYMBOL_GPL(nvme_init_ctrl);
 void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	int srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
 		blk_mark_disk_dead(ns->disk);
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 }
 EXPORT_SYMBOL_GPL(nvme_mark_namespaces_dead);
 
 void nvme_unfreeze(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	int srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
 		blk_mq_unfreeze_queue(ns->queue);
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 	clear_bit(NVME_CTRL_FROZEN, &ctrl->flags);
 }
 EXPORT_SYMBOL_GPL(nvme_unfreeze);
@@ -4728,14 +4742,15 @@ EXPORT_SYMBOL_GPL(nvme_unfreeze);
 int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
 {
 	struct nvme_ns *ns;
+	int srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list) {
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
 		timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout);
 		if (timeout <= 0)
 			break;
 	}
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 	return timeout;
 }
 EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
@@ -4743,23 +4758,25 @@ EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
 void nvme_wait_freeze(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	int srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
 		blk_mq_freeze_queue_wait(ns->queue);
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 }
 EXPORT_SYMBOL_GPL(nvme_wait_freeze);
 
 void nvme_start_freeze(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	int srcu_idx;
 
 	set_bit(NVME_CTRL_FROZEN, &ctrl->flags);
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
 		blk_freeze_queue_start(ns->queue);
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 }
 EXPORT_SYMBOL_GPL(nvme_start_freeze);
 
@@ -4802,11 +4819,12 @@ EXPORT_SYMBOL_GPL(nvme_unquiesce_admin_queue);
 void nvme_sync_io_queues(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	int srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
 		blk_sync_queue(ns->queue);
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 }
 EXPORT_SYMBOL_GPL(nvme_sync_io_queues);
 
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index 499a8bb7cac7d..0f05058692b55 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -789,16 +789,16 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp,
 		bool open_for_write)
 {
 	struct nvme_ns *ns;
-	int ret;
+	int ret, srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
 	if (list_empty(&ctrl->namespaces)) {
 		ret = -ENOTTY;
 		goto out_unlock;
 	}
 
-	ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list);
-	if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) {
+	ns = list_first_or_null_rcu(&ctrl->namespaces, struct nvme_ns, list);
+	if (!ns) {
 		dev_warn(ctrl->device,
 			"NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n");
 		ret = -EINVAL;
@@ -808,14 +808,14 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp,
 	dev_warn(ctrl->device,
 		"using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n");
 	kref_get(&ns->kref);
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 
 	ret = nvme_user_cmd(ctrl, ns, argp, 0, open_for_write);
 	nvme_put_ns(ns);
 	return ret;
 
 out_unlock:
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 	return ret;
 }
 
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 1bee176fd850e..d8b6b4648eaff 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -151,16 +151,17 @@ void nvme_mpath_end_request(struct request *rq)
 void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	int srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list) {
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
 		if (!ns->head->disk)
 			continue;
 		kblockd_schedule_work(&ns->head->requeue_work);
 		if (nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE)
 			disk_uevent(ns->head->disk, KOBJ_CHANGE);
 	}
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 }
 
 static const char *nvme_ana_state_names[] = {
@@ -194,13 +195,14 @@ bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
 void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	int srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list) {
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
 		nvme_mpath_clear_current_path(ns);
 		kblockd_schedule_work(&ns->head->requeue_work);
 	}
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 }
 
 void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
@@ -681,6 +683,7 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
 	u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
 	unsigned *nr_change_groups = data;
 	struct nvme_ns *ns;
+	int srcu_idx;
 
 	dev_dbg(ctrl->device, "ANA group %d: %s.\n",
 			le32_to_cpu(desc->grpid),
@@ -692,8 +695,8 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
 	if (!nr_nsids)
 		return 0;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list) {
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
 		unsigned nsid;
 again:
 		nsid = le32_to_cpu(desc->nsids[n]);
@@ -706,7 +709,7 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
 		if (ns->head->ns_id > nsid)
 			goto again;
 	}
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 	return 0;
 }
 
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index fc31bd340a63a..a005941a8b67e 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -282,7 +282,8 @@ struct nvme_ctrl {
 	struct blk_mq_tag_set *tagset;
 	struct blk_mq_tag_set *admin_tagset;
 	struct list_head namespaces;
-	struct rw_semaphore namespaces_rwsem;
+	struct mutex namespaces_lock;
+	struct srcu_struct srcu;
 	struct device ctrl_device;
 	struct device *device;	/* char device */
 #ifdef CONFIG_NVME_HWMON
-- 
2.43.0



^ permalink raw reply related	[relevance 6%]

* Re: [PATCH 0/1] nvme-pci: add quirks for Lexar NM790
  @ 2024-05-22 17:42  5% ` Keith Busch
  2024-05-23  9:52  4%   ` Jason Nader
  0 siblings, 1 reply; 200+ results
From: Keith Busch @ 2024-05-22 17:42 UTC (permalink / raw)
  To: Jason Nader; +Cc: axboe, hch, sagi, linux-nvme

On Thu, May 23, 2024 at 12:56:13AM +0900, Jason Nader wrote:
> Hi,
> 
> This adds Lexar NM790 to the quirks list.
> Other drives in the model line-up already exist in the list.
> 
> Kernel logs before:
> 
> nvme nvme1: pci function 0000:03:00.0
> nvme nvme1: missing or invalid SUBNQN field.
> nvme nvme1: allocated 32 MiB host memory buffer.
> nvme nvme1: 8/0/0 default/read/poll queues
> block nvme1n1: No UUID available providing old NGUID
> 
> Kernel logs after:
> 
> nvme nvme1: pci function 0000:03:00.0
> nvme nvme1: allocated 32 MiB host memory buffer.
> nvme nvme1: 8/0/0 default/read/poll queues
> nvme nvme1: Ignoring bogus Namespace Identifiers
> 
> Other info:
> 
> >sudo nvme ns-descs /dev/nvme1n1    
> NVME Namespace Identification Descriptors NS 1:
> eui64   : 6479a74b40200c00
> nguid   : 00000000000000000000000000000000

This quirk is actually for the nvme subsystem. It has nothing to do with
namespace descriptors.

  nvme id-ctrl /dev/nvme1 | grep subnqn

I'm not even sure why we have this quirk; it just suppresses a harmless
print message; everything works the same with or without the quirk.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH v5] nvme: multipath: Implemented new iopolicy "queue-depth"
  @ 2024-05-22 17:32  5% ` Keith Busch
    0 siblings, 1 reply; 200+ results
From: Keith Busch @ 2024-05-22 17:32 UTC (permalink / raw)
  To: John Meneghini
  Cc: hch, sagi, emilne, linux-nvme, linux-kernel, jrani, randyj, hare

On Wed, May 22, 2024 at 12:54:06PM -0400, John Meneghini wrote:
> From: "Ewan D. Milne" <emilne@redhat.com>
> 
> The round-robin path selector is inefficient in cases where there is a
> difference in latency between paths.  In the presence of one or more
> high latency paths the round-robin selector continues to use the high
> latency path equally. This results in a bias towards the highest latency
> path and can cause a significant decrease in overall performance as IOs
> pile on the highest latency path. This problem is acute with NVMe-oF
> controllers.
> 
> The queue-depth policy instead sends I/O requests down the path with the
> least amount of requests in its request queue. Paths with lower latency
> will clear requests more quickly and have less requests in their queues
> compared to higher latency paths. The goal of this path selector is to
> make more use of lower latency paths which will bring down overall IO
> latency and increase throughput and performance.

I'm okay with this as-is, though I don't think you need either
atomic_set() calls.

Christoph, Sagi, 6.10 merge window is still open and this has been
iterating long before that. Any objection?


^ permalink raw reply	[relevance 5%]

* Re: [PATCH v4 1/1] nvme: multipath: Implemented new iopolicy "queue-depth"
  @ 2024-05-22 16:29  5%       ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-05-22 16:29 UTC (permalink / raw)
  To: John Meneghini
  Cc: hch, sagi, emilne, linux-nvme, linux-kernel, jrani, randyj, hare

On Wed, May 22, 2024 at 12:23:51PM -0400, John Meneghini wrote:
> On 5/22/24 11:56, Keith Busch wrote:
> > On Wed, May 22, 2024 at 11:42:12AM -0400, John Meneghini wrote:
> > > +static void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys, int iopolicy)
> > > +{
> > > +	struct nvme_ctrl *ctrl;
> > > +	int old_iopolicy = READ_ONCE(subsys->iopolicy);
> > > +
> > > +	WRITE_ONCE(subsys->iopolicy, iopolicy);
> > > +
> > > +	/* iopolicy changes reset the counters and clear the mpath by design */
> > > +	mutex_lock(&nvme_subsystems_lock);
> > > +	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
> > > +		atomic_set(&ctrl->nr_active, 0);
> > 
> > Can you help me understand why this is a desirable feature? Unless you
> > quiesce everything at some point, you'll always have more unaccounted
> > requests on whichever path has higher latency. That sounds like it
> > defeats the goals of this io policy.
> 
> This is true. And as a matter of practice I never change the IO policy when IOs are in flight.  I always stop the IO first.
> But we can't stop any user from changing the IO policy again and again.  So I'm not sure what to do.
> 
> If you'd like I add the 'if (old_iopolicy == iopolicy) return;' here, but
> that's not going to solve the problem of inaccurate counters when users
> start flipping io policies around. with IO inflight. There is no
> synchronization between io submission across controllers and changing the
> policy so I expect changing between round-robin and queue-depth with IO
> inflight suffers from the same problem... though not as badly.
> 
> I'd rather take this patch now and figure out how to fix the problem with
> another patch in the future.  Maybe we can check the io stats and refuse to
> change the policy if they are not zero....

The idea of tagging the nvme_req()->flags on submission means the
completion handling the nr_active counter is symmetric with the
submission side: you don't ever need to reset nr_active because
everything is accounted for.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH v2] nvmet: fix ns enable/disable possible hang
  @ 2024-05-22 16:24  5% ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-05-22 16:24 UTC (permalink / raw)
  To: Sagi Grimberg; +Cc: linux-nvme, Christoph Hellwig, Chaitanya Kulkarni

On Tue, May 21, 2024 at 11:20:28PM +0300, Sagi Grimberg wrote:
> When disabling an nvmet namespace, there is a period where the
> subsys->lock is released, as the ns disable waits for backend IO to
> complete, and the ns percpu ref to be properly killed. The original
> intent was to avoid taking the subsystem lock for a prolong period as
> other processes may need to acquire it (for example new incoming
> connections).
> 
> However, it opens up a window where another process may come in and
> enable the ns, (re)initializing the ns percpu_ref, causing the disable
> sequence to hang.
> 
> Solve this by taking the global nvmet_config_sem over the entire configfs
> enable/disable sequence.

Thanks, applied to nvme-6.10.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH] nvme-pci: silence a lockdep complaint
  2024-05-22 12:18  0% ` Shinichiro Kawasaki
@ 2024-05-22 16:12  5%   ` Keith Busch
    0 siblings, 1 reply; 200+ results
From: Keith Busch @ 2024-05-22 16:12 UTC (permalink / raw)
  To: Shinichiro Kawasaki; +Cc: Sagi Grimberg, linux-nvme, Christoph Hellwig

On Wed, May 22, 2024 at 12:18:57PM +0000, Shinichiro Kawasaki wrote:
> On May 22, 2024 / 12:15, Sagi Grimberg wrote:
> > lockdep complains about the timeout handler running concurrently with
> > the reset work which is syncing the IO request queues (which in turn
> > flushes the timeout work).
> > 
> > We know it cannot be the case because the ctrl state machine prevents
> > the timeout handler from disabling the ctrl when the reset work is
> > running (changing ctrl state to RESETTING will fail, and the state is not
> > terminal). If this assumption happens to break in the future, we won't
> > have lockdep to assist, but for the time being we are simply seeing
> > false-positive complaints from it...
> > 
> > Reported-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
> > Suggested-by: Keith Busch <kbusch@kernel.org>
> > Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
> 
> Sagi, thank you very much. I confirmed this patch avoids the lockdep WARN
> observed with the blktests test case nvme/050 [1]. When I repeated the test case
> around 10 times on v6.9 kernel without this patch, the lockdep WARN was
> recreated in stable manner. After applying this patch, the WARN was not
> observed even when the test case is repeated 100 times.
> 
> Tested-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>

This is not desirable, but I give up trying to appease lockdep for this
one.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH v4 1/1] nvme: multipath: Implemented new iopolicy "queue-depth"
  @ 2024-05-22 15:56  5%   ` Keith Busch
    0 siblings, 1 reply; 200+ results
From: Keith Busch @ 2024-05-22 15:56 UTC (permalink / raw)
  To: John Meneghini
  Cc: hch, sagi, emilne, linux-nvme, linux-kernel, jrani, randyj, hare

On Wed, May 22, 2024 at 11:42:12AM -0400, John Meneghini wrote:
> +static void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys, int iopolicy)
> +{
> +	struct nvme_ctrl *ctrl;
> +	int old_iopolicy = READ_ONCE(subsys->iopolicy);
> +
> +	WRITE_ONCE(subsys->iopolicy, iopolicy);
> +
> +	/* iopolicy changes reset the counters and clear the mpath by design */
> +	mutex_lock(&nvme_subsystems_lock);
> +	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
> +		atomic_set(&ctrl->nr_active, 0);

Can you help me understand why this is a desirable feature? Unless you
quiesce everything at some point, you'll always have more unaccounted
requests on whichever path has higher latency. That sounds like it
defeats the goals of this io policy.

> @@ -1061,6 +1066,9 @@ static inline bool nvme_disk_is_ns_head(struct gendisk *disk)
>  {
>  	return false;
>  }
> +static inline void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys, int iopolicy)
> +{
> +}
>  #endif /* CONFIG_NVME_MULTIPATH */

You can remove this stub function since the only caller resides in a
CONFIG_NVME_MULTIPATH file.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH] nvme-multipath: fix io accounting on failover
  2024-05-22 13:02  0% ` Nilay Shroff
@ 2024-05-22 14:18  5%   ` Keith Busch
  2024-05-23  7:00  0%     ` Nilay Shroff
  0 siblings, 1 reply; 200+ results
From: Keith Busch @ 2024-05-22 14:18 UTC (permalink / raw)
  To: Nilay Shroff; +Cc: Keith Busch, hch, sagi, linux-nvme, jmeneghi

On Wed, May 22, 2024 at 06:32:11PM +0530, Nilay Shroff wrote:
> 
> 
> On 5/21/24 23:37, Keith Busch wrote:
> > From: Keith Busch <kbusch@kernel.org>
> > 
> > There are io stats accounting that needs to be handled, so don't call
> > blk_mq_end_request() directly. Use the existing nvme_end_req() helper
> > that already handles everything.
> > 
> The changes look good however I have a question about why do we retry an IO
> when that IO is cancelled? For instance, when a multipath IO request is cancelled 
> (from nvme_cancel_request()) we re-queue the bio in nvme_failover_req().
> Similarly, for non-multipath request, we do retry request in nvme_retry_req()
> until retries for a request are maxed out by nvme_max_retries. So wouldn't it be 
> appropriate to drop the cancelled request instead of retrying? 
> 
> However, I do understand retrying a request on a different path when we got the 
> request completion status specifying the path related error.

A cancelled request just means the host thinks the target failed to
produce a response. It doesn't mean the host stopped caring about the
command; the host still wants it to succeed, but determined corrective
action is needed to reclaim and resubmit the command.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH] nvme-multipath: fix io accounting on failover
  2024-05-21 18:07  5% [PATCH] nvme-multipath: fix io accounting on failover Keith Busch
  2024-05-21 18:35  4% ` John Meneghini
@ 2024-05-22 13:02  0% ` Nilay Shroff
  2024-05-22 14:18  5%   ` Keith Busch
  1 sibling, 1 reply; 200+ results
From: Nilay Shroff @ 2024-05-22 13:02 UTC (permalink / raw)
  To: Keith Busch, hch, sagi, linux-nvme; +Cc: jmeneghi, Keith Busch



On 5/21/24 23:37, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> There are io stats accounting that needs to be handled, so don't call
> blk_mq_end_request() directly. Use the existing nvme_end_req() helper
> that already handles everything.
> 
The changes look good however I have a question about why do we retry an IO
when that IO is cancelled? For instance, when a multipath IO request is cancelled 
(from nvme_cancel_request()) we re-queue the bio in nvme_failover_req().
Similarly, for non-multipath request, we do retry request in nvme_retry_req()
until retries for a request are maxed out by nvme_max_retries. So wouldn't it be 
appropriate to drop the cancelled request instead of retrying? 

However, I do understand retrying a request on a different path when we got the 
request completion status specifying the path related error.

Thanks,
--Nilay



^ permalink raw reply	[relevance 0%]

* Re: [PATCH] nvme-pci: silence a lockdep complaint
  2024-05-22  9:15  3% [PATCH] nvme-pci: silence a lockdep complaint Sagi Grimberg
@ 2024-05-22 12:18  0% ` Shinichiro Kawasaki
  2024-05-22 16:12  5%   ` Keith Busch
  0 siblings, 1 reply; 200+ results
From: Shinichiro Kawasaki @ 2024-05-22 12:18 UTC (permalink / raw)
  To: Sagi Grimberg; +Cc: linux-nvme, Christoph Hellwig, Keith Busch

On May 22, 2024 / 12:15, Sagi Grimberg wrote:
> lockdep complains about the timeout handler running concurrently with
> the reset work which is syncing the IO request queues (which in turn
> flushes the timeout work).
> 
> We know it cannot be the case because the ctrl state machine prevents
> the timeout handler from disabling the ctrl when the reset work is
> running (changing ctrl state to RESETTING will fail, and the state is not
> terminal). If this assumption happens to break in the future, we won't
> have lockdep to assist, but for the time being we are simply seeing
> false-positive complaints from it...
> 
> Reported-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
> Suggested-by: Keith Busch <kbusch@kernel.org>
> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>

Sagi, thank you very much. I confirmed this patch avoids the lockdep WARN
observed with the blktests test case nvme/050 [1]. When I repeated the test case
around 10 times on v6.9 kernel without this patch, the lockdep WARN was
recreated in stable manner. After applying this patch, the WARN was not
observed even when the test case is repeated 100 times.

Tested-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>

[1] https://lore.kernel.org/linux-block/m6a437jvfwzq2jfytvvk62zpgu7e4bjvegr7x73pihhkp5me5c@sh6vs3s7w754/

^ permalink raw reply	[relevance 0%]

* [PATCH] nvme-pci: silence a lockdep complaint
@ 2024-05-22  9:15  3% Sagi Grimberg
  2024-05-22 12:18  0% ` Shinichiro Kawasaki
  0 siblings, 1 reply; 200+ results
From: Sagi Grimberg @ 2024-05-22  9:15 UTC (permalink / raw)
  To: linux-nvme; +Cc: Christoph Hellwig, Keith Busch, Shinichiro Kawasaki

lockdep complains about the timeout handler running concurrently with
the reset work which is syncing the IO request queues (which in turn
flushes the timeout work).

We know it cannot be the case because the ctrl state machine prevents
the timeout handler from disabling the ctrl when the reset work is
running (changing ctrl state to RESETTING will fail, and the state is not
terminal). If this assumption happens to break in the future, we won't
have lockdep to assist, but for the time being we are simply seeing
false-positive complaints from it...

Reported-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
Suggested-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
 drivers/nvme/host/pci.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 710043086dff..4a85b83b78f9 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2718,7 +2718,18 @@ static void nvme_reset_work(struct work_struct *work)
 	 */
 	if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
 		nvme_dev_disable(dev, false);
+	/*
+	 * lockdep complains about the timeout handler running concurrently
+	 * with this call. We know it cannot be the case because the ctrl state
+	 * machine prevents the timeout handler from disabling the ctrl when
+	 * the reset work is running (changing ctrl state to RESETTING will
+	 * fail, and the state is not terminal). If this assumption happens to
+	 * break in the future, we won't have lockdep to assist, but for the
+	 * time being we are simply seeing false-positive complaints from it...
+	 */
+	lockdep_off();
 	nvme_sync_queues(&dev->ctrl);
+	lockdep_on();
 
 	mutex_lock(&dev->shutdown_lock);
 	result = nvme_pci_enable(dev);
-- 
2.40.1



^ permalink raw reply related	[relevance 3%]

* Re: [PATCH] nvme: fix multipath batched completion accounting
  2024-05-21 17:05  5% [PATCH] nvme: fix multipath batched completion accounting Keith Busch
@ 2024-05-22  0:55  0% ` Chaitanya Kulkarni
  2024-05-23  9:59  0% ` Hannes Reinecke
  1 sibling, 0 replies; 200+ results
From: Chaitanya Kulkarni @ 2024-05-22  0:55 UTC (permalink / raw)
  To: Keith Busch; +Cc: axboe, hch, linux-nvme, sagi, Keith Busch

On 5/21/24 10:05, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
>
> Batched completions were missing the io stats accounting and bio trace
> events. Move the common code to a helper and call it from the batched
> and non-batched functions.
>
> Fixes: d4d957b53d91ee ("nvme-multipath: support io stats on the mpath device")
> Signed-off-by: Keith Busch <kbusch@kernel.org>
>

Looks good.

Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>

-ck



^ permalink raw reply	[relevance 0%]

* Re: [PATCH] nvme-multipath: fix io accounting on failover
  2024-05-21 18:35  4% ` John Meneghini
@ 2024-05-21 18:55  5%   ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-05-21 18:55 UTC (permalink / raw)
  To: John Meneghini; +Cc: Keith Busch, hch, sagi, linux-nvme, nilay

On Tue, May 21, 2024 at 02:35:44PM -0400, John Meneghini wrote:
> Awesome. I've noticed that there is an iostat bug lurking someplace during
> my many days of io policy testing these last two weeks. I'll add this patch
> to my test build and let you know if it fixes the problem!

I was testing CMIC capable PCI devices. The nvme-pci driver is currently
the only one batching completions, and that had a different multipath
accounting bug. I had to fix that before I could validate this patch,
and this one has a dependency on that first fix, which is here:

  https://lore.kernel.org/linux-nvme/20240521170537.2029233-1-kbusch@meta.com/T/#u


^ permalink raw reply	[relevance 5%]

* Re: [PATCH] nvme-multipath: fix io accounting on failover
  2024-05-21 18:07  5% [PATCH] nvme-multipath: fix io accounting on failover Keith Busch
@ 2024-05-21 18:35  4% ` John Meneghini
  2024-05-21 18:55  5%   ` Keith Busch
  2024-05-22 13:02  0% ` Nilay Shroff
  1 sibling, 1 reply; 200+ results
From: John Meneghini @ 2024-05-21 18:35 UTC (permalink / raw)
  To: Keith Busch, hch, sagi, linux-nvme; +Cc: nilay, Keith Busch

Awesome. I've noticed that there is an iostat bug lurking someplace during my many days of io policy testing these last two 
weeks. I'll add this patch to my test build and let you know if it fixes the problem!

/John

On 5/21/24 14:07, Keith Busch wrote:> From: Keith Busch <kbusch@kernel.org>
> 
> There is io stats accounting that needs to be handled, so don't call
> blk_mq_end_request() directly. Use the existing nvme_end_req() helper
> that already handles everything.
> 
> Fixes: d4d957b53d91ee ("nvme-multipath: support io stats on the mpath device")
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
>   drivers/nvme/host/core.c      | 2 +-
>   drivers/nvme/host/multipath.c | 3 ++-
>   drivers/nvme/host/nvme.h      | 1 +
>   3 files changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 79cdd34dfa18e..7706df2373494 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -422,7 +422,7 @@ static inline void __nvme_end_req(struct request *req)
>   		nvme_mpath_end_request(req);
>   }
>   
> -static inline void nvme_end_req(struct request *req)
> +void nvme_end_req(struct request *req)
>   {
>   	blk_status_t status = nvme_error_status(nvme_req(req)->status);
>   
> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> index 9c1e135b8df3b..1bee176fd850e 100644
> --- a/drivers/nvme/host/multipath.c
> +++ b/drivers/nvme/host/multipath.c
> @@ -118,7 +118,8 @@ void nvme_failover_req(struct request *req)
>   	blk_steal_bios(&ns->head->requeue_list, req);
>   	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
>   
> -	blk_mq_end_request(req, 0);
> +	nvme_req(req)->status = 0;
> +	nvme_end_req(req);
>   	kblockd_schedule_work(&ns->head->requeue_work);
>   }
>   
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index cacc56f4bbf44..fc31bd340a63a 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -767,6 +767,7 @@ static inline bool nvme_state_terminal(struct nvme_ctrl *ctrl)
>   	}
>   }
>   
> +void nvme_end_req(struct request *req);
>   void nvme_complete_rq(struct request *req);
>   void nvme_complete_batch_req(struct request *req);
>   



^ permalink raw reply	[relevance 4%]

* [PATCH] nvme-multipath: fix io accounting on failover
@ 2024-05-21 18:07  5% Keith Busch
  2024-05-21 18:35  4% ` John Meneghini
  2024-05-22 13:02  0% ` Nilay Shroff
  0 siblings, 2 replies; 200+ results
From: Keith Busch @ 2024-05-21 18:07 UTC (permalink / raw)
  To: hch, sagi, linux-nvme; +Cc: jmeneghi, nilay, Keith Busch

From: Keith Busch <kbusch@kernel.org>

There is io stats accounting that needs to be handled, so don't call
blk_mq_end_request() directly. Use the existing nvme_end_req() helper
that already handles everything.

Fixes: d4d957b53d91ee ("nvme-multipath: support io stats on the mpath device")
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/core.c      | 2 +-
 drivers/nvme/host/multipath.c | 3 ++-
 drivers/nvme/host/nvme.h      | 1 +
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 79cdd34dfa18e..7706df2373494 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -422,7 +422,7 @@ static inline void __nvme_end_req(struct request *req)
 		nvme_mpath_end_request(req);
 }
 
-static inline void nvme_end_req(struct request *req)
+void nvme_end_req(struct request *req)
 {
 	blk_status_t status = nvme_error_status(nvme_req(req)->status);
 
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 9c1e135b8df3b..1bee176fd850e 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -118,7 +118,8 @@ void nvme_failover_req(struct request *req)
 	blk_steal_bios(&ns->head->requeue_list, req);
 	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
 
-	blk_mq_end_request(req, 0);
+	nvme_req(req)->status = 0;
+	nvme_end_req(req);
 	kblockd_schedule_work(&ns->head->requeue_work);
 }
 
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index cacc56f4bbf44..fc31bd340a63a 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -767,6 +767,7 @@ static inline bool nvme_state_terminal(struct nvme_ctrl *ctrl)
 	}
 }
 
+void nvme_end_req(struct request *req);
 void nvme_complete_rq(struct request *req);
 void nvme_complete_batch_req(struct request *req);
 
-- 
2.43.0



^ permalink raw reply related	[relevance 5%]

* [PATCH] nvme: fix multipath batched completion accounting
@ 2024-05-21 17:05  5% Keith Busch
  2024-05-22  0:55  0% ` Chaitanya Kulkarni
  2024-05-23  9:59  0% ` Hannes Reinecke
  0 siblings, 2 replies; 200+ results
From: Keith Busch @ 2024-05-21 17:05 UTC (permalink / raw)
  To: hch, sagi, linux-nvme; +Cc: axboe, Keith Busch

From: Keith Busch <kbusch@kernel.org>

Batched completions were missing the io stats accounting and bio trace
events. Move the common code to a helper and call it from the batched
and non-batched functions.

Fixes: d4d957b53d91ee ("nvme-multipath: support io stats on the mpath device")
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/core.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 954f850f113a1..79cdd34dfa18e 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -414,6 +414,14 @@ static inline void nvme_end_req_zoned(struct request *req)
 	}
 }
 
+static inline void __nvme_end_req(struct request *req)
+{
+	nvme_end_req_zoned(req);
+	nvme_trace_bio_complete(req);
+	if (req->cmd_flags & REQ_NVME_MPATH)
+		nvme_mpath_end_request(req);
+}
+
 static inline void nvme_end_req(struct request *req)
 {
 	blk_status_t status = nvme_error_status(nvme_req(req)->status);
@@ -424,10 +432,7 @@ static inline void nvme_end_req(struct request *req)
 		else
 			nvme_log_error(req);
 	}
-	nvme_end_req_zoned(req);
-	nvme_trace_bio_complete(req);
-	if (req->cmd_flags & REQ_NVME_MPATH)
-		nvme_mpath_end_request(req);
+	__nvme_end_req(req);
 	blk_mq_end_request(req, status);
 }
 
@@ -476,7 +481,7 @@ void nvme_complete_batch_req(struct request *req)
 {
 	trace_nvme_complete_rq(req);
 	nvme_cleanup_cmd(req);
-	nvme_end_req_zoned(req);
+	__nvme_end_req(req);
 }
 EXPORT_SYMBOL_GPL(nvme_complete_batch_req);
 
-- 
2.43.0



^ permalink raw reply related	[relevance 5%]

* Re: [PATCH] nvme-multipath: find NUMA path only for online numa-node
  @ 2024-05-21 13:47  5% ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-05-21 13:47 UTC (permalink / raw)
  To: Nilay Shroff; +Cc: linux-nvme, hch, sagi, gjoyce, axboe

On Thu, May 16, 2024 at 05:43:51PM +0530, Nilay Shroff wrote:
> In current native multipath design when a shared namespace is created,
> we loop through each possible numa-node, calculate the NUMA distance of
> that node from each nvme controller and then cache the optimal IO path
> for future reference while sending IO. The issue with this design is that
> we may refer to the NUMA distance table for an offline node which may not

Thanks, applied to nvme-6.10.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH v3 1/1] nvme: multipath: Implemented new iopolicy "queue-depth"
  @ 2024-05-21 14:10  5%       ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-05-21 14:10 UTC (permalink / raw)
  To: John Meneghini
  Cc: Hannes Reinecke, hch, sagi, emilne, linux-nvme, linux-kernel,
	jrani, randyj, hare

On Tue, May 21, 2024 at 09:58:31AM -0400, John Meneghini wrote:
> On 5/21/24 02:46, Hannes Reinecke wrote:
> > > +    list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
> > > +        atomic_set(&ctrl->nr_active, 0);
> > > +        nvme_mpath_clear_ctrl_paths(ctrl);
> > 
> > You always reset the variables here, even if specified iopolicy is
> > the same than the currently active one.
> > I'd rather check if the iopolicy is different before changing the settings.
> 
> Yes, Keith pointed this out too.  This is actually a feature, not a bug.  In
> situations where we want to "reset" the nr_active counters on all controllers
> the user can simply set the queue-depth iopolicy a second time.  I don't
> expect users to do this very often... they shouldn't be changing IO policies
> back and forth too much... but the ability to "reset" the nr_active counters
> during testing has been very helpful and important to do.  So I'd like to
> keep this.  Moreover, this is NOT the performance path. I don't see the
> point in making performance optimizations in a code path that is run once a
> year.

I missed that you actually want to reset the counters on a live queue.
Wouldn't that just lead to an imbalance? If that is really a feature,
then I retract a previous comment: you do need the atomic_dec_not_zero
(or whatever it was called) since the active count is no longer tied to
the inc's.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH v3 1/1] nvme: multipath: Implemented new iopolicy "queue-depth"
  @ 2024-05-21 13:05  5%     ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-05-21 13:05 UTC (permalink / raw)
  To: Nilay Shroff
  Cc: John Meneghini, hch, sagi, emilne, linux-nvme, linux-kernel,
	jrani, randyj, hare

On Tue, May 21, 2024 at 02:18:09PM +0530, Nilay Shroff wrote:
> On 5/21/24 01:50, John Meneghini wrote:
> > @@ -140,8 +148,12 @@ void nvme_mpath_end_request(struct request *rq)
> >  {
> >  	struct nvme_ns *ns = rq->q->queuedata;
> >  
> > +	if ((nvme_req(rq)->flags & NVME_MPATH_CNT_ACTIVE))
> > +		atomic_dec_if_positive(&ns->ctrl->nr_active);
> > +
> >  	if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
> >  		return;
> > +
> >  	bdev_end_io_acct(ns->head->disk->part0, req_op(rq),
> >  			 blk_rq_bytes(rq) >> SECTOR_SHIFT,
> >  			 nvme_req(rq)->start_time);
> > @@ -330,6 +342,40 @@ static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
> >  	return found;
> >  }
> >  
> I think you may also want to reset nr_active counter if in case, in-flight nvme request 
> is cancelled. If the request is cancelled then nvme_mpath_end_request() wouldn't be invoked.
> So you may want to reset nr_active counter from nvme_cancel_request() as below:
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index bf7615cb36ee..4fea7883ce8e 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -497,8 +497,9 @@ EXPORT_SYMBOL_GPL(nvme_host_path_error);
>  
>  bool nvme_cancel_request(struct request *req, void *data)
>  {
> -       dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device,
> -                               "Cancelling I/O %d", req->tag);
> +       struct nvme_ctrl *ctrl = (struct nvme_ctrl *)data;
> +
> +       dev_dbg_ratelimited(ctrl->device, "Cancelling I/O %d", req->tag);
>  
>         /* don't abort one completed or idle request */
>         if (blk_mq_rq_state(req) != MQ_RQ_IN_FLIGHT)
> @@ -506,6 +507,8 @@ bool nvme_cancel_request(struct request *req, void *data)
>  
>         nvme_req(req)->status = NVME_SC_HOST_ABORTED_CMD;
>         nvme_req(req)->flags |= NVME_REQ_CANCELLED;
> +       if ((nvme_req(rq)->flags & NVME_MPATH_CNT_ACTIVE))
> +               atomic_dec(&ctrl->nr_active);
>         blk_mq_complete_request(req);
>         return true;
>  }

The io stats wouldn't be right if that happened. And maybe it isn't
right on a failover, but it needs to be. Would it work if
nvme_failover_req() calls nvme_end_req() instead of directly calling
blk_mq_end_req()?


^ permalink raw reply	[relevance 5%]

* Re: [PATCH v3 1/1] nvme: multipath: Implemented new iopolicy "queue-depth"
  @ 2024-05-20 20:50  5%   ` Keith Busch
      2 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-05-20 20:50 UTC (permalink / raw)
  To: John Meneghini
  Cc: hch, sagi, emilne, linux-nvme, linux-kernel, jrani, randyj, hare

On Mon, May 20, 2024 at 04:20:45PM -0400, John Meneghini wrote:
> From: "Ewan D. Milne" <emilne@redhat.com>
> 
> The round-robin path selector is inefficient in cases where there is a
> difference in latency between multiple active optimized paths.  In the
> presence of one or more high latency paths the round-robin selector
> continues to use the high latency path equally. This results in a bias
> towards the highest latency path and can cause a significant decrease
> in overall performance as IOs pile on the lowest latency path. This
> problem is particularly acute with NVMe-oF controllers.

The patch looks pretty good to me. Just a few questions/comments.

>  static LIST_HEAD(nvme_subsystems);
> -static DEFINE_MUTEX(nvme_subsystems_lock);
> +DEFINE_MUTEX(nvme_subsystems_lock);

This seems odd. Why is this lock protecting both the global
nvme_subsystems list, and also subsystem controllers? IOW, why isn't the
subsys->ctrls list protected by the more fine grained 'subsys->lock'
instead of this global lock?

> @@ -43,7 +46,7 @@ static int nvme_get_iopolicy(char *buf, const struct kernel_param *kp)
>  module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy,
>  	&iopolicy, 0644);
>  MODULE_PARM_DESC(iopolicy,
> -	"Default multipath I/O policy; 'numa' (default) or 'round-robin'");
> +	"Default multipath I/O policy; 'numa' (default) , 'round-robin' or 'queue-depth'");

Unnecessary space before the ','.

> +	if (READ_ONCE(ns->head->subsys->iopolicy) == NVME_IOPOLICY_QD) {
> +		atomic_inc(&ns->ctrl->nr_active);
> +		nvme_req(rq)->flags |= NVME_MPATH_CNT_ACTIVE;
> +	}
> +
>  	if (!blk_queue_io_stat(disk->queue) || blk_rq_is_passthrough(rq))
>  		return;
>  
> @@ -140,8 +148,12 @@ void nvme_mpath_end_request(struct request *rq)
>  {
>  	struct nvme_ns *ns = rq->q->queuedata;
>  
> +	if ((nvme_req(rq)->flags & NVME_MPATH_CNT_ACTIVE))
> +		atomic_dec_if_positive(&ns->ctrl->nr_active);

You can just do an atomic_dec() since your new flag has this tied to
the atomic_inc().

> +static struct nvme_ns *nvme_queue_depth_path(struct nvme_ns_head *head)
> +{
> +	struct nvme_ns *best_opt = NULL, *best_nonopt = NULL, *ns;
> +	unsigned int min_depth_opt = UINT_MAX, min_depth_nonopt = UINT_MAX;
> +	unsigned int depth;
> +
> +	list_for_each_entry_rcu(ns, &head->list, siblings) {
> +		if (nvme_path_is_disabled(ns))
> +			continue;
> +
> +		depth = atomic_read(&ns->ctrl->nr_active);
> +
> +		switch (ns->ana_state) {
> +		case NVME_ANA_OPTIMIZED:
> +			if (depth < min_depth_opt) {
> +				min_depth_opt = depth;
> +				best_opt = ns;
> +			}
> +			break;
> +
> +		case NVME_ANA_NONOPTIMIZED:
> +			if (depth < min_depth_nonopt) {
> +				min_depth_nonopt = depth;
> +				best_nonopt = ns;
> +			}
> +			break;
> +		default:
> +			break;
> +		}

Could we break out of this loop early if "min_depth_opt == 0"? We can't
find a better path than that, so no need to read the rest of the paths.

> +void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys, int iopolicy)
> +{
> +	struct nvme_ctrl *ctrl;
> +	int old_iopolicy = READ_ONCE(subsys->iopolicy);
> +

Let's add a check here:

	if (old_iopolicy == iopolicy)
		return;

> @@ -935,6 +940,7 @@ void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl);
>  void nvme_mpath_shutdown_disk(struct nvme_ns_head *head);
>  void nvme_mpath_start_request(struct request *rq);
>  void nvme_mpath_end_request(struct request *rq);
> +void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys, int iopolicy);

This function isn't used outside multipath.c, so it should be static.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH v4 1/6] nvme: multipath: Implemented new iopolicy "queue-depth"
  @ 2024-05-20 14:46  5%   ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-05-20 14:46 UTC (permalink / raw)
  To: John Meneghini
  Cc: tj, josef, axboe, hch, sagi, emilne, hare, linux-block, cgroups,
	linux-nvme, linux-kernel, jrani, randyj

On Tue, May 14, 2024 at 01:53:17PM -0400, John Meneghini wrote:
> @@ -130,6 +133,7 @@ void nvme_mpath_start_request(struct request *rq)
>  	if (!blk_queue_io_stat(disk->queue) || blk_rq_is_passthrough(rq))
>  		return;
>  
> +	atomic_inc(&ns->ctrl->nr_active);

Why skip passthrough and stats?

And I think you should squash the follow up patch that constrains the
atomics to the queue-depth path selector.

> +static struct nvme_ns *nvme_queue_depth_path(struct nvme_ns_head *head)
> +{
> +	struct nvme_ns *best_opt = NULL, *best_nonopt = NULL, *ns;
> +	unsigned int min_depth_opt = UINT_MAX, min_depth_nonopt = UINT_MAX;
> +	unsigned int depth;
> +
> +	list_for_each_entry_rcu(ns, &head->list, siblings) {
> +		if (nvme_path_is_disabled(ns))
> +			continue;
> +
> +		depth = atomic_read(&ns->ctrl->nr_active);
> +
> +		switch (ns->ana_state) {
> +		case NVME_ANA_OPTIMIZED:
> +			if (depth < min_depth_opt) {
> +				min_depth_opt = depth;
> +				best_opt = ns;
> +			}
> +			break;
> +
> +		case NVME_ANA_NONOPTIMIZED:
> +			if (depth < min_depth_nonopt) {
> +				min_depth_nonopt = depth;
> +				best_nonopt = ns;
> +			}
> +			break;
> +		default:
> +			break;
> +		}
> +	}
> +

I think you can do the atomic_inc here so you don't have to check the io
policy a 2nd time.

> +	return best_opt ? best_opt : best_nonopt;
> +}


^ permalink raw reply	[relevance 5%]

* [GIT PULL] nvme updates for Linux 6.10
@ 2024-05-14 15:07  4% Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-05-14 15:07 UTC (permalink / raw)
  To: axboe; +Cc: sagi, hch, linux-nvme

The following changes since commit 6ad0d7e0f4b68f87a98ea2b239123b7d865df86b:

  sbitmap: use READ_ONCE to access map->word (2024-04-26 07:40:28 -0600)

are available in the Git repository at:

  git://git.infradead.org/nvme.git tags/nvme-6.10-2024-05-14

for you to fetch changes up to 54a76c8732b265aa86030134d4af6a5a3c59fe52:

  nvme-rdma, nvme-tcp: include max reconnects for reconnect logging (2024-05-07 08:50:37 -0700)

----------------------------------------------------------------
nvme updates for Linux 6.10

 - Fabrics connection retries (Daniel, Hannes)
 - Fabrics logging enhancements (Tokunori)
 - RDMA delete optimization (Sagi)

----------------------------------------------------------------
Daniel Wagner (1):
      nvme: do not retry authentication failures

Hannes Reinecke (4):
      nvmet: lock config semaphore when accessing DH-HMAC-CHAP key
      nvmet: return DHCHAP status codes from nvmet_setup_auth()
      nvme: return kernel error codes for admin queue connect
      nvme-fabrics: short-circuit reconnect retries

Sagi Grimberg (1):
      nvmet-rdma: Avoid o(n^2) loop in delete_ctrl

Tokunori Ikegami (1):
      nvme-rdma, nvme-tcp: include max reconnects for reconnect logging

 drivers/nvme/host/auth.c               |  6 +++---
 drivers/nvme/host/core.c               |  6 +++---
 drivers/nvme/host/fabrics.c            | 51 ++++++++++++++++++++++++++++++++-------------------
 drivers/nvme/host/fabrics.h            |  2 +-
 drivers/nvme/host/fc.c                 |  4 +---
 drivers/nvme/host/nvme.h               |  2 +-
 drivers/nvme/host/rdma.c               | 23 ++++++++++++++---------
 drivers/nvme/host/tcp.c                | 30 ++++++++++++++++++------------
 drivers/nvme/target/auth.c             | 22 ++++++++++------------
 drivers/nvme/target/configfs.c         | 22 +++++++++++++++++-----
 drivers/nvme/target/fabrics-cmd-auth.c | 49 +++++++++++++++++++++++++------------------------
 drivers/nvme/target/fabrics-cmd.c      | 11 ++++++-----
 drivers/nvme/target/nvmet.h            |  8 ++++----
 drivers/nvme/target/rdma.c             | 16 ++++++----------
 14 files changed, 141 insertions(+), 111 deletions(-)


^ permalink raw reply	[relevance 4%]

* Re: [PATCH 2/2] nvme-pci: allow unmanaged interrupts
  2024-05-10 15:10  0%   ` Christoph Hellwig
  2024-05-10 16:20  5%     ` Keith Busch
@ 2024-05-13 13:12  0%     ` Bart Van Assche
  1 sibling, 0 replies; 200+ results
From: Bart Van Assche @ 2024-05-13 13:12 UTC (permalink / raw)
  To: Christoph Hellwig, Keith Busch
  Cc: linux-nvme, linux-kernel, tglx, ming.lei, Keith Busch

On 5/10/24 08:10, Christoph Hellwig wrote:
> On Fri, May 10, 2024 at 07:14:59AM -0700, Keith Busch wrote:
>> From: Keith Busch <kbusch@kernel.org>
>>
>> Some people _really_ want to control their interrupt affinity.
> 
> So let them argue why.  I'd rather have a really, really, really
> good argument for this crap, and I'd like to hear it from the horses
> mouth.

Performance can be increased by modifying the interrupt assignments
carefully, especially in storage appliances that have to process a
large number of network and storage interrupts. By carefully assigning
interrupts the number of completions processed per interrupt can be
increased and hence performance also increases. In 2014 I was working
on a product that benefited from this approach.

Thanks,

Bart.



^ permalink raw reply	[relevance 0%]

* Re: [PATCHv2] nvme-pci: allow unmanaged interrupts
  @ 2024-05-13  4:09  5%           ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-05-13  4:09 UTC (permalink / raw)
  To: Ming Lei; +Cc: Sagi Grimberg, Keith Busch, linux-nvme, hch

On Mon, May 13, 2024 at 09:12:58AM +0800, Ming Lei wrote:
> On Sun, May 12, 2024 at 05:16:13PM +0300, Sagi Grimberg wrote:
> > 
> > > > Everyone expects nvme performance will suffer. IO latency and CPU
> > > > efficieny are not everyone's top priority, so allowing people to
> > > > optimize for something else seems like a reasonable request.
> > > I guess more people may be interested in 'something else', care to share
> > > them in the commit log, cause nvme is going to support it.
> > 
> > I don't have a special interest in this, but I can share what I heard
> > several
> > times. The use-case is that people want to dedicate a few cores to handle
> > interrupts so they know it does not take cpu time from their application
> > threads
> > that are running (usually pinned to different cores).
> > 
> > The app threads isolation is more important to them than affinity to the
> > device...
> 
> That is exactly what CPU isolation is doing, include 'isolcpus=managed_irq',
> isn't it?

As I've mentioned previously, that option is a no-op when the incoming
mask matches the isolated cpus. The use case for the kernel's isolated
CPUs doesn't align with the use cases for user defined IRQ affinity.

Let me redirect this discussion please. Is there a technical reason why
Linux can't let users use their CPUs as they intend? They will take
out-of-tree patches if that's the position we're forcing them into, but
why is Linux taking that position in the first place?


^ permalink raw reply	[relevance 5%]

* Re: [PATCHv2] nvme-pci: allow unmanaged interrupts
  @ 2024-05-12 22:05  5%         ` Keith Busch
    1 sibling, 0 replies; 200+ results
From: Keith Busch @ 2024-05-12 22:05 UTC (permalink / raw)
  To: Sagi Grimberg; +Cc: Ming Lei, Keith Busch, linux-nvme, hch

On Sun, May 12, 2024 at 05:16:13PM +0300, Sagi Grimberg wrote:
> 
> > > Everyone expects nvme performance will suffer. IO latency and CPU
> > > efficieny are not everyone's top priority, so allowing people to
> > > optimize for something else seems like a reasonable request.
> > I guess more people may be interested in 'something else', care to share
> > them in the commit log, cause nvme is going to support it.
> 
> I don't have a special interest in this, but I can share what I heard
> several
> times. The use-case is that people want to dedicate a few cores to handle
> interrupts so they know it does not take cpu time from their application
> threads
> that are running (usually pinned to different cores).
> 
> The app threads isolation is more important to them than affinity to the
> device...

Yes, that is consistently the same reasoning I've heard. While managed
irq is overwhelmingly the best choice for most use cases, it's clearly
been communicated that some users do not want it for exactly this
reason.

As far as I can tell, there's no technical reason to prevent letting
people make that choice. This "kernel knows better than you" argument is
less sustainable than letting users do whatever they want with their
CPUs.

> > > > Is there any benefit to use unmanaged irq in this way?
> > > The immediate desire is more predictable scheduling on a subset of CPUs
> > > by steering hardware interrupts somewhere else. It's the same reason
> > > RDMA undid managed interrupts.
> > > 
> > >    231243c82793428 ("Revert "mlx5: move affinity hints assignments to generic code")
> > The above commit only mentions it becomes not flexible since user can't
> > adjust irq affinity any more.
> > 
> > It is understandable for network, there is long history people need to adjust
> > irq affinity from user space.
> 
> I suspect that the reasoning is similar to nvme as well.

+1, exactly.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH] block: unmap and free user mapped integrity via submitter
  2024-05-10 18:56  5%   ` Keith Busch
@ 2024-05-11 11:22  4%     ` Anuj gupta
  0 siblings, 0 replies; 200+ results
From: Anuj gupta @ 2024-05-11 11:22 UTC (permalink / raw)
  To: Keith Busch
  Cc: Anuj Gupta, axboe, hch, linux-nvme, linux-block, martin.petersen,
	Kanchan Joshi

On Sat, May 11, 2024 at 12:31 AM Keith Busch <kbusch@kernel.org> wrote:
>
> On Fri, May 10, 2024 at 03:14:29PM +0530, Anuj Gupta wrote:
> The user mapped integrity is copied back and unpinned by
> > bio_integrity_free which is a low-level routine. Do it via the submitter
> > rather than doing it in the low-level block layer code, to split the
> > submitter side from the consumer side of the bio.
>
> Thanks, this looks pretty good.
>
> >  out_unmap:
> > -     if (bio)
> > +     if (bio) {
> > +             if (bio_integrity(bio))
> > +                     bio_integrity_unmap_free_user(bio);
> >               blk_rq_unmap_user(bio);
> > +     }
>
> Since we're adding more cleanup responsibilities on the caller, and this
> pattern is repeated 4 times, I think a little helper function is
> warranted: 'nvme_unmap_bio(struct bio *bio)', or something like that.

Makes sense, I will add this in the next version.

Thanks,
Anuj Gupta


^ permalink raw reply	[relevance 4%]

* Re: [PATCH 2/2] nvme-pci: allow unmanaged interrupts
  2024-05-11  0:41  5%         ` Keith Busch
@ 2024-05-11  0:59  0%           ` Ming Lei
  0 siblings, 0 replies; 200+ results
From: Ming Lei @ 2024-05-11  0:59 UTC (permalink / raw)
  To: Keith Busch
  Cc: Christoph Hellwig, Keith Busch, linux-nvme, linux-kernel, tglx

On Fri, May 10, 2024 at 06:41:58PM -0600, Keith Busch wrote:
> On Sat, May 11, 2024 at 07:50:21AM +0800, Ming Lei wrote:
> > On Fri, May 10, 2024 at 10:20:02AM -0600, Keith Busch wrote:
> > > On Fri, May 10, 2024 at 05:10:47PM +0200, Christoph Hellwig wrote:
> > > > On Fri, May 10, 2024 at 07:14:59AM -0700, Keith Busch wrote:
> > > > > From: Keith Busch <kbusch@kernel.org>
> > > > > 
> > > > > Some people _really_ want to control their interrupt affinity.
> > > > 
> > > > So let them argue why.  I'd rather have a really, really, really
> > > > good argument for this crap, and I'd like to hear it from the horses
> > > > mouth.
> > > 
> > > It's just prioritizing predictable user task scheduling for a subset of
> > > CPUs instead of having consistently better storage performance.
> > > 
> > > We already have "isolcpus=managed_irq," parameter to prevent managed
> > > interrupts from running on a subset of CPUs, so the use case is already
> > > kind of supported. The problem with that parameter is it is a no-op if
> > > the starting affinity spread contains only isolated CPUs.
> > 
> > Can you explain a bit why it is a no-op? If only isolated CPUs are
> > spread on one queue, there will be no IO originated from these isolated
> > CPUs, that is exactly what the isolation needs.
> 
> The "isolcpus=managed_irq," option doesn't limit the dispatching CPUs.

Please see commit a46c27026da1 ("blk-mq: don't schedule block kworker on isolated CPUs")
in for-6.10/block.

> It only limits where the managed irq will assign the effective_cpus as a
> best effort.

Most of times it does work.

> 
> Example, I boot with a system with 4 threads, one nvme device, and
> kernel parameter:
> 
>   isolcpus=managed_irq,2-3
> 
> Run this:
> 
>   for i in $(seq 0 3); do taskset -c $i dd if=/dev/nvme0n1 of=/dev/null bs=4k count=1000 iflag=direct; done

It is one test problem, when you try to isolate '2-3', it isn't expected
to submit IO or run application on these isolated CPUs.


Thanks, 
Ming



^ permalink raw reply	[relevance 0%]

* Re: [PATCH 2/2] nvme-pci: allow unmanaged interrupts
  2024-05-10 23:50  0%       ` Ming Lei
@ 2024-05-11  0:41  5%         ` Keith Busch
  2024-05-11  0:59  0%           ` Ming Lei
  0 siblings, 1 reply; 200+ results
From: Keith Busch @ 2024-05-11  0:41 UTC (permalink / raw)
  To: Ming Lei; +Cc: Christoph Hellwig, Keith Busch, linux-nvme, linux-kernel, tglx

On Sat, May 11, 2024 at 07:50:21AM +0800, Ming Lei wrote:
> On Fri, May 10, 2024 at 10:20:02AM -0600, Keith Busch wrote:
> > On Fri, May 10, 2024 at 05:10:47PM +0200, Christoph Hellwig wrote:
> > > On Fri, May 10, 2024 at 07:14:59AM -0700, Keith Busch wrote:
> > > > From: Keith Busch <kbusch@kernel.org>
> > > > 
> > > > Some people _really_ want to control their interrupt affinity.
> > > 
> > > So let them argue why.  I'd rather have a really, really, really
> > > good argument for this crap, and I'd like to hear it from the horses
> > > mouth.
> > 
> > It's just prioritizing predictable user task scheduling for a subset of
> > CPUs instead of having consistently better storage performance.
> > 
> > We already have "isolcpus=managed_irq," parameter to prevent managed
> > interrupts from running on a subset of CPUs, so the use case is already
> > kind of supported. The problem with that parameter is it is a no-op if
> > the starting affinity spread contains only isolated CPUs.
> 
> Can you explain a bit why it is a no-op? If only isolated CPUs are
> spread on one queue, there will be no IO originated from these isolated
> CPUs, that is exactly what the isolation needs.

The "isolcpus=managed_irq," option doesn't limit the dispatching CPUs.
It only limits where the managed irq will assign the effective_cpus as a
best effort.

Example, I boot with a system with 4 threads, one nvme device, and
kernel parameter:

  isolcpus=managed_irq,2-3

Run this:

  for i in $(seq 0 3); do taskset -c $i dd if=/dev/nvme0n1 of=/dev/null bs=4k count=1000 iflag=direct; done

Check /proc/interrupts | grep nvme0:

           CPU0       CPU1       CPU2       CPU3
...
 26:       1000          0          0          0  PCI-MSIX-0000:00:05.0   1-edge      nvme0q1
 27:          0       1004          0          0  PCI-MSIX-0000:00:05.0   2-edge      nvme0q2
 28:          0          0       1000          0  PCI-MSIX-0000:00:05.0   3-edge      nvme0q3
 29:          0          0          0       1043  PCI-MSIX-0000:00:05.0   4-edge      nvme0q4

The isolcpus did nothing because each vector's mask had just one
cpu; there was nowhere else that the managed irq could send it. The
documentation seems to indicate that was by design as a "best effort".


^ permalink raw reply	[relevance 5%]

* Re: [PATCHv2] nvme-pci: allow unmanaged interrupts
  2024-05-10 23:47  0% ` Ming Lei
@ 2024-05-11  0:29  5%   ` Keith Busch
    0 siblings, 1 reply; 200+ results
From: Keith Busch @ 2024-05-11  0:29 UTC (permalink / raw)
  To: Ming Lei; +Cc: Keith Busch, linux-nvme, hch

On Sat, May 11, 2024 at 07:47:26AM +0800, Ming Lei wrote:
> On Fri, May 10, 2024 at 10:46:45AM -0700, Keith Busch wrote:
> >  		map->queue_offset = qoff;
> > -		if (i != HCTX_TYPE_POLL && offset)
> > +		if (managed_irqs && i != HCTX_TYPE_POLL && offset)
> >  			blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset);
> >  		else
> >  			blk_mq_map_queues(map);
> 
> Now the queue mapping is built with nothing from irq affinity which is
> setup from userspace, and performance could be pretty bad.

This just decouples the sw from the irq mappings. Every cpu still has a
blk-mq hctx, there's just no connection to the completing CPU if you
enable this.

Everyone expects nvme performance will suffer. IO latency and CPU
efficieny are not everyone's top priority, so allowing people to
optimize for something else seems like a reasonable request.
 
> Is there any benefit to use unmanaged irq in this way?

The immediate desire is more predictable scheduling on a subset of CPUs
by steering hardware interrupts somewhere else. It's the same reason
RDMA undid managed interrupts.

  231243c82793428 ("Revert "mlx5: move affinity hints assignments to generic code")

Yes, the kernel's managed interrupts are the best choice for optimizing
interaction with that device, but it's not free, and maybe you want to
exchange that optimization for something else.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH 2/2] nvme-pci: allow unmanaged interrupts
  2024-05-10 16:20  5%     ` Keith Busch
@ 2024-05-10 23:50  0%       ` Ming Lei
  2024-05-11  0:41  5%         ` Keith Busch
  0 siblings, 1 reply; 200+ results
From: Ming Lei @ 2024-05-10 23:50 UTC (permalink / raw)
  To: Keith Busch
  Cc: Christoph Hellwig, Keith Busch, linux-nvme, linux-kernel, tglx

On Fri, May 10, 2024 at 10:20:02AM -0600, Keith Busch wrote:
> On Fri, May 10, 2024 at 05:10:47PM +0200, Christoph Hellwig wrote:
> > On Fri, May 10, 2024 at 07:14:59AM -0700, Keith Busch wrote:
> > > From: Keith Busch <kbusch@kernel.org>
> > > 
> > > Some people _really_ want to control their interrupt affinity.
> > 
> > So let them argue why.  I'd rather have a really, really, really
> > good argument for this crap, and I'd like to hear it from the horses
> > mouth.
> 
> It's just prioritizing predictable user task scheduling for a subset of
> CPUs instead of having consistently better storage performance.
> 
> We already have "isolcpus=managed_irq," parameter to prevent managed
> interrupts from running on a subset of CPUs, so the use case is already
> kind of supported. The problem with that parameter is it is a no-op if
> the starting affinity spread contains only isolated CPUs.

Can you explain a bit why it is a no-op? If only isolated CPUs are
spread on one queue, there will be no IO originated from these isolated
CPUs, that is exactly what the isolation needs.



Thanks,
Ming



^ permalink raw reply	[relevance 0%]

* Re: [PATCHv2] nvme-pci: allow unmanaged interrupts
  2024-05-10 17:46  5% [PATCHv2] nvme-pci: allow unmanaged interrupts Keith Busch
@ 2024-05-10 23:47  0% ` Ming Lei
  2024-05-11  0:29  5%   ` Keith Busch
  0 siblings, 1 reply; 200+ results
From: Ming Lei @ 2024-05-10 23:47 UTC (permalink / raw)
  To: Keith Busch; +Cc: linux-nvme, hch, Keith Busch

On Fri, May 10, 2024 at 10:46:45AM -0700, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> Some people _really_ want to control their interrupt affinity,
> preferring to sacrifice storage performance for scheduling
> predictability on some other subset of CPUs.
> 
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
> Sorry for the rapid fire v2, and I know some are still against this; I'm
> just getting v2 out because v1 breaks a different use case.
> 
> And as far as acceptance goes, this doesn't look like it carries any
> long-term maintenance overhead. It's an opt-in feature, and you're on
> your own if you turn it on.
> 
> v1->v2: skip the AFFINITY vector allocation if the parameter is
> provided instead of trying to make the vector code handle all post_vectors.
> 
>  drivers/nvme/host/pci.c | 17 +++++++++++++++--
>  1 file changed, 15 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 8e0bb9692685d..def1a295284bb 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -63,6 +63,11 @@ MODULE_PARM_DESC(sgl_threshold,
>  		"Use SGLs when average request segment size is larger or equal to "
>  		"this size. Use 0 to disable SGLs.");
>  
> +static bool managed_irqs = true;
> +module_param(managed_irqs, bool, 0444);
> +MODULE_PARM_DESC(managed_irqs,
> +		 "set to false for user controlled irq affinity");
> +
>  #define NVME_PCI_MIN_QUEUE_SIZE 2
>  #define NVME_PCI_MAX_QUEUE_SIZE 4095
>  static int io_queue_depth_set(const char *val, const struct kernel_param *kp);
> @@ -456,7 +461,7 @@ static void nvme_pci_map_queues(struct blk_mq_tag_set *set)
>  		 * affinity), so use the regular blk-mq cpu mapping
>  		 */
>  		map->queue_offset = qoff;
> -		if (i != HCTX_TYPE_POLL && offset)
> +		if (managed_irqs && i != HCTX_TYPE_POLL && offset)
>  			blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset);
>  		else
>  			blk_mq_map_queues(map);

Now the queue mapping is built with nothing from irq affinity which is
setup from userspace, and performance could be pretty bad.

Is there any benefit to use unmanaged irq in this way?


Thanks,
Ming



^ permalink raw reply	[relevance 0%]

* Re: [PATCH] nvme: enable FDP support
  @ 2024-05-10 19:30  4%   ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-05-10 19:30 UTC (permalink / raw)
  To: Kanchan Joshi
  Cc: axboe, hch, linux-nvme, linux-block, javier.gonz, bvanassche,
	david, slava, gost.dev, Hui Qi, Nitesh Shetty

On Fri, May 10, 2024 at 07:10:15PM +0530, Kanchan Joshi wrote:
> Flexible Data Placement (FDP), as ratified in TP 4146a, allows the host
> to control the placement of logical blocks so as to reduce the SSD WAF.
> 
> Userspace can send the data lifetime information using the write hints.
> The SCSI driver (sd) can already pass this information to the SCSI
> devices. This patch does the same for NVMe.
> 
> Fetches the placement-identifiers (plids) if the device supports FDP.
> And map the incoming write-hints to plids.

Just some additional background since this looks similar to when the
driver supported "streams".

Supporting streams in the driver was pretty much a non-issue. The feature was
removed because devices didn't work with streams as expected, and
supporting it carried more maintenance overhead for the upper layers.

Since the block layer re-introduced write hints anyway outside of this
use case, this looks fine to me to re-introduce support for those hints.

So why not re-add stream support back? As far as I know, devices never
implemented that feature as expected, the driver had to enable it on
start up, and there's no required feedback mechanism to see if it's even
working or hurting.

For FDP, the user had to have configured the namespace that way in order
to get this, so it's still an optional, opt-in feature. It's also
mandatory for FDP capable drives to report WAF through the endurance
log, so users can see the effects of using it.

It would be nice to compare endurance logs with and without the FDP
configuration enabled for your various workloads. This will be great to
discuss at LSFMM next week.

> +static int nvme_fetch_fdp_plids(struct nvme_ns *ns, u32 nsid)
> +{
> +	struct nvme_command c = {};
> +	struct nvme_fdp_ruh_status *ruhs;
> +	struct nvme_fdp_ruh_status_desc *ruhsd;
> +	int size, ret, i;
> +
> +	size = sizeof(*ruhs) + NVME_MAX_PLIDS * sizeof(*ruhsd);

	size = struct_size(ruhs, ruhsd, MAX_PLIDS);

> +#define NVME_MAX_PLIDS   (128)
> +
>  /*
>   * Anchor structure for namespaces.  There is one for each namespace in a
>   * NVMe subsystem that any of our controllers can see, and the namespace
> @@ -457,6 +459,8 @@ struct nvme_ns_head {
>  	bool			shared;
>  	bool			passthru_err_log_enabled;
>  	int			instance;
> +	u16			nr_plids;
> +	u16			plids[NVME_MAX_PLIDS];

The largest index needed is WRITE_LIFE_EXTREME, which is "5", so I think
NVME_MAX_PLIDS should be the same value. And it will save space in the
struct.


^ permalink raw reply	[relevance 4%]

* Re: [PATCH] block: unmap and free user mapped integrity via submitter
  @ 2024-05-10 18:56  5%   ` Keith Busch
  2024-05-11 11:22  4%     ` Anuj gupta
  0 siblings, 1 reply; 200+ results
From: Keith Busch @ 2024-05-10 18:56 UTC (permalink / raw)
  To: Anuj Gupta
  Cc: axboe, hch, linux-nvme, linux-block, martin.petersen, Kanchan Joshi

On Fri, May 10, 2024 at 03:14:29PM +0530, Anuj Gupta wrote:
> The user mapped integrity is copied back and unpinned by
> bio_integrity_free which is a low-level routine. Do it via the submitter
> rather than doing it in the low-level block layer code, to split the
> submitter side from the consumer side of the bio.

Thanks, this looks pretty good.

>  out_unmap:
> -	if (bio)
> +	if (bio) {
> +		if (bio_integrity(bio))
> +			bio_integrity_unmap_free_user(bio);
>  		blk_rq_unmap_user(bio);
> +	}

Since we're adding more cleanup responsibilities on the caller, and this
pattern is repeated 4 times, I think a little helper function is
warranted: 'nvme_unmap_bio(struct bio *bio)', or something like that.


^ permalink raw reply	[relevance 5%]

* [PATCHv2] nvme-pci: allow unmanaged interrupts
@ 2024-05-10 17:46  5% Keith Busch
  2024-05-10 23:47  0% ` Ming Lei
  0 siblings, 1 reply; 200+ results
From: Keith Busch @ 2024-05-10 17:46 UTC (permalink / raw)
  To: linux-nvme; +Cc: hch, ming.lei, Keith Busch

From: Keith Busch <kbusch@kernel.org>

Some people _really_ want to control their interrupt affinity,
preferring to sacrifice storage performance for scheduling
predictability on some other subset of CPUs.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
Sorry for the rapid fire v2, and I know some are still against this; I'm
just getting v2 out because v1 breaks a different use case.

And as far as acceptance goes, this doesn't look like it carries any
long-term maintenance overhead. It's an opt-in feature, and you're on
your own if you turn it on.

v1->v2: skip the AFFINITY vector allocation if the parameter is
provided instead of trying to make the vector code handle all post_vectors.

 drivers/nvme/host/pci.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 8e0bb9692685d..def1a295284bb 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -63,6 +63,11 @@ MODULE_PARM_DESC(sgl_threshold,
 		"Use SGLs when average request segment size is larger or equal to "
 		"this size. Use 0 to disable SGLs.");
 
+static bool managed_irqs = true;
+module_param(managed_irqs, bool, 0444);
+MODULE_PARM_DESC(managed_irqs,
+		 "set to false for user controlled irq affinity");
+
 #define NVME_PCI_MIN_QUEUE_SIZE 2
 #define NVME_PCI_MAX_QUEUE_SIZE 4095
 static int io_queue_depth_set(const char *val, const struct kernel_param *kp);
@@ -456,7 +461,7 @@ static void nvme_pci_map_queues(struct blk_mq_tag_set *set)
 		 * affinity), so use the regular blk-mq cpu mapping
 		 */
 		map->queue_offset = qoff;
-		if (i != HCTX_TYPE_POLL && offset)
+		if (managed_irqs && i != HCTX_TYPE_POLL && offset)
 			blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset);
 		else
 			blk_mq_map_queues(map);
@@ -2218,6 +2223,7 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
 		.priv		= dev,
 	};
 	unsigned int irq_queues, poll_queues;
+	int ret;
 
 	/*
 	 * Poll queues don't need interrupts, but we need at least one I/O queue
@@ -2241,8 +2247,15 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
 	irq_queues = 1;
 	if (!(dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR))
 		irq_queues += (nr_io_queues - poll_queues);
-	return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues,
+
+	if (managed_irqs)
+		return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues,
 			      PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
+
+	ret = pci_alloc_irq_vectors(pdev, 1, irq_queues, PCI_IRQ_ALL_TYPES);
+	if (ret > 0)
+		nvme_calc_irq_sets(&affd, ret - 1);
+	return ret;
 }
 
 static unsigned int nvme_max_io_queues(struct nvme_dev *dev)
-- 
2.43.0



^ permalink raw reply related	[relevance 5%]

* Re: [PATCH 1/2] genirq/affinity: remove rsvd check against minvec
  2024-05-10 15:15  0% ` [PATCH 1/2] genirq/affinity: remove rsvd check against minvec Ming Lei
@ 2024-05-10 16:47  5%   ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-05-10 16:47 UTC (permalink / raw)
  To: Ming Lei; +Cc: Keith Busch, linux-nvme, linux-kernel, hch, tglx

On Fri, May 10, 2024 at 11:15:54PM +0800, Ming Lei wrote:
> On Fri, May 10, 2024 at 07:14:58AM -0700, Keith Busch wrote:
> > From: Keith Busch <kbusch@kernel.org>
> > 
> > The reserved vectors are just the desired vectors that don't need to be
> > managed.
> > 
> > Signed-off-by: Keith Busch <kbusch@kernel.org>
> > ---
> >  kernel/irq/affinity.c | 3 ---
> >  1 file changed, 3 deletions(-)
> > 
> > diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
> > index 44a4eba80315c..74b7cccb51a16 100644
> > --- a/kernel/irq/affinity.c
> > +++ b/kernel/irq/affinity.c
> > @@ -113,9 +113,6 @@ unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec,
> >  	unsigned int resv = affd->pre_vectors + affd->post_vectors;
> >  	unsigned int set_vecs;
> >  
> > -	if (resv > minvec)
> > -		return 0;
> > -
> 
> This behavior is introduced in 6f9a22bc5775 ("PCI/MSI: Ignore affinity if pre/post
> vector count is more than min_vecs"), which is one bug fix.

Thanks for the pointer. Probably best I avoid messing with irq code just
for this use case, so I can have the driver disable the PCI_IRQ_AFFINITY
flag instead ... assuming hch doesn't "nak" it.  :)


^ permalink raw reply	[relevance 5%]

* Re: [PATCH 2/2] nvme-pci: allow unmanaged interrupts
  2024-05-10 15:10  0%   ` Christoph Hellwig
@ 2024-05-10 16:20  5%     ` Keith Busch
  2024-05-10 23:50  0%       ` Ming Lei
  2024-05-13 13:12  0%     ` Bart Van Assche
  1 sibling, 1 reply; 200+ results
From: Keith Busch @ 2024-05-10 16:20 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Keith Busch, linux-nvme, linux-kernel, tglx, ming.lei

On Fri, May 10, 2024 at 05:10:47PM +0200, Christoph Hellwig wrote:
> On Fri, May 10, 2024 at 07:14:59AM -0700, Keith Busch wrote:
> > From: Keith Busch <kbusch@kernel.org>
> > 
> > Some people _really_ want to control their interrupt affinity.
> 
> So let them argue why.  I'd rather have a really, really, really
> good argument for this crap, and I'd like to hear it from the horses
> mouth.

It's just prioritizing predictable user task scheduling for a subset of
CPUs instead of having consistently better storage performance.

We already have "isolcpus=managed_irq," parameter to prevent managed
interrupts from running on a subset of CPUs, so the use case is already
kind of supported. The problem with that parameter is it is a no-op if
the starting affinity spread contains only isolated CPUs.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH 1/2] genirq/affinity: remove rsvd check against minvec
  2024-05-10 14:14  5% [PATCH 1/2] genirq/affinity: remove rsvd check against minvec Keith Busch
  2024-05-10 14:14  5% ` [PATCH 2/2] nvme-pci: allow unmanaged interrupts Keith Busch
@ 2024-05-10 15:15  0% ` Ming Lei
  2024-05-10 16:47  5%   ` Keith Busch
  1 sibling, 1 reply; 200+ results
From: Ming Lei @ 2024-05-10 15:15 UTC (permalink / raw)
  To: Keith Busch; +Cc: linux-nvme, linux-kernel, hch, tglx, Keith Busch

On Fri, May 10, 2024 at 07:14:58AM -0700, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> The reserved vectors are just the desired vectors that don't need to be
> managed.
> 
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
>  kernel/irq/affinity.c | 3 ---
>  1 file changed, 3 deletions(-)
> 
> diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
> index 44a4eba80315c..74b7cccb51a16 100644
> --- a/kernel/irq/affinity.c
> +++ b/kernel/irq/affinity.c
> @@ -113,9 +113,6 @@ unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec,
>  	unsigned int resv = affd->pre_vectors + affd->post_vectors;
>  	unsigned int set_vecs;
>  
> -	if (resv > minvec)
> -		return 0;
> -

This behavior is introduced in 6f9a22bc5775 ("PCI/MSI: Ignore affinity if pre/post
vector count is more than min_vecs"), which is one bug fix.

Thanks,
Ming



^ permalink raw reply	[relevance 0%]

* Re: [PATCH 2/2] nvme-pci: allow unmanaged interrupts
  2024-05-10 14:14  5% ` [PATCH 2/2] nvme-pci: allow unmanaged interrupts Keith Busch
@ 2024-05-10 15:10  0%   ` Christoph Hellwig
  2024-05-10 16:20  5%     ` Keith Busch
  2024-05-13 13:12  0%     ` Bart Van Assche
  0 siblings, 2 replies; 200+ results
From: Christoph Hellwig @ 2024-05-10 15:10 UTC (permalink / raw)
  To: Keith Busch; +Cc: linux-nvme, linux-kernel, hch, tglx, ming.lei, Keith Busch

On Fri, May 10, 2024 at 07:14:59AM -0700, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> Some people _really_ want to control their interrupt affinity.

So let them argue why.  I'd rather have a really, really, really
good argument for this crap, and I'd like to hear it from the horses
mouth.



^ permalink raw reply	[relevance 0%]

* [PATCH 1/2] genirq/affinity: remove rsvd check against minvec
@ 2024-05-10 14:14  5% Keith Busch
  2024-05-10 14:14  5% ` [PATCH 2/2] nvme-pci: allow unmanaged interrupts Keith Busch
  2024-05-10 15:15  0% ` [PATCH 1/2] genirq/affinity: remove rsvd check against minvec Ming Lei
  0 siblings, 2 replies; 200+ results
From: Keith Busch @ 2024-05-10 14:14 UTC (permalink / raw)
  To: linux-nvme, linux-kernel; +Cc: hch, tglx, ming.lei, Keith Busch

From: Keith Busch <kbusch@kernel.org>

The reserved vectors are just the desired vectors that don't need to be
managed.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 kernel/irq/affinity.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 44a4eba80315c..74b7cccb51a16 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -113,9 +113,6 @@ unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec,
 	unsigned int resv = affd->pre_vectors + affd->post_vectors;
 	unsigned int set_vecs;
 
-	if (resv > minvec)
-		return 0;
-
 	if (affd->calc_sets) {
 		set_vecs = maxvec - resv;
 	} else {
-- 
2.43.0



^ permalink raw reply related	[relevance 5%]

* [PATCH 2/2] nvme-pci: allow unmanaged interrupts
  2024-05-10 14:14  5% [PATCH 1/2] genirq/affinity: remove rsvd check against minvec Keith Busch
@ 2024-05-10 14:14  5% ` Keith Busch
  2024-05-10 15:10  0%   ` Christoph Hellwig
  2024-05-10 15:15  0% ` [PATCH 1/2] genirq/affinity: remove rsvd check against minvec Ming Lei
  1 sibling, 1 reply; 200+ results
From: Keith Busch @ 2024-05-10 14:14 UTC (permalink / raw)
  To: linux-nvme, linux-kernel; +Cc: hch, tglx, ming.lei, Keith Busch

From: Keith Busch <kbusch@kernel.org>

Some people _really_ want to control their interrupt affinity.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/pci.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 8e0bb9692685d..4c2799c3f45f5 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -63,6 +63,11 @@ MODULE_PARM_DESC(sgl_threshold,
 		"Use SGLs when average request segment size is larger or equal to "
 		"this size. Use 0 to disable SGLs.");
 
+static bool managed_irqs = true;
+module_param(managed_irqs, bool, 0444);
+MODULE_PARM_DESC(managed_irqs,
+		 "set to false for user controlled irq affinity");
+
 #define NVME_PCI_MIN_QUEUE_SIZE 2
 #define NVME_PCI_MAX_QUEUE_SIZE 4095
 static int io_queue_depth_set(const char *val, const struct kernel_param *kp);
@@ -456,7 +461,7 @@ static void nvme_pci_map_queues(struct blk_mq_tag_set *set)
 		 * affinity), so use the regular blk-mq cpu mapping
 		 */
 		map->queue_offset = qoff;
-		if (i != HCTX_TYPE_POLL && offset)
+		if (managed_irqs && i != HCTX_TYPE_POLL && offset)
 			blk_mq_pci_map_queues(map, to_pci_dev(dev->dev), offset);
 		else
 			blk_mq_map_queues(map);
@@ -2180,6 +2185,9 @@ static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs)
 	struct nvme_dev *dev = affd->priv;
 	unsigned int nr_read_queues, nr_write_queues = dev->nr_write_queues;
 
+	if (!nrirqs)
+		nrirqs = affd->post_vectors;
+
 	/*
 	 * If there is no interrupt available for queues, ensure that
 	 * the default queue is set to 1. The affinity set size is
@@ -2226,6 +2234,9 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
 	poll_queues = min(dev->nr_poll_queues, nr_io_queues - 1);
 	dev->io_queues[HCTX_TYPE_POLL] = poll_queues;
 
+	if (!managed_irqs)
+		affd.post_vectors = nr_io_queues - poll_queues;
+
 	/*
 	 * Initialize for the single interrupt case, will be updated in
 	 * nvme_calc_irq_sets().
-- 
2.43.0



^ permalink raw reply related	[relevance 5%]

* [GIT PULL] nvme fixes for Linux 6.9
@ 2024-05-09 17:48  5% Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-05-09 17:48 UTC (permalink / raw)
  To: axboe; +Cc: hch, sagi, linux-nvme

The following changes since commit ffd379c13fc0ab2c7c4313e7a01c71d9d202cc88:

  block: set default max segment size in case of virt_boundary (2024-05-06 20:27:51 -0600)

are available in the Git repository at:

  git://git.infradead.org/nvme.git tags/nvme-6.9-2024-05-09

for you to fetch changes up to 73964c1d07c054376f1b32a62548571795159148:

  nvmet-rdma: fix possible bad dereference when freeing rsps (2024-05-08 06:17:01 -0700)

----------------------------------------------------------------
nvme fixes for Linux 6.9

 - nvme target fixes (Sagi, Dan, Maurizo)
 - new vendor quirk for broken MSI (Sean)

----------------------------------------------------------------
Dan Carpenter (1):
      nvmet: prevent sprintf() overflow in nvmet_subsys_nsid_exists()

Maurizio Lombardi (1):
      nvmet-auth: return the error code to the nvmet_auth_ctrl_hash() callers

Sagi Grimberg (2):
      nvmet: make nvmet_wq unbound
      nvmet-rdma: fix possible bad dereference when freeing rsps

Sean Anderson (1):
      nvme-pci: Add quirk for broken MSIs

 drivers/nvme/host/nvme.h       |  5 +++++
 drivers/nvme/host/pci.c        | 14 +++++++++++---
 drivers/nvme/target/auth.c     |  2 +-
 drivers/nvme/target/configfs.c |  5 ++---
 drivers/nvme/target/core.c     |  3 ++-
 drivers/nvme/target/rdma.c     | 16 ++++------------
 6 files changed, 25 insertions(+), 20 deletions(-)


^ permalink raw reply	[relevance 5%]

* Re: WQ_UNBOUND workqueue warnings from multiple drivers
  @ 2024-05-08 23:16  5%         ` Kamaljit Singh
  0 siblings, 0 replies; 200+ results
From: Kamaljit Singh @ 2024-05-08 23:16 UTC (permalink / raw)
  To: Sagi Grimberg, Chaitanya Kulkarni; +Cc: kbusch, linux-kernel, linux-nvme

Sagi,

>Does this happen with a 90-%100% read workload?
Yes, we’ve now seen it with 100% reads as well. Here’s the Medusa cmd we used. I’ve removed the devices for brevity.
sudo /opt/medusa_labs/test_tools/bin/maim 20g -b8K -Q128 -Y1 -M30 --full-device -B3 -r -d900000 <device_list>
 
We saw the original issue with the upstream kernel v6.6.21. But now we’re also seeing it with Ubuntu 24.04 (kernel 6.8.0-31-generic), where IOs are timing out and forcing connection drops.
 
 
>Question, are you working with a Linux controller?
No, with our ASIC (NVMe Fabrics bridge).
 
>what is the ctrl ioccsz?
ioccsz    : 4
 
 
Thanks,
Kamaljit
 
From: Sagi Grimberg <sagi@grimberg.me>
Date: Sunday, April 7, 2024 at 13:08
To: Kamaljit Singh <Kamaljit.Singh1@wdc.com>, Chaitanya Kulkarni <chaitanyak@nvidia.com>
Cc: kbusch@kernel.org <kbusch@kernel.org>, linux-kernel@vger.kernel.org <linux-kernel@vger.kernel.org>, linux-nvme@lists.infradead.org <linux-nvme@lists.infradead.org>
Subject: Re: WQ_UNBOUND workqueue warnings from multiple drivers
CAUTION: This email originated from outside of Western Digital. Do not click on links or open attachments unless you recognize the sender and know that the content is safe.


On 03/04/2024 2:50, Kamaljit Singh wrote:
> Sagi, Chaitanya,
>
> Sorry for the delay, found your replies in the junk folder :(
>
>>   Was the test you were running read-heavy?
> No, most of the failing fio tests were doing heavy writes. All were with 8 Controllers and 32 NS each. io-specs are below.
>
> [1] bs=16k, iodepth=16, rwmixread=0, numjobs=16
> Failed in ~1 min
>
> Some others were:
> [2] bs=8k, iodepth=16, rwmixread=5, numjobs=16
> [3] bs=8k, iodepth=16, rwmixread=50, numjobs=16

Interesting, that is the opposite of what I would suspect (I thought that
the workload would be read-only or read-mostly).

Does this happen with a 90-%100% read workload?

If we look at nvme_tcp_io_work() it is essentially looping
doing send() and recv() and every iteration checks if a 1ms
deadline elapsed. The fact that it happens on a 100% write
workload leads me to conclude that the only way this can
happen if sending a single 16K request to a controller on its
own takes more than 10ms, which is unexpected...

Question, are you working with a Linux controller? what
is the ctrl ioccsz?

^ permalink raw reply	[relevance 5%]

* Re: [PATCH v2] nvmet-rdma: fix possible bad dereference when freeing rsps
  @ 2024-05-08 13:20  5% ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-05-08 13:20 UTC (permalink / raw)
  To: Sagi Grimberg; +Cc: linux-nvme, Christoph Hellwig, Chaitanya Kulkarni

On Wed, May 08, 2024 at 10:53:06AM +0300, Sagi Grimberg wrote:
> It is possible that the host connected and saw a cm established
> event and started sending nvme capsules on the qp, however the
> ctrl did not yet see an established event. This is why the
> rsp_wait_list exists (for async handling of these cmds, we move
> them to a pending list).
> 
> Furthermore, it is possible that the ctrl cm times out, resulting
> in a connect-error cm event. in this case we hit a bad deref [1]
> because in nvmet_rdma_free_rsps we assume that all the responses
> are in the free list.
> 
> We are freeing the cmds array anyways, so don't even bother to
> remove the rsp from the free_list. It is also guaranteed that we
> are not racing anything when we are releasing the queue so no
> other context accessing this array should be running.
> 
> [1]:
> --
> Workqueue: nvmet-free-wq nvmet_rdma_free_queue_work [nvmet_rdma]
> [...]
> pc : nvmet_rdma_free_rsps+0x78/0xb8 [nvmet_rdma]
> lr : nvmet_rdma_free_queue_work+0x88/0x120 [nvmet_rdma]
>  Call trace:
>  nvmet_rdma_free_rsps+0x78/0xb8 [nvmet_rdma]
>  nvmet_rdma_free_queue_work+0x88/0x120 [nvmet_rdma]
>  process_one_work+0x1ec/0x4a0
>  worker_thread+0x48/0x490
>  kthread+0x158/0x160
>  ret_from_fork+0x10/0x18
> --
> 
> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>

Thanks, applied to nvme-6.9.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH] nvmet: prevent sprintf() overflow in nvmet_subsys_nsid_exists()
  @ 2024-05-08 13:13  5% ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-05-08 13:13 UTC (permalink / raw)
  To: Dan Carpenter
  Cc: Sagi Grimberg, Christoph Hellwig, Chaitanya Kulkarni, linux-nvme,
	linux-kernel, kernel-janitors

On Wed, May 08, 2024 at 10:43:04AM +0300, Dan Carpenter wrote:
> The nsid value is a u32 that comes from nvmet_req_find_ns().  It's
> endian data and we're on an error path and both of those raise red
> flags.  So let's make this safer.
> 
> 1) Make the buffer large enough for any u32.
> 2) Remove the unnecessary initialization.
> 3) Use snprintf() instead of sprintf() for even more safety.
> 4) The sprintf() function returns the number of bytes printed, not
>    counting the NUL terminator. It is impossible for the return value to
>    be <= 0 so delete that.
> 
> Fixes: 505363957fad ("nvmet: fix nvme status code when namespace is disabled")
> Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>

Thanks, applied to nvme-6.9.


^ permalink raw reply	[relevance 5%]

* [PATCH AUTOSEL 5.4 6/6] nvme: find numa distance only if controller has valid numa id
       [not found]     <20240507231424.395315-1-sashal@kernel.org>
@ 2024-05-07 23:14  4% ` Sasha Levin
  0 siblings, 0 replies; 200+ results
From: Sasha Levin @ 2024-05-07 23:14 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Nilay Shroff, Christoph Hellwig, Sagi Grimberg,
	Chaitanya Kulkarni, Keith Busch, Sasha Levin, linux-nvme

From: Nilay Shroff <nilay@linux.ibm.com>

[ Upstream commit 863fe60ed27f2c85172654a63c5b827e72c8b2e6 ]

On a system where native nvme multipath is configured and iopolicy
is set to numa, but the nvme controller numa node id is undefined
or -1 (NUMA_NO_NODE), avoid calculating the node distance for
finding the optimal io path. In such a case we may access the numa
distance table with an invalid index, which may potentially refer to
incorrect memory. So this patch ensures that if the nvme controller
numa node id is -1 then, instead of calculating the node distance for
finding the optimal io path, we set the numa node distance of such a
controller to the default of 10 (LOCAL_DISTANCE).

Link: https://lore.kernel.org/all/20240413090614.678353-1-nilay@linux.ibm.com/
Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/nvme/host/multipath.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 811f7b96b5517..4f3220aef7c47 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -180,7 +180,8 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
 		if (nvme_path_is_disabled(ns))
 			continue;
 
-		if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
+		if (ns->ctrl->numa_node != NUMA_NO_NODE &&
+		    READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
 			distance = node_distance(node, ns->ctrl->numa_node);
 		else
 			distance = LOCAL_DISTANCE;
-- 
2.43.0



^ permalink raw reply related	[relevance 4%]

* [PATCH AUTOSEL 5.10 9/9] nvme: find numa distance only if controller has valid numa id
       [not found]     <20240507231406.395123-1-sashal@kernel.org>
@ 2024-05-07 23:14  4% ` Sasha Levin
  0 siblings, 0 replies; 200+ results
From: Sasha Levin @ 2024-05-07 23:14 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Nilay Shroff, Christoph Hellwig, Sagi Grimberg,
	Chaitanya Kulkarni, Keith Busch, Sasha Levin, linux-nvme

From: Nilay Shroff <nilay@linux.ibm.com>

[ Upstream commit 863fe60ed27f2c85172654a63c5b827e72c8b2e6 ]

On a system where native nvme multipath is configured and iopolicy
is set to numa, but the nvme controller numa node id is undefined
or -1 (NUMA_NO_NODE), avoid calculating the node distance for
finding the optimal io path. In such a case we may access the numa
distance table with an invalid index, which may potentially refer to
incorrect memory. So this patch ensures that if the nvme controller
numa node id is -1 then, instead of calculating the node distance for
finding the optimal io path, we set the numa node distance of such a
controller to the default of 10 (LOCAL_DISTANCE).

Link: https://lore.kernel.org/all/20240413090614.678353-1-nilay@linux.ibm.com/
Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/nvme/host/multipath.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 379d6818a0635..9f59f93b70e26 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -168,7 +168,8 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
 		if (nvme_path_is_disabled(ns))
 			continue;
 
-		if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
+		if (ns->ctrl->numa_node != NUMA_NO_NODE &&
+		    READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
 			distance = node_distance(node, ns->ctrl->numa_node);
 		else
 			distance = LOCAL_DISTANCE;
-- 
2.43.0



^ permalink raw reply related	[relevance 4%]

* [PATCH AUTOSEL 5.15 14/15] nvme: find numa distance only if controller has valid numa id
       [not found]     <20240507231333.394765-1-sashal@kernel.org>
@ 2024-05-07 23:13  4% ` Sasha Levin
  0 siblings, 0 replies; 200+ results
From: Sasha Levin @ 2024-05-07 23:13 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Nilay Shroff, Christoph Hellwig, Sagi Grimberg,
	Chaitanya Kulkarni, Keith Busch, Sasha Levin, linux-nvme

From: Nilay Shroff <nilay@linux.ibm.com>

[ Upstream commit 863fe60ed27f2c85172654a63c5b827e72c8b2e6 ]

On a system where native nvme multipath is configured and iopolicy
is set to numa, but the nvme controller numa node id is undefined
or -1 (NUMA_NO_NODE), avoid calculating the node distance for
finding the optimal io path. In such a case we may access the numa
distance table with an invalid index, which may potentially refer to
incorrect memory. So this patch ensures that if the nvme controller
numa node id is -1 then, instead of calculating the node distance for
finding the optimal io path, we set the numa node distance of such a
controller to the default of 10 (LOCAL_DISTANCE).

Link: https://lore.kernel.org/all/20240413090614.678353-1-nilay@linux.ibm.com/
Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/nvme/host/multipath.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 73eddb67f0d24..f8ad43b5f0690 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -190,7 +190,8 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
 		if (nvme_path_is_disabled(ns))
 			continue;
 
-		if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
+		if (ns->ctrl->numa_node != NUMA_NO_NODE &&
+		    READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
 			distance = node_distance(node, ns->ctrl->numa_node);
 		else
 			distance = LOCAL_DISTANCE;
-- 
2.43.0



^ permalink raw reply related	[relevance 4%]

* [PATCH AUTOSEL 6.1 24/25] nvmet: fix nvme status code when namespace is disabled
       [not found]     <20240507231231.394219-1-sashal@kernel.org>
                   ` (3 preceding siblings ...)
  2024-05-07 23:12  4% ` [PATCH AUTOSEL 6.1 23/25] nvmet-tcp: fix possible memory leak when tearing down a controller Sasha Levin
@ 2024-05-07 23:12  3% ` Sasha Levin
  4 siblings, 0 replies; 200+ results
From: Sasha Levin @ 2024-05-07 23:12 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Sagi Grimberg, Jirong Feng, Christoph Hellwig, Keith Busch,
	Sasha Levin, kch, linux-nvme

From: Sagi Grimberg <sagi@grimberg.me>

[ Upstream commit 505363957fad35f7aed9a2b0d8dad73451a80fb5 ]

If the user disabled a nvmet namespace, it is removed from the subsystem
namespaces list. When nvmet processes a command directed to an nsid that
was disabled, it cannot differentiate between a nsid that is disabled
vs. a non-existent namespace, and resorts to return NVME_SC_INVALID_NS
with the dnr bit set.

This translates to a non-retryable status for the host, which translates
to a user error. We should expect disabled namespaces to not cause an
I/O error in a multipath environment.

Address this by searching a configfs item for the namespace nvmet failed
to find, and if we found one, conclude that the namespace is disabled
(perhaps temporarily). Return NVME_SC_INTERNAL_PATH_ERROR in this case
and keep DNR bit cleared.

Reported-by: Jirong Feng <jirong.feng@easystack.cn>
Tested-by: Jirong Feng <jirong.feng@easystack.cn>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/nvme/target/configfs.c | 13 +++++++++++++
 drivers/nvme/target/core.c     |  5 ++++-
 drivers/nvme/target/nvmet.h    |  1 +
 3 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index 73ae16059a1cb..b1f5fa45bb4ac 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -615,6 +615,19 @@ static struct configfs_attribute *nvmet_ns_attrs[] = {
 	NULL,
 };
 
+bool nvmet_subsys_nsid_exists(struct nvmet_subsys *subsys, u32 nsid)
+{
+	struct config_item *ns_item;
+	char name[4] = {};
+
+	if (sprintf(name, "%u", nsid) <= 0)
+		return false;
+	mutex_lock(&subsys->namespaces_group.cg_subsys->su_mutex);
+	ns_item = config_group_find_item(&subsys->namespaces_group, name);
+	mutex_unlock(&subsys->namespaces_group.cg_subsys->su_mutex);
+	return ns_item != NULL;
+}
+
 static void nvmet_ns_release(struct config_item *item)
 {
 	struct nvmet_ns *ns = to_nvmet_ns(item);
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 3235baf7cc6b1..7b74926c50f9b 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -423,10 +423,13 @@ void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl)
 u16 nvmet_req_find_ns(struct nvmet_req *req)
 {
 	u32 nsid = le32_to_cpu(req->cmd->common.nsid);
+	struct nvmet_subsys *subsys = nvmet_req_subsys(req);
 
-	req->ns = xa_load(&nvmet_req_subsys(req)->namespaces, nsid);
+	req->ns = xa_load(&subsys->namespaces, nsid);
 	if (unlikely(!req->ns)) {
 		req->error_loc = offsetof(struct nvme_common_command, nsid);
+		if (nvmet_subsys_nsid_exists(subsys, nsid))
+			return NVME_SC_INTERNAL_PATH_ERROR;
 		return NVME_SC_INVALID_NS | NVME_SC_DNR;
 	}
 
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 273cca49a040f..6aee0ce60a4ba 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -527,6 +527,7 @@ void nvmet_subsys_disc_changed(struct nvmet_subsys *subsys,
 		struct nvmet_host *host);
 void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
 		u8 event_info, u8 log_page);
+bool nvmet_subsys_nsid_exists(struct nvmet_subsys *subsys, u32 nsid);
 
 #define NVMET_QUEUE_SIZE	1024
 #define NVMET_NR_QUEUES		128
-- 
2.43.0



^ permalink raw reply related	[relevance 3%]

* [PATCH AUTOSEL 6.1 23/25] nvmet-tcp: fix possible memory leak when tearing down a controller
       [not found]     <20240507231231.394219-1-sashal@kernel.org>
                   ` (2 preceding siblings ...)
  2024-05-07 23:12  4% ` [PATCH AUTOSEL 6.1 22/25] nvmet-auth: replace pr_debug() with pr_err() to report an error Sasha Levin
@ 2024-05-07 23:12  4% ` Sasha Levin
  2024-05-07 23:12  3% ` [PATCH AUTOSEL 6.1 24/25] nvmet: fix nvme status code when namespace is disabled Sasha Levin
  4 siblings, 0 replies; 200+ results
From: Sasha Levin @ 2024-05-07 23:12 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Sagi Grimberg, Yi Zhang, Christoph Hellwig, Keith Busch,
	Sasha Levin, kch, linux-nvme

From: Sagi Grimberg <sagi@grimberg.me>

[ Upstream commit 6825bdde44340c5a9121f6d6fa25cc885bd9e821 ]

When we teardown the controller, we wait for pending I/Os to complete
(sq->ref on all queues to drop to zero) and then we go over the commands,
and free their command buffers in case they are still fetching data from
the host (e.g. processing nvme writes) and have yet to take a reference
on the sq.

However, we may miss the case where commands have failed before executing
and are queued for sending a response, which will never occur because the
queue socket is already down. In this case we may miss deallocating command
buffers.

Solve this by freeing all command buffers, as nvmet_tcp_free_cmd_buffers is
idempotent anyway.

Reported-by: Yi Zhang <yi.zhang@redhat.com>
Tested-by: Yi Zhang <yi.zhang@redhat.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/nvme/target/tcp.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 3480768274699..5556f55880411 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -297,6 +297,7 @@ static int nvmet_tcp_check_ddgst(struct nvmet_tcp_queue *queue, void *pdu)
 	return 0;
 }
 
+/* If cmd buffers are NULL, no operation is performed */
 static void nvmet_tcp_free_cmd_buffers(struct nvmet_tcp_cmd *cmd)
 {
 	kfree(cmd->iov);
@@ -1437,13 +1438,9 @@ static void nvmet_tcp_free_cmd_data_in_buffers(struct nvmet_tcp_queue *queue)
 	struct nvmet_tcp_cmd *cmd = queue->cmds;
 	int i;
 
-	for (i = 0; i < queue->nr_cmds; i++, cmd++) {
-		if (nvmet_tcp_need_data_in(cmd))
-			nvmet_tcp_free_cmd_buffers(cmd);
-	}
-
-	if (!queue->nr_cmds && nvmet_tcp_need_data_in(&queue->connect))
-		nvmet_tcp_free_cmd_buffers(&queue->connect);
+	for (i = 0; i < queue->nr_cmds; i++, cmd++)
+		nvmet_tcp_free_cmd_buffers(cmd);
+	nvmet_tcp_free_cmd_buffers(&queue->connect);
 }
 
 static void nvmet_tcp_release_queue_work(struct work_struct *w)
-- 
2.43.0



^ permalink raw reply related	[relevance 4%]

* [PATCH AUTOSEL 6.1 22/25] nvmet-auth: replace pr_debug() with pr_err() to report an error.
       [not found]     <20240507231231.394219-1-sashal@kernel.org>
  2024-05-07 23:12  4% ` [PATCH AUTOSEL 6.1 20/25] nvme: find numa distance only if controller has valid numa id Sasha Levin
  2024-05-07 23:12  4% ` [PATCH AUTOSEL 6.1 21/25] nvmet-auth: return the error code to the nvmet_auth_host_hash() callers Sasha Levin
@ 2024-05-07 23:12  4% ` Sasha Levin
  2024-05-07 23:12  4% ` [PATCH AUTOSEL 6.1 23/25] nvmet-tcp: fix possible memory leak when tearing down a controller Sasha Levin
  2024-05-07 23:12  3% ` [PATCH AUTOSEL 6.1 24/25] nvmet: fix nvme status code when namespace is disabled Sasha Levin
  4 siblings, 0 replies; 200+ results
From: Sasha Levin @ 2024-05-07 23:12 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Maurizio Lombardi, Sagi Grimberg, Chaitanya Kulkarni,
	Keith Busch, Sasha Levin, hare, linux-nvme

From: Maurizio Lombardi <mlombard@redhat.com>

[ Upstream commit 445f9119e70368ccc964575c2a6d3176966a9d65 ]

In nvmet_auth_host_hash(), if a mismatch is detected in the hash length
the kernel should print an error.

Signed-off-by: Maurizio Lombardi <mlombard@redhat.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/nvme/target/auth.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c
index 1f7d492c4dc26..e900525b78665 100644
--- a/drivers/nvme/target/auth.c
+++ b/drivers/nvme/target/auth.c
@@ -284,9 +284,9 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response,
 	}
 
 	if (shash_len != crypto_shash_digestsize(shash_tfm)) {
-		pr_debug("%s: hash len mismatch (len %d digest %d)\n",
-			 __func__, shash_len,
-			 crypto_shash_digestsize(shash_tfm));
+		pr_err("%s: hash len mismatch (len %d digest %d)\n",
+			__func__, shash_len,
+			crypto_shash_digestsize(shash_tfm));
 		ret = -EINVAL;
 		goto out_free_tfm;
 	}
-- 
2.43.0



^ permalink raw reply related	[relevance 4%]

* [PATCH AUTOSEL 6.1 21/25] nvmet-auth: return the error code to the nvmet_auth_host_hash() callers
       [not found]     <20240507231231.394219-1-sashal@kernel.org>
  2024-05-07 23:12  4% ` [PATCH AUTOSEL 6.1 20/25] nvme: find numa distance only if controller has valid numa id Sasha Levin
@ 2024-05-07 23:12  4% ` Sasha Levin
  2024-05-07 23:12  4% ` [PATCH AUTOSEL 6.1 22/25] nvmet-auth: replace pr_debug() with pr_err() to report an error Sasha Levin
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 200+ results
From: Sasha Levin @ 2024-05-07 23:12 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Maurizio Lombardi, Sagi Grimberg, Chaitanya Kulkarni,
	Keith Busch, Sasha Levin, hare, linux-nvme

From: Maurizio Lombardi <mlombard@redhat.com>

[ Upstream commit 46b8f9f74f6d500871985e22eb19560b21f3bc81 ]

If the nvmet_auth_host_hash() function fails, the error code should
be returned to its callers.

Signed-off-by: Maurizio Lombardi <mlombard@redhat.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/nvme/target/auth.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c
index 4dcddcf95279b..1f7d492c4dc26 100644
--- a/drivers/nvme/target/auth.c
+++ b/drivers/nvme/target/auth.c
@@ -368,7 +368,7 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response,
 	kfree_sensitive(host_response);
 out_free_tfm:
 	crypto_free_shash(shash_tfm);
-	return 0;
+	return ret;
 }
 
 int nvmet_auth_ctrl_hash(struct nvmet_req *req, u8 *response,
-- 
2.43.0



^ permalink raw reply related	[relevance 4%]

* [PATCH AUTOSEL 6.1 20/25] nvme: find numa distance only if controller has valid numa id
       [not found]     <20240507231231.394219-1-sashal@kernel.org>
@ 2024-05-07 23:12  4% ` Sasha Levin
  2024-05-07 23:12  4% ` [PATCH AUTOSEL 6.1 21/25] nvmet-auth: return the error code to the nvmet_auth_host_hash() callers Sasha Levin
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 200+ results
From: Sasha Levin @ 2024-05-07 23:12 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Nilay Shroff, Christoph Hellwig, Sagi Grimberg,
	Chaitanya Kulkarni, Keith Busch, Sasha Levin, linux-nvme

From: Nilay Shroff <nilay@linux.ibm.com>

[ Upstream commit 863fe60ed27f2c85172654a63c5b827e72c8b2e6 ]

On a system where native nvme multipath is configured and iopolicy
is set to numa, but the nvme controller numa node id is undefined
or -1 (NUMA_NO_NODE), avoid calculating the node distance for
finding the optimal io path. In such a case we may access the numa
distance table with an invalid index, which may potentially refer to
incorrect memory. So this patch ensures that if the nvme controller
numa node id is -1 then, instead of calculating the node distance for
finding the optimal io path, we set the numa node distance of such a
controller to the default of 10 (LOCAL_DISTANCE).

Link: https://lore.kernel.org/all/20240413090614.678353-1-nilay@linux.ibm.com/
Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/nvme/host/multipath.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index f96d330d39641..6cf0ce7aff678 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -213,7 +213,8 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
 		if (nvme_path_is_disabled(ns))
 			continue;
 
-		if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
+		if (ns->ctrl->numa_node != NUMA_NO_NODE &&
+		    READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
 			distance = node_distance(node, ns->ctrl->numa_node);
 		else
 			distance = LOCAL_DISTANCE;
-- 
2.43.0



^ permalink raw reply related	[relevance 4%]

* [PATCH AUTOSEL 6.6 42/43] nvmet: fix nvme status code when namespace is disabled
       [not found]     <20240507231033.393285-1-sashal@kernel.org>
                   ` (4 preceding siblings ...)
  2024-05-07 23:10  4% ` [PATCH AUTOSEL 6.6 41/43] nvmet-tcp: fix possible memory leak when tearing down a controller Sasha Levin
@ 2024-05-07 23:10  3% ` Sasha Levin
  5 siblings, 0 replies; 200+ results
From: Sasha Levin @ 2024-05-07 23:10 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Sagi Grimberg, Jirong Feng, Christoph Hellwig, Keith Busch,
	Sasha Levin, kch, linux-nvme

From: Sagi Grimberg <sagi@grimberg.me>

[ Upstream commit 505363957fad35f7aed9a2b0d8dad73451a80fb5 ]

If the user disabled a nvmet namespace, it is removed from the subsystem
namespaces list. When nvmet processes a command directed to an nsid that
was disabled, it cannot differentiate between a nsid that is disabled
vs. a non-existent namespace, and resorts to return NVME_SC_INVALID_NS
with the dnr bit set.

This translates to a non-retryable status for the host, which translates
to a user error. We should expect disabled namespaces to not cause an
I/O error in a multipath environment.

Address this by searching a configfs item for the namespace nvmet failed
to find, and if we found one, conclude that the namespace is disabled
(perhaps temporarily). Return NVME_SC_INTERNAL_PATH_ERROR in this case
and keep DNR bit cleared.

Reported-by: Jirong Feng <jirong.feng@easystack.cn>
Tested-by: Jirong Feng <jirong.feng@easystack.cn>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/nvme/target/configfs.c | 13 +++++++++++++
 drivers/nvme/target/core.c     |  5 ++++-
 drivers/nvme/target/nvmet.h    |  1 +
 3 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index 01b2a3d1a5e6c..3670a1103863b 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -616,6 +616,19 @@ static struct configfs_attribute *nvmet_ns_attrs[] = {
 	NULL,
 };
 
+bool nvmet_subsys_nsid_exists(struct nvmet_subsys *subsys, u32 nsid)
+{
+	struct config_item *ns_item;
+	char name[4] = {};
+
+	if (sprintf(name, "%u", nsid) <= 0)
+		return false;
+	mutex_lock(&subsys->namespaces_group.cg_subsys->su_mutex);
+	ns_item = config_group_find_item(&subsys->namespaces_group, name);
+	mutex_unlock(&subsys->namespaces_group.cg_subsys->su_mutex);
+	return ns_item != NULL;
+}
+
 static void nvmet_ns_release(struct config_item *item)
 {
 	struct nvmet_ns *ns = to_nvmet_ns(item);
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 3935165048e74..ce7e945cb4f7e 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -425,10 +425,13 @@ void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl)
 u16 nvmet_req_find_ns(struct nvmet_req *req)
 {
 	u32 nsid = le32_to_cpu(req->cmd->common.nsid);
+	struct nvmet_subsys *subsys = nvmet_req_subsys(req);
 
-	req->ns = xa_load(&nvmet_req_subsys(req)->namespaces, nsid);
+	req->ns = xa_load(&subsys->namespaces, nsid);
 	if (unlikely(!req->ns)) {
 		req->error_loc = offsetof(struct nvme_common_command, nsid);
+		if (nvmet_subsys_nsid_exists(subsys, nsid))
+			return NVME_SC_INTERNAL_PATH_ERROR;
 		return NVME_SC_INVALID_NS | NVME_SC_DNR;
 	}
 
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 8cfd60f3b5648..15b00ed7be16a 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -530,6 +530,7 @@ void nvmet_subsys_disc_changed(struct nvmet_subsys *subsys,
 		struct nvmet_host *host);
 void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
 		u8 event_info, u8 log_page);
+bool nvmet_subsys_nsid_exists(struct nvmet_subsys *subsys, u32 nsid);
 
 #define NVMET_QUEUE_SIZE	1024
 #define NVMET_NR_QUEUES		128
-- 
2.43.0



^ permalink raw reply related	[relevance 3%]

* [PATCH AUTOSEL 6.6 41/43] nvmet-tcp: fix possible memory leak when tearing down a controller
       [not found]     <20240507231033.393285-1-sashal@kernel.org>
                   ` (3 preceding siblings ...)
  2024-05-07 23:10  3% ` [PATCH AUTOSEL 6.6 40/43] nvme: cancel pending I/O if nvme controller is in terminal state Sasha Levin
@ 2024-05-07 23:10  4% ` Sasha Levin
  2024-05-07 23:10  3% ` [PATCH AUTOSEL 6.6 42/43] nvmet: fix nvme status code when namespace is disabled Sasha Levin
  5 siblings, 0 replies; 200+ results
From: Sasha Levin @ 2024-05-07 23:10 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Sagi Grimberg, Yi Zhang, Christoph Hellwig, Keith Busch,
	Sasha Levin, kch, linux-nvme

From: Sagi Grimberg <sagi@grimberg.me>

[ Upstream commit 6825bdde44340c5a9121f6d6fa25cc885bd9e821 ]

When we teardown the controller, we wait for pending I/Os to complete
(sq->ref on all queues to drop to zero) and then we go over the commands,
and free their command buffers in case they are still fetching data from
the host (e.g. processing nvme writes) and have yet to take a reference
on the sq.

However, we may miss the case where commands have failed before executing
and are queued for sending a response, which will never occur because the
queue socket is already down. In this case we may miss deallocating command
buffers.

Solve this by freeing all command buffers, as nvmet_tcp_free_cmd_buffers is
idempotent anyway.

Reported-by: Yi Zhang <yi.zhang@redhat.com>
Tested-by: Yi Zhang <yi.zhang@redhat.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/nvme/target/tcp.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 8e5d547aa16cb..3d302815c6f36 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -324,6 +324,7 @@ static int nvmet_tcp_check_ddgst(struct nvmet_tcp_queue *queue, void *pdu)
 	return 0;
 }
 
+/* If cmd buffers are NULL, no operation is performed */
 static void nvmet_tcp_free_cmd_buffers(struct nvmet_tcp_cmd *cmd)
 {
 	kfree(cmd->iov);
@@ -1476,13 +1477,9 @@ static void nvmet_tcp_free_cmd_data_in_buffers(struct nvmet_tcp_queue *queue)
 	struct nvmet_tcp_cmd *cmd = queue->cmds;
 	int i;
 
-	for (i = 0; i < queue->nr_cmds; i++, cmd++) {
-		if (nvmet_tcp_need_data_in(cmd))
-			nvmet_tcp_free_cmd_buffers(cmd);
-	}
-
-	if (!queue->nr_cmds && nvmet_tcp_need_data_in(&queue->connect))
-		nvmet_tcp_free_cmd_buffers(&queue->connect);
+	for (i = 0; i < queue->nr_cmds; i++, cmd++)
+		nvmet_tcp_free_cmd_buffers(cmd);
+	nvmet_tcp_free_cmd_buffers(&queue->connect);
 }
 
 static void nvmet_tcp_release_queue_work(struct work_struct *w)
-- 
2.43.0



^ permalink raw reply related	[relevance 4%]

* [PATCH AUTOSEL 6.6 40/43] nvme: cancel pending I/O if nvme controller is in terminal state
       [not found]     <20240507231033.393285-1-sashal@kernel.org>
                   ` (2 preceding siblings ...)
  2024-05-07 23:10  4% ` [PATCH AUTOSEL 6.6 39/43] nvmet-auth: replace pr_debug() with pr_err() to report an error Sasha Levin
@ 2024-05-07 23:10  3% ` Sasha Levin
  2024-05-07 23:10  4% ` [PATCH AUTOSEL 6.6 41/43] nvmet-tcp: fix possible memory leak when tearing down a controller Sasha Levin
  2024-05-07 23:10  3% ` [PATCH AUTOSEL 6.6 42/43] nvmet: fix nvme status code when namespace is disabled Sasha Levin
  5 siblings, 0 replies; 200+ results
From: Sasha Levin @ 2024-05-07 23:10 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Nilay Shroff, Sagi Grimberg, Keith Busch, Sasha Levin, linux-nvme

From: Nilay Shroff <nilay@linux.ibm.com>

[ Upstream commit 25bb3534ee21e39eb9301c4edd7182eb83cb0d07 ]

While I/O is running, if a pci bus error occurs then
in-flight I/O cannot complete. Worse, if at this time the
user (logically) hot-unplugs the nvme disk, then the
nvme_remove() code path can't make forward progress until
the in-flight I/O is cancelled. So this sequence of events
may potentially hang the hot-unplug code path indefinitely.
This patch helps cancel the pending/in-flight I/O from the
nvme request timeout handler in case the nvme controller
is in a terminal (DEAD/DELETING/DELETING_NOIO) state, and
that helps the nvme_remove() code path make forward progress
and finish successfully.

Link: https://lore.kernel.org/all/199be893-5dfa-41e5-b6f2-40ac90ebccc4@linux.ibm.com/
Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/nvme/host/core.c | 21 ---------------------
 drivers/nvme/host/nvme.h | 21 +++++++++++++++++++++
 drivers/nvme/host/pci.c  |  8 +++++++-
 3 files changed, 28 insertions(+), 22 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 012c8b3f5f9c9..02d9d1b973494 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -587,27 +587,6 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
 }
 EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
 
-/*
- * Returns true for sink states that can't ever transition back to live.
- */
-static bool nvme_state_terminal(struct nvme_ctrl *ctrl)
-{
-	switch (nvme_ctrl_state(ctrl)) {
-	case NVME_CTRL_NEW:
-	case NVME_CTRL_LIVE:
-	case NVME_CTRL_RESETTING:
-	case NVME_CTRL_CONNECTING:
-		return false;
-	case NVME_CTRL_DELETING:
-	case NVME_CTRL_DELETING_NOIO:
-	case NVME_CTRL_DEAD:
-		return true;
-	default:
-		WARN_ONCE(1, "Unhandled ctrl state:%d", ctrl->state);
-		return true;
-	}
-}
-
 /*
  * Waits for the controller state to be resetting, or returns false if it is
  * not possible to ever transition to that state.
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index ba62d42d2a8b7..43cff851ac5ae 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -735,6 +735,27 @@ static inline bool nvme_is_aen_req(u16 qid, __u16 command_id)
 		nvme_tag_from_cid(command_id) >= NVME_AQ_BLK_MQ_DEPTH;
 }
 
+/*
+ * Returns true for sink states that can't ever transition back to live.
+ */
+static inline bool nvme_state_terminal(struct nvme_ctrl *ctrl)
+{
+	switch (nvme_ctrl_state(ctrl)) {
+	case NVME_CTRL_NEW:
+	case NVME_CTRL_LIVE:
+	case NVME_CTRL_RESETTING:
+	case NVME_CTRL_CONNECTING:
+		return false;
+	case NVME_CTRL_DELETING:
+	case NVME_CTRL_DELETING_NOIO:
+	case NVME_CTRL_DEAD:
+		return true;
+	default:
+		WARN_ONCE(1, "Unhandled ctrl state:%d", ctrl->state);
+		return true;
+	}
+}
+
 void nvme_complete_rq(struct request *req);
 void nvme_complete_batch_req(struct request *req);
 
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index b985142fb84b9..4352206533ede 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1286,6 +1286,9 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
 	struct nvme_command cmd = { };
 	u32 csts = readl(dev->bar + NVME_REG_CSTS);
 
+	if (nvme_state_terminal(&dev->ctrl))
+		goto disable;
+
 	/* If PCI error recovery process is happening, we cannot reset or
 	 * the recovery mechanism will surely fail.
 	 */
@@ -1388,8 +1391,11 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
 	return BLK_EH_RESET_TIMER;
 
 disable:
-	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING))
+	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING)) {
+		if (nvme_state_terminal(&dev->ctrl))
+			nvme_dev_disable(dev, true);
 		return BLK_EH_DONE;
+	}
 
 	nvme_dev_disable(dev, false);
 	if (nvme_try_sched_reset(&dev->ctrl))
-- 
2.43.0



^ permalink raw reply related	[relevance 3%]

* [PATCH AUTOSEL 6.6 39/43] nvmet-auth: replace pr_debug() with pr_err() to report an error.
       [not found]     <20240507231033.393285-1-sashal@kernel.org>
  2024-05-07 23:09  4% ` [PATCH AUTOSEL 6.6 37/43] nvme: find numa distance only if controller has valid numa id Sasha Levin
  2024-05-07 23:09  4% ` [PATCH AUTOSEL 6.6 38/43] nvmet-auth: return the error code to the nvmet_auth_host_hash() callers Sasha Levin
@ 2024-05-07 23:10  4% ` Sasha Levin
  2024-05-07 23:10  3% ` [PATCH AUTOSEL 6.6 40/43] nvme: cancel pending I/O if nvme controller is in terminal state Sasha Levin
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 200+ results
From: Sasha Levin @ 2024-05-07 23:10 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Maurizio Lombardi, Sagi Grimberg, Chaitanya Kulkarni,
	Keith Busch, Sasha Levin, hare, linux-nvme

From: Maurizio Lombardi <mlombard@redhat.com>

[ Upstream commit 445f9119e70368ccc964575c2a6d3176966a9d65 ]

In nvmet_auth_host_hash(), if a mismatch is detected in the hash length
the kernel should print an error.

Signed-off-by: Maurizio Lombardi <mlombard@redhat.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/nvme/target/auth.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c
index 1f7d492c4dc26..e900525b78665 100644
--- a/drivers/nvme/target/auth.c
+++ b/drivers/nvme/target/auth.c
@@ -284,9 +284,9 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response,
 	}
 
 	if (shash_len != crypto_shash_digestsize(shash_tfm)) {
-		pr_debug("%s: hash len mismatch (len %d digest %d)\n",
-			 __func__, shash_len,
-			 crypto_shash_digestsize(shash_tfm));
+		pr_err("%s: hash len mismatch (len %d digest %d)\n",
+			__func__, shash_len,
+			crypto_shash_digestsize(shash_tfm));
 		ret = -EINVAL;
 		goto out_free_tfm;
 	}
-- 
2.43.0



^ permalink raw reply related	[relevance 4%]

* [PATCH AUTOSEL 6.6 38/43] nvmet-auth: return the error code to the nvmet_auth_host_hash() callers
       [not found]     <20240507231033.393285-1-sashal@kernel.org>
  2024-05-07 23:09  4% ` [PATCH AUTOSEL 6.6 37/43] nvme: find numa distance only if controller has valid numa id Sasha Levin
@ 2024-05-07 23:09  4% ` Sasha Levin
  2024-05-07 23:10  4% ` [PATCH AUTOSEL 6.6 39/43] nvmet-auth: replace pr_debug() with pr_err() to report an error Sasha Levin
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 200+ results
From: Sasha Levin @ 2024-05-07 23:09 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Maurizio Lombardi, Sagi Grimberg, Chaitanya Kulkarni,
	Keith Busch, Sasha Levin, hare, linux-nvme

From: Maurizio Lombardi <mlombard@redhat.com>

[ Upstream commit 46b8f9f74f6d500871985e22eb19560b21f3bc81 ]

If the nvmet_auth_host_hash() function fails, the error code should
be returned to its callers.

Signed-off-by: Maurizio Lombardi <mlombard@redhat.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/nvme/target/auth.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c
index 4dcddcf95279b..1f7d492c4dc26 100644
--- a/drivers/nvme/target/auth.c
+++ b/drivers/nvme/target/auth.c
@@ -368,7 +368,7 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response,
 	kfree_sensitive(host_response);
 out_free_tfm:
 	crypto_free_shash(shash_tfm);
-	return 0;
+	return ret;
 }
 
 int nvmet_auth_ctrl_hash(struct nvmet_req *req, u8 *response,
-- 
2.43.0



^ permalink raw reply related	[relevance 4%]

* [PATCH AUTOSEL 6.6 37/43] nvme: find numa distance only if controller has valid numa id
       [not found]     <20240507231033.393285-1-sashal@kernel.org>
@ 2024-05-07 23:09  4% ` Sasha Levin
  2024-05-07 23:09  4% ` [PATCH AUTOSEL 6.6 38/43] nvmet-auth: return the error code to the nvmet_auth_host_hash() callers Sasha Levin
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 200+ results
From: Sasha Levin @ 2024-05-07 23:09 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Nilay Shroff, Christoph Hellwig, Sagi Grimberg,
	Chaitanya Kulkarni, Keith Busch, Sasha Levin, linux-nvme

From: Nilay Shroff <nilay@linux.ibm.com>

[ Upstream commit 863fe60ed27f2c85172654a63c5b827e72c8b2e6 ]

On system where native nvme multipath is configured and iopolicy
is set to numa but the nvme controller numa node id is undefined
or -1 (NUMA_NO_NODE) then avoid calculating node distance for
finding the optimal io path. In such a case we may access the numa
distance table with an invalid index, which may potentially refer to
incorrect memory. So this patch ensures that if the nvme controller numa node
id is -1 then instead of calculating node distance for finding optimal
io path, we set the numa node distance of such controller to default 10
(LOCAL_DISTANCE).

Link: https://lore.kernel.org/all/20240413090614.678353-1-nilay@linux.ibm.com/
Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/nvme/host/multipath.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 0a88d7bdc5e37..b39553b8378b5 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -246,7 +246,8 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
 		if (nvme_path_is_disabled(ns))
 			continue;
 
-		if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
+		if (ns->ctrl->numa_node != NUMA_NO_NODE &&
+		    READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
 			distance = node_distance(node, ns->ctrl->numa_node);
 		else
 			distance = LOCAL_DISTANCE;
-- 
2.43.0



^ permalink raw reply related	[relevance 4%]

* [PATCH AUTOSEL 6.8 51/52] nvme-tcp: strict pdu pacing to avoid send stalls on TLS
       [not found]     <20240507230800.392128-1-sashal@kernel.org>
                   ` (5 preceding siblings ...)
  2024-05-07 23:07  3% ` [PATCH AUTOSEL 6.8 50/52] nvmet: fix nvme status code when namespace is disabled Sasha Levin
@ 2024-05-07 23:07  4% ` Sasha Levin
  6 siblings, 0 replies; 200+ results
From: Sasha Levin @ 2024-05-07 23:07 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Hannes Reinecke, Sagi Grimberg, Keith Busch, Sasha Levin, linux-nvme

From: Hannes Reinecke <hare@kernel.org>

[ Upstream commit 50abcc179e0c9ca667feb223b26ea406d5c4c556 ]

TLS requires a strict pdu pacing via MSG_EOR to signal the end
of a record and subsequent encryption. If we do not set MSG_EOR
at the end of a sequence the record won't be closed, encryption
doesn't start, and we end up with a send stall as the message
will never be passed on to the TCP layer.
So do not check for the queue status when TLS is enabled but
rather make the MSG_MORE setting dependent on the current
request only.

Signed-off-by: Hannes Reinecke <hare@kernel.org>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/nvme/host/tcp.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index a6d596e056021..6eeb96578d1b4 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -352,12 +352,18 @@ static inline void nvme_tcp_send_all(struct nvme_tcp_queue *queue)
 	} while (ret > 0);
 }
 
-static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue)
+static inline bool nvme_tcp_queue_has_pending(struct nvme_tcp_queue *queue)
 {
 	return !list_empty(&queue->send_list) ||
 		!llist_empty(&queue->req_list);
 }
 
+static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue)
+{
+	return !nvme_tcp_tls(&queue->ctrl->ctrl) &&
+		nvme_tcp_queue_has_pending(queue);
+}
+
 static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
 		bool sync, bool last)
 {
@@ -378,7 +384,7 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
 		mutex_unlock(&queue->send_mutex);
 	}
 
-	if (last && nvme_tcp_queue_more(queue))
+	if (last && nvme_tcp_queue_has_pending(queue))
 		queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
 }
 
-- 
2.43.0



^ permalink raw reply related	[relevance 4%]

* [PATCH AUTOSEL 6.8 50/52] nvmet: fix nvme status code when namespace is disabled
       [not found]     <20240507230800.392128-1-sashal@kernel.org>
                   ` (4 preceding siblings ...)
  2024-05-07 23:07  4% ` [PATCH AUTOSEL 6.8 49/52] nvmet-tcp: fix possible memory leak when tearing down a controller Sasha Levin
@ 2024-05-07 23:07  3% ` Sasha Levin
  2024-05-07 23:07  4% ` [PATCH AUTOSEL 6.8 51/52] nvme-tcp: strict pdu pacing to avoid send stalls on TLS Sasha Levin
  6 siblings, 0 replies; 200+ results
From: Sasha Levin @ 2024-05-07 23:07 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Sagi Grimberg, Jirong Feng, Christoph Hellwig, Keith Busch,
	Sasha Levin, kch, linux-nvme

From: Sagi Grimberg <sagi@grimberg.me>

[ Upstream commit 505363957fad35f7aed9a2b0d8dad73451a80fb5 ]

If the user disabled a nvmet namespace, it is removed from the subsystem
namespaces list. When nvmet processes a command directed to an nsid that
was disabled, it cannot differentiate between a nsid that is disabled
vs. a non-existent namespace, and resorts to return NVME_SC_INVALID_NS
with the dnr bit set.

This translates to a non-retryable status for the host, which translates
to a user error. We should expect disabled namespaces to not cause an
I/O error in a multipath environment.

Address this by searching a configfs item for the namespace nvmet failed
to find, and if we found one, conclude that the namespace is disabled
(perhaps temporarily). Return NVME_SC_INTERNAL_PATH_ERROR in this case
and keep DNR bit cleared.

Reported-by: Jirong Feng <jirong.feng@easystack.cn>
Tested-by: Jirong Feng <jirong.feng@easystack.cn>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/nvme/target/configfs.c | 13 +++++++++++++
 drivers/nvme/target/core.c     |  5 ++++-
 drivers/nvme/target/nvmet.h    |  1 +
 3 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index 2482a0db25043..b7bfee4b77a84 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -728,6 +728,19 @@ static struct configfs_attribute *nvmet_ns_attrs[] = {
 	NULL,
 };
 
+bool nvmet_subsys_nsid_exists(struct nvmet_subsys *subsys, u32 nsid)
+{
+	struct config_item *ns_item;
+	char name[4] = {};
+
+	if (sprintf(name, "%u", nsid) <= 0)
+		return false;
+	mutex_lock(&subsys->namespaces_group.cg_subsys->su_mutex);
+	ns_item = config_group_find_item(&subsys->namespaces_group, name);
+	mutex_unlock(&subsys->namespaces_group.cg_subsys->su_mutex);
+	return ns_item != NULL;
+}
+
 static void nvmet_ns_release(struct config_item *item)
 {
 	struct nvmet_ns *ns = to_nvmet_ns(item);
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 8658e9c08534d..7a6b3d37cca70 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -425,10 +425,13 @@ void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl)
 u16 nvmet_req_find_ns(struct nvmet_req *req)
 {
 	u32 nsid = le32_to_cpu(req->cmd->common.nsid);
+	struct nvmet_subsys *subsys = nvmet_req_subsys(req);
 
-	req->ns = xa_load(&nvmet_req_subsys(req)->namespaces, nsid);
+	req->ns = xa_load(&subsys->namespaces, nsid);
 	if (unlikely(!req->ns)) {
 		req->error_loc = offsetof(struct nvme_common_command, nsid);
+		if (nvmet_subsys_nsid_exists(subsys, nsid))
+			return NVME_SC_INTERNAL_PATH_ERROR;
 		return NVME_SC_INVALID_NS | NVME_SC_DNR;
 	}
 
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 6c8acebe1a1a6..477416abf85ab 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -542,6 +542,7 @@ void nvmet_subsys_disc_changed(struct nvmet_subsys *subsys,
 		struct nvmet_host *host);
 void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
 		u8 event_info, u8 log_page);
+bool nvmet_subsys_nsid_exists(struct nvmet_subsys *subsys, u32 nsid);
 
 #define NVMET_QUEUE_SIZE	1024
 #define NVMET_NR_QUEUES		128
-- 
2.43.0



^ permalink raw reply related	[relevance 3%]

* [PATCH AUTOSEL 6.8 49/52] nvmet-tcp: fix possible memory leak when tearing down a controller
       [not found]     <20240507230800.392128-1-sashal@kernel.org>
                   ` (3 preceding siblings ...)
  2024-05-07 23:07  3% ` [PATCH AUTOSEL 6.8 48/52] nvme: cancel pending I/O if nvme controller is in terminal state Sasha Levin
@ 2024-05-07 23:07  4% ` Sasha Levin
  2024-05-07 23:07  3% ` [PATCH AUTOSEL 6.8 50/52] nvmet: fix nvme status code when namespace is disabled Sasha Levin
  2024-05-07 23:07  4% ` [PATCH AUTOSEL 6.8 51/52] nvme-tcp: strict pdu pacing to avoid send stalls on TLS Sasha Levin
  6 siblings, 0 replies; 200+ results
From: Sasha Levin @ 2024-05-07 23:07 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Sagi Grimberg, Yi Zhang, Christoph Hellwig, Keith Busch,
	Sasha Levin, kch, linux-nvme

From: Sagi Grimberg <sagi@grimberg.me>

[ Upstream commit 6825bdde44340c5a9121f6d6fa25cc885bd9e821 ]

When we teardown the controller, we wait for pending I/Os to complete
(sq->ref on all queues to drop to zero) and then we go over the commands,
and free their command buffers in case they are still fetching data from
the host (e.g. processing nvme writes) and have yet to take a reference
on the sq.

However, we may miss the case where commands have failed before executing
and are queued for sending a response, which will never be sent because
the queue socket is already down. In this case we may miss deallocating
command buffers.

Solve this by freeing all commands buffers as nvmet_tcp_free_cmd_buffers is
idempotent anyways.

Reported-by: Yi Zhang <yi.zhang@redhat.com>
Tested-by: Yi Zhang <yi.zhang@redhat.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/nvme/target/tcp.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index c8655fc5aa5b8..8d4531a1606d1 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -348,6 +348,7 @@ static int nvmet_tcp_check_ddgst(struct nvmet_tcp_queue *queue, void *pdu)
 	return 0;
 }
 
+/* If cmd buffers are NULL, no operation is performed */
 static void nvmet_tcp_free_cmd_buffers(struct nvmet_tcp_cmd *cmd)
 {
 	kfree(cmd->iov);
@@ -1580,13 +1581,9 @@ static void nvmet_tcp_free_cmd_data_in_buffers(struct nvmet_tcp_queue *queue)
 	struct nvmet_tcp_cmd *cmd = queue->cmds;
 	int i;
 
-	for (i = 0; i < queue->nr_cmds; i++, cmd++) {
-		if (nvmet_tcp_need_data_in(cmd))
-			nvmet_tcp_free_cmd_buffers(cmd);
-	}
-
-	if (!queue->nr_cmds && nvmet_tcp_need_data_in(&queue->connect))
-		nvmet_tcp_free_cmd_buffers(&queue->connect);
+	for (i = 0; i < queue->nr_cmds; i++, cmd++)
+		nvmet_tcp_free_cmd_buffers(cmd);
+	nvmet_tcp_free_cmd_buffers(&queue->connect);
 }
 
 static void nvmet_tcp_release_queue_work(struct work_struct *w)
-- 
2.43.0



^ permalink raw reply related	[relevance 4%]

* [PATCH AUTOSEL 6.8 48/52] nvme: cancel pending I/O if nvme controller is in terminal state
       [not found]     <20240507230800.392128-1-sashal@kernel.org>
                   ` (2 preceding siblings ...)
  2024-05-07 23:07  4% ` [PATCH AUTOSEL 6.8 47/52] nvmet-auth: replace pr_debug() with pr_err() to report an error Sasha Levin
@ 2024-05-07 23:07  3% ` Sasha Levin
  2024-05-07 23:07  4% ` [PATCH AUTOSEL 6.8 49/52] nvmet-tcp: fix possible memory leak when tearing down a controller Sasha Levin
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 200+ results
From: Sasha Levin @ 2024-05-07 23:07 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Nilay Shroff, Sagi Grimberg, Keith Busch, Sasha Levin, linux-nvme

From: Nilay Shroff <nilay@linux.ibm.com>

[ Upstream commit 25bb3534ee21e39eb9301c4edd7182eb83cb0d07 ]

While I/O is running, if a pci bus error occurs then
in-flight I/O cannot complete. Worse, if at this time the
user (logically) hot-unplugs the nvme disk, then the
nvme_remove() code path can't make forward progress until
the in-flight I/O is cancelled. So this sequence of events
may potentially hang the hot-unplug code path indefinitely.
This patch helps cancel the pending/in-flight I/O from the
nvme request timeout handler in case the nvme controller
is in the terminal (DEAD/DELETING/DELETING_NOIO) state,
which helps the nvme_remove() code path make forward
progress and finish successfully.

Link: https://lore.kernel.org/all/199be893-5dfa-41e5-b6f2-40ac90ebccc4@linux.ibm.com/
Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/nvme/host/core.c | 21 ---------------------
 drivers/nvme/host/nvme.h | 21 +++++++++++++++++++++
 drivers/nvme/host/pci.c  |  8 +++++++-
 3 files changed, 28 insertions(+), 22 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index fe3627c5bdc99..01702bc1baf1e 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -619,27 +619,6 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
 }
 EXPORT_SYMBOL_GPL(nvme_change_ctrl_state);
 
-/*
- * Returns true for sink states that can't ever transition back to live.
- */
-static bool nvme_state_terminal(struct nvme_ctrl *ctrl)
-{
-	switch (nvme_ctrl_state(ctrl)) {
-	case NVME_CTRL_NEW:
-	case NVME_CTRL_LIVE:
-	case NVME_CTRL_RESETTING:
-	case NVME_CTRL_CONNECTING:
-		return false;
-	case NVME_CTRL_DELETING:
-	case NVME_CTRL_DELETING_NOIO:
-	case NVME_CTRL_DEAD:
-		return true;
-	default:
-		WARN_ONCE(1, "Unhandled ctrl state:%d", ctrl->state);
-		return true;
-	}
-}
-
 /*
  * Waits for the controller state to be resetting, or returns false if it is
  * not possible to ever transition to that state.
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 7b87763e2f8a6..5a09879cb1a5f 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -740,6 +740,27 @@ static inline bool nvme_is_aen_req(u16 qid, __u16 command_id)
 		nvme_tag_from_cid(command_id) >= NVME_AQ_BLK_MQ_DEPTH;
 }
 
+/*
+ * Returns true for sink states that can't ever transition back to live.
+ */
+static inline bool nvme_state_terminal(struct nvme_ctrl *ctrl)
+{
+	switch (nvme_ctrl_state(ctrl)) {
+	case NVME_CTRL_NEW:
+	case NVME_CTRL_LIVE:
+	case NVME_CTRL_RESETTING:
+	case NVME_CTRL_CONNECTING:
+		return false;
+	case NVME_CTRL_DELETING:
+	case NVME_CTRL_DELETING_NOIO:
+	case NVME_CTRL_DEAD:
+		return true;
+	default:
+		WARN_ONCE(1, "Unhandled ctrl state:%d", ctrl->state);
+		return true;
+	}
+}
+
 void nvme_complete_rq(struct request *req);
 void nvme_complete_batch_req(struct request *req);
 
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 8e0bb9692685d..e393f6947ce49 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1286,6 +1286,9 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
 	u32 csts = readl(dev->bar + NVME_REG_CSTS);
 	u8 opcode;
 
+	if (nvme_state_terminal(&dev->ctrl))
+		goto disable;
+
 	/* If PCI error recovery process is happening, we cannot reset or
 	 * the recovery mechanism will surely fail.
 	 */
@@ -1390,8 +1393,11 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
 	return BLK_EH_RESET_TIMER;
 
 disable:
-	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING))
+	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING)) {
+		if (nvme_state_terminal(&dev->ctrl))
+			nvme_dev_disable(dev, true);
 		return BLK_EH_DONE;
+	}
 
 	nvme_dev_disable(dev, false);
 	if (nvme_try_sched_reset(&dev->ctrl))
-- 
2.43.0



^ permalink raw reply related	[relevance 3%]

* [PATCH AUTOSEL 6.8 47/52] nvmet-auth: replace pr_debug() with pr_err() to report an error.
       [not found]     <20240507230800.392128-1-sashal@kernel.org>
  2024-05-07 23:07  4% ` [PATCH AUTOSEL 6.8 45/52] nvme: find numa distance only if controller has valid numa id Sasha Levin
  2024-05-07 23:07  4% ` [PATCH AUTOSEL 6.8 46/52] nvmet-auth: return the error code to the nvmet_auth_host_hash() callers Sasha Levin
@ 2024-05-07 23:07  4% ` Sasha Levin
  2024-05-07 23:07  3% ` [PATCH AUTOSEL 6.8 48/52] nvme: cancel pending I/O if nvme controller is in terminal state Sasha Levin
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 200+ results
From: Sasha Levin @ 2024-05-07 23:07 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Maurizio Lombardi, Sagi Grimberg, Chaitanya Kulkarni,
	Keith Busch, Sasha Levin, hare, linux-nvme

From: Maurizio Lombardi <mlombard@redhat.com>

[ Upstream commit 445f9119e70368ccc964575c2a6d3176966a9d65 ]

In nvmet_auth_host_hash(), if a mismatch is detected in the hash length
the kernel should print an error.

Signed-off-by: Maurizio Lombardi <mlombard@redhat.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/nvme/target/auth.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c
index 9e51c064b0728..fb518b00f71f6 100644
--- a/drivers/nvme/target/auth.c
+++ b/drivers/nvme/target/auth.c
@@ -285,9 +285,9 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response,
 	}
 
 	if (shash_len != crypto_shash_digestsize(shash_tfm)) {
-		pr_debug("%s: hash len mismatch (len %d digest %d)\n",
-			 __func__, shash_len,
-			 crypto_shash_digestsize(shash_tfm));
+		pr_err("%s: hash len mismatch (len %d digest %d)\n",
+			__func__, shash_len,
+			crypto_shash_digestsize(shash_tfm));
 		ret = -EINVAL;
 		goto out_free_tfm;
 	}
-- 
2.43.0



^ permalink raw reply related	[relevance 4%]

* [PATCH AUTOSEL 6.8 46/52] nvmet-auth: return the error code to the nvmet_auth_host_hash() callers
       [not found]     <20240507230800.392128-1-sashal@kernel.org>
  2024-05-07 23:07  4% ` [PATCH AUTOSEL 6.8 45/52] nvme: find numa distance only if controller has valid numa id Sasha Levin
@ 2024-05-07 23:07  4% ` Sasha Levin
  2024-05-07 23:07  4% ` [PATCH AUTOSEL 6.8 47/52] nvmet-auth: replace pr_debug() with pr_err() to report an error Sasha Levin
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 200+ results
From: Sasha Levin @ 2024-05-07 23:07 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Maurizio Lombardi, Sagi Grimberg, Chaitanya Kulkarni,
	Keith Busch, Sasha Levin, hare, linux-nvme

From: Maurizio Lombardi <mlombard@redhat.com>

[ Upstream commit 46b8f9f74f6d500871985e22eb19560b21f3bc81 ]

If the nvmet_auth_host_hash() function fails, the error code should
be returned to its callers.

Signed-off-by: Maurizio Lombardi <mlombard@redhat.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/nvme/target/auth.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/target/auth.c b/drivers/nvme/target/auth.c
index 3ddbc3880cac8..9e51c064b0728 100644
--- a/drivers/nvme/target/auth.c
+++ b/drivers/nvme/target/auth.c
@@ -370,7 +370,7 @@ int nvmet_auth_host_hash(struct nvmet_req *req, u8 *response,
 	nvme_auth_free_key(transformed_key);
 out_free_tfm:
 	crypto_free_shash(shash_tfm);
-	return 0;
+	return ret;
 }
 
 int nvmet_auth_ctrl_hash(struct nvmet_req *req, u8 *response,
-- 
2.43.0



^ permalink raw reply related	[relevance 4%]

* [PATCH AUTOSEL 6.8 45/52] nvme: find numa distance only if controller has valid numa id
       [not found]     <20240507230800.392128-1-sashal@kernel.org>
@ 2024-05-07 23:07  4% ` Sasha Levin
  2024-05-07 23:07  4% ` [PATCH AUTOSEL 6.8 46/52] nvmet-auth: return the error code to the nvmet_auth_host_hash() callers Sasha Levin
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 200+ results
From: Sasha Levin @ 2024-05-07 23:07 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Nilay Shroff, Christoph Hellwig, Sagi Grimberg,
	Chaitanya Kulkarni, Keith Busch, Sasha Levin, linux-nvme

From: Nilay Shroff <nilay@linux.ibm.com>

[ Upstream commit 863fe60ed27f2c85172654a63c5b827e72c8b2e6 ]

On system where native nvme multipath is configured and iopolicy
is set to numa but the nvme controller numa node id is undefined
or -1 (NUMA_NO_NODE) then avoid calculating node distance for
finding the optimal io path. In such a case we may access the numa
distance table with an invalid index, which may potentially refer to
incorrect memory. So this patch ensures that if the nvme controller numa node
id is -1 then instead of calculating node distance for finding optimal
io path, we set the numa node distance of such controller to default 10
(LOCAL_DISTANCE).

Link: https://lore.kernel.org/all/20240413090614.678353-1-nilay@linux.ibm.com/
Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/nvme/host/multipath.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 74de1e64aeead..75386d3e0f981 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -247,7 +247,8 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
 		if (nvme_path_is_disabled(ns))
 			continue;
 
-		if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
+		if (ns->ctrl->numa_node != NUMA_NO_NODE &&
+		    READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
 			distance = node_distance(node, ns->ctrl->numa_node);
 		else
 			distance = LOCAL_DISTANCE;
-- 
2.43.0



^ permalink raw reply related	[relevance 4%]

* Re: [PATCH v2] nvme-rdma, nvme-tcp: include max reconnects for reconnect logging
  @ 2024-05-07 15:54  5% ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-05-07 15:54 UTC (permalink / raw)
  To: Tokunori Ikegami; +Cc: Sagi Grimberg, linux-nvme

On Mon, May 06, 2024 at 12:24:59AM +0900, Tokunori Ikegami wrote:
> Makes clear max reconnects translated by ctrl loss tmo and reconnect delay.
> 
> Signed-off-by: Tokunori Ikegami <ikegami.t@gmail.com>

Thanks, applied to nvme-6.10 with a minor merge conflict fix folded in.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH v2] nvmet-rdma: Avoid o(n^2) loop in delete_ctrl
  @ 2024-05-07 15:08  5%   ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-05-07 15:08 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Sagi Grimberg, linux-nvme, Chaitanya Kulkarni

On Tue, May 07, 2024 at 09:26:26AM +0200, Christoph Hellwig wrote:
> On Tue, May 07, 2024 at 09:54:44AM +0300, Sagi Grimberg wrote:
> > From: Sagi Grimberg <sagi.grimberg@vastdata.com>
> > 
> > When deleting a nvmet-rdma ctrl, we essentially loop over all
> > queues that belong to the controller and schedule a removal of
> > each. Instead of restarting the loop every time a queue is found,
> > do a simple safe list traversal.
> > 
> > This addresses an unneeded time spent scheduling queue removal in
> > cases there a lot of queues.
> 
> Looks good:
> 
> Reviewed-by: Christoph Hellwig <hch@lst.de>
> 
> (crossing fingers we're not going to run into a mess due to
> rdma_disconnect)

Sounds like nvme-6.10 material then. Applied to that tree.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH v2] nvmet: make nvmet_wq unbound
  @ 2024-05-07 15:07  5% ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-05-07 15:07 UTC (permalink / raw)
  To: Sagi Grimberg; +Cc: linux-nvme, Christoph Hellwig, Chaitanya Kulkarni

On Tue, May 07, 2024 at 09:54:10AM +0300, Sagi Grimberg wrote:
> From: Sagi Grimberg <sagi.grimberg@vastdata.com>
> 
> When deleting many controllers one-by-one, it takes a very
> long time as these work elements may serialize as they are
> scheduled on the executing cpu instead of spreading. In general
> nvmet_wq can definitely be used for long standing work elements
> so its better to make it unbound regardless.
> 
> Signed-off-by: Sagi Grimberg <sagi.grimberg@vastdata.com>

Applied to nvme-6.9, thanks.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH] nvme-pci: Add quirk for broken MSIs
    2024-04-22 16:49  5% ` Keith Busch
@ 2024-05-07 15:02  5% ` Keith Busch
  1 sibling, 0 replies; 200+ results
From: Keith Busch @ 2024-05-07 15:02 UTC (permalink / raw)
  To: Sean Anderson
  Cc: Jens Axboe, Christoph Hellwig, Sagi Grimberg, linux-nvme,
	linux-kernel, stable

On Mon, Apr 22, 2024 at 12:28:23PM -0400, Sean Anderson wrote:
> Sandisk SN530 NVMe drives have broken MSIs. On systems without MSI-X
> support, all commands time out resulting in the following message:
> 
> nvme nvme0: I/O tag 12 (100c) QID 0 timeout, completion polled
> 
> These timeouts cause the boot to take an excessively-long time (over 20
> minutes) while the initial command queue is flushed.
> 
> Address this by adding a quirk for drives with buggy MSIs. The lspci
> output for this device (recorded on a system with MSI-X support) is:
> 
> 02:00.0 Non-Volatile memory controller: Sandisk Corp Device 5008 (rev 01) (prog-if 02 [NVM Express])
> 	Subsystem: Sandisk Corp Device 5008
> 	Flags: bus master, fast devsel, latency 0, IRQ 16, NUMA node 0
> 	Memory at f7e00000 (64-bit, non-prefetchable) [size=16K]
> 	Memory at f7e04000 (64-bit, non-prefetchable) [size=256]
> 	Capabilities: [80] Power Management version 3
> 	Capabilities: [90] MSI: Enable- Count=1/32 Maskable- 64bit+
> 	Capabilities: [b0] MSI-X: Enable+ Count=17 Masked-
> 	Capabilities: [c0] Express Endpoint, MSI 00
> 	Capabilities: [100] Advanced Error Reporting
> 	Capabilities: [150] Device Serial Number 00-00-00-00-00-00-00-00
> 	Capabilities: [1b8] Latency Tolerance Reporting
> 	Capabilities: [300] Secondary PCI Express
> 	Capabilities: [900] L1 PM Substates
> 	Kernel driver in use: nvme
> 	Kernel modules: nvme
> 
> Cc: <stable@vger.kernel.org>
> Signed-off-by: Sean Anderson <sean.anderson@linux.dev>

Thanks, applied to nvme-6.9.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH] nvmet-auth: return the error code to the nvmet_auth_ctrl_hash() callers
  @ 2024-05-07 15:02  5% ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-05-07 15:02 UTC (permalink / raw)
  To: Maurizio Lombardi; +Cc: sagi, hare, dwagner, linux-nvme

On Fri, Apr 12, 2024 at 03:41:54PM +0200, Maurizio Lombardi wrote:
> If nvmet_auth_ctrl_hash() fails, return the error code to its callers
> 
> Signed-off-by: Maurizio Lombardi <mlombard@redhat.com>

Thanks, applied to nvme-6.9.


^ permalink raw reply	[relevance 5%]

* RE: [RFC RESEND 00/16] Split IOMMU DMA mapping operation to two steps
  @ 2024-05-03 20:59  0%     ` Zeng, Oak
  0 siblings, 0 replies; 200+ results
From: Zeng, Oak @ 2024-05-03 20:59 UTC (permalink / raw)
  To: Jason Gunthorpe
  Cc: leon, Christoph Hellwig, Robin Murphy, Marek Szyprowski,
	Joerg Roedel, Will Deacon, Chaitanya Kulkarni, Brost, Matthew,
	Hellstrom, Thomas, Jonathan Corbet, Jens Axboe, Keith Busch,
	Sagi Grimberg, Yishai Hadas, Shameer Kolothum, Tian, Kevin,
	Alex Williamson, Jérôme Glisse, Andrew Morton,
	linux-doc, linux-kernel, linux-block, linux-rdma, iommu,
	linux-nvme, kvm, linux-mm, Bart Van Assche, Damien Le Moal,
	Amir Goldstein, josef, Martin K. Petersen, daniel, Williams,
	Dan J, jack, Leon Romanovsky, Zhu Yanjun



> -----Original Message-----
> From: Jason Gunthorpe <jgg@ziepe.ca>
> Sent: Friday, May 3, 2024 12:43 PM
> To: Zeng, Oak <oak.zeng@intel.com>
> Cc: leon@kernel.org; Christoph Hellwig <hch@lst.de>; Robin Murphy
> <robin.murphy@arm.com>; Marek Szyprowski
> <m.szyprowski@samsung.com>; Joerg Roedel <joro@8bytes.org>; Will
> Deacon <will@kernel.org>; Chaitanya Kulkarni <chaitanyak@nvidia.com>;
> Brost, Matthew <matthew.brost@intel.com>; Hellstrom, Thomas
> <thomas.hellstrom@intel.com>; Jonathan Corbet <corbet@lwn.net>; Jens
> Axboe <axboe@kernel.dk>; Keith Busch <kbusch@kernel.org>; Sagi
> Grimberg <sagi@grimberg.me>; Yishai Hadas <yishaih@nvidia.com>;
> Shameer Kolothum <shameerali.kolothum.thodi@huawei.com>; Tian, Kevin
> <kevin.tian@intel.com>; Alex Williamson <alex.williamson@redhat.com>;
> Jérôme Glisse <jglisse@redhat.com>; Andrew Morton <akpm@linux-
> foundation.org>; linux-doc@vger.kernel.org; linux-kernel@vger.kernel.org;
> linux-block@vger.kernel.org; linux-rdma@vger.kernel.org;
> iommu@lists.linux.dev; linux-nvme@lists.infradead.org;
> kvm@vger.kernel.org; linux-mm@kvack.org; Bart Van Assche
> <bvanassche@acm.org>; Damien Le Moal
> <damien.lemoal@opensource.wdc.com>; Amir Goldstein
> <amir73il@gmail.com>; josef@toxicpanda.com; Martin K. Petersen
> <martin.petersen@oracle.com>; daniel@iogearbox.net; Williams, Dan J
> <dan.j.williams@intel.com>; jack@suse.com; Leon Romanovsky
> <leonro@nvidia.com>; Zhu Yanjun <zyjzyj2000@gmail.com>
> Subject: Re: [RFC RESEND 00/16] Split IOMMU DMA mapping operation to
> two steps
> 
> On Thu, May 02, 2024 at 11:32:55PM +0000, Zeng, Oak wrote:
> 
> > > Instead of teaching DMA to know these specific datatypes, let's separate
> > > existing DMA mapping routine to two steps and give an option to
> advanced
> > > callers (subsystems) perform all calculations internally in advance and
> > > map pages later when it is needed.
> >
> > I looked into how this scheme can be applied to DRM subsystem and GPU
> drivers.
> >
> > I figured RDMA can apply this scheme because RDMA can calculate the
> > iova size. Per my limited knowledge of rdma, user can register a
> > memory region (the reg_user_mr vfunc) and memory region's sized is
> > used to pre-allocate iova space. And in the RDMA use case, it seems
> > the user registered region can be very big, e.g., 512MiB or even GiB
> 
> In RDMA the iova would be linked to the SVA granual we discussed
> previously.

I need to learn more about this scheme. 

Let's say 512MiB granual... In a 57-bit virtual address machine, the user address space can be up to 56 bits (e.g., half-half split b/t kernel and user)

So you would end up with 134,217,728 sub-regions (2 to the power of 27), which is huge...

Does RDMA use a much smaller virtual address space?

With 512MiB granual, do you fault-in or map a 512MiB virtual address range to the RDMA page table? E.g., when a page fault happens at address A, do you fault-in the whole 512MiB region to the RDMA page table? How do you make sure all addresses in this 512MiB region are valid virtual addresses?  



> 
> > In GPU driver, we have a few use cases where we need dma-mapping. Just
> name two:
> >
> > 1) userptr: it is user malloc'ed/mmap'ed memory and registers to gpu
> > (in Intel's driver it is through a vm_bind api, similar to mmap). A
> > userptr can be of any random size, depending on user malloc
> > size. Today we use dma-map-sg for this use case. The down side of
> > our approach is, during userptr invalidation, even if user only
> > munmap partially of an userptr, we invalidate the whole userptr from
> > gpu page table, because there is no way for us to partially
> > dma-unmap the whole sg list. I think we can try your new API in this
> > case. The main benefit of the new approach is the partial munmap
> > case.
> 
> Yes, this is one of the main things it will improve.
> 
> > We will have to pre-allocate iova for each userptr, and we have many
> > userptrs of random size... So we might be not as efficient as RDMA
> > case where I assume user register a few big memory regions.
> 
> You are already doing this. dma_map_sg() does exactly the same IOVA
> allocation under the covers.

Sure. Then we can replace our _sg with your new DMA API once it is merged. We will gain a benefit with a little more code

> 
> > 2) system allocator: it is malloc'ed/mmap'ed memory be used for GPU
> > program directly, without any other extra driver API call. We call
> > this use case system allocator.
> 
> > For system allocator, driver have no knowledge of which virtual
> > address range is valid in advance. So when GPU access a
> > malloc'ed/mmap'ed address, we have a page fault. We then look up a
> > CPU vma which contains the fault address. I guess we can use the CPU
> > vma size to allocate the iova space of the same size?
> 
> No. You'd follow what we discussed in the other thread.
> 
> If you do a full SVA then you'd split your MM space into granuals and
> when a fault hits a granual you'd allocate the IOVA for the whole
> granual. RDMA ODP is using a 512M granual currently.

Per system allocator requirement, we have to do full SVA (which means ANY valid CPU virtual address is a valid GPU virtual address). 

Per my above calculation, with 512M granual, we will end up with a huge number of sub-regions....

> 
> If you are doing sub ranges then you'd probably allocate the IOVA for
> the well defined sub range (assuming the typical use case isn't huge)

Can you explain what sub ranges are? Is that where the device only mirrors part of the CPU virtual address space?

How do we decide which part to mirror?


> 
> > But there will be a true difficulty to apply your scheme to this use
> > case. It is related to the STICKY flag. As I understand it, the
> > sticky flag is designed for driver to mark "this page/pfn has been
> > populated, no need to re-populate again", roughly...Unlike userptr
> > and RDMA use cases where the backing store of a buffer is always in
> > system memory, in the system allocator use case, the backing store
> > can be changing b/t system memory and GPU's device private
> > memory. Even worse, we have to assume the data migration b/t system
> > and GPU is dynamic. When data is migrated to GPU, we don't need
> > dma-map. And when migration happens to a pfn with STICKY flag, we
> > still need to repopulate this pfn. So you can see, it is not easy to
> > apply this scheme to this use case. At least I can't see an obvious
> > way.
> 
> You are already doing this today, you are keeping the sg list around
> until you unmap it.
> 
> Instead of keeping the sg list you'd keep a much smaller datastructure
> per-granual. The sticky bit is simply a convient way for ODP to manage
> the smaller data structure, you don't have to use it.
> 
> But you do need to keep track of what pages in the granual have been
> DMA mapped - sg list was doing this before. This could be a simple
> bitmap array matching the granual size.

Make sense. We can try once your API is ready. 

I still don't figure out the granular scheme. Please help with above questions.

Thanks,
Oak


> 
> Looking (far) forward we may be able to have a "replace" API that
> allows installing a new page unconditionally regardless of what is
> already there.
> 
> Jason


^ permalink raw reply	[relevance 0%]

* Re: [RFC RESEND 00/16] Split IOMMU DMA mapping operation to two steps
  2024-05-02 23:32  0% ` [RFC RESEND 00/16] Split IOMMU DMA mapping operation to two steps Zeng, Oak
@ 2024-05-03 11:57  0%   ` Zhu Yanjun
    1 sibling, 0 replies; 200+ results
From: Zhu Yanjun @ 2024-05-03 11:57 UTC (permalink / raw)
  To: Zeng, Oak, leon, Christoph Hellwig, Robin Murphy,
	Marek Szyprowski, Joerg Roedel, Will Deacon, Jason Gunthorpe,
	Chaitanya Kulkarni, Brost, Matthew, Hellstrom, Thomas
  Cc: Jonathan Corbet, Jens Axboe, Keith Busch, Sagi Grimberg,
	Yishai Hadas, Shameer Kolothum, Tian, Kevin, Alex Williamson,
	Jérôme Glisse, Andrew Morton, linux-doc, linux-kernel,
	linux-block, linux-rdma, iommu, linux-nvme, kvm, linux-mm,
	Bart Van Assche, Damien Le Moal, Amir Goldstein, josef,
	Martin K. Petersen, daniel, Williams, Dan J, jack,
	Leon Romanovsky


On 03.05.24 01:32, Zeng, Oak wrote:
> Hi Leon, Jason
>
>> -----Original Message-----
>> From: Leon Romanovsky <leon@kernel.org>
>> Sent: Tuesday, March 5, 2024 6:19 AM
>> To: Christoph Hellwig <hch@lst.de>; Robin Murphy
>> <robin.murphy@arm.com>; Marek Szyprowski
>> <m.szyprowski@samsung.com>; Joerg Roedel <joro@8bytes.org>; Will
>> Deacon <will@kernel.org>; Jason Gunthorpe <jgg@ziepe.ca>; Chaitanya
>> Kulkarni <chaitanyak@nvidia.com>
>> Cc: Jonathan Corbet <corbet@lwn.net>; Jens Axboe <axboe@kernel.dk>;
>> Keith Busch <kbusch@kernel.org>; Sagi Grimberg <sagi@grimberg.me>;
>> Yishai Hadas <yishaih@nvidia.com>; Shameer Kolothum
>> <shameerali.kolothum.thodi@huawei.com>; Kevin Tian
>> <kevin.tian@intel.com>; Alex Williamson <alex.williamson@redhat.com>;
>> Jérôme Glisse <jglisse@redhat.com>; Andrew Morton <akpm@linux-
>> foundation.org>; linux-doc@vger.kernel.org; linux-kernel@vger.kernel.org;
>> linux-block@vger.kernel.org; linux-rdma@vger.kernel.org;
>> iommu@lists.linux.dev; linux-nvme@lists.infradead.org;
>> kvm@vger.kernel.org; linux-mm@kvack.org; Bart Van Assche
>> <bvanassche@acm.org>; Damien Le Moal
>> <damien.lemoal@opensource.wdc.com>; Amir Goldstein
>> <amir73il@gmail.com>; josef@toxicpanda.com; Martin K. Petersen
>> <martin.petersen@oracle.com>; daniel@iogearbox.net; Dan Williams
>> <dan.j.williams@intel.com>; jack@suse.com; Leon Romanovsky
>> <leonro@nvidia.com>; Zhu Yanjun <zyjzyj2000@gmail.com>
>> Subject: [RFC RESEND 00/16] Split IOMMU DMA mapping operation to two
>> steps
>>
>> This is complimentary part to the proposed LSF/MM topic.
>> https://lore.kernel.org/linux-rdma/22df55f8-cf64-4aa8-8c0b-
>> b556c867b926@linux.dev/T/#m85672c860539fdbbc8fe0f5ccabdc05b40269057
>>
>> This is posted as RFC to get a feedback on proposed split, but RDMA, VFIO
>> and
>> DMA patches are ready for review and inclusion, the NVMe patches are still
>> in
>> progress as they require agreement on API first.
>>
>> Thanks
>>
>> -------------------------------------------------------------------------------
>> The DMA mapping operation performs two steps at one same time: allocates
>> IOVA space and actually maps DMA pages to that space. This one shot
>> operation works perfectly for non-complex scenarios, where callers use
>> that DMA API in control path when they setup hardware.
>>
>> However in more complex scenarios, when DMA mapping is needed in data
>> path and especially when some sort of specific datatype is involved,
>> such one shot approach has its drawbacks.
>>
>> That approach pushes developers to introduce new DMA APIs for specific
>> datatype. For example existing scatter-gather mapping functions, or
>> latest Chuck's RFC series to add biovec related DMA mapping [1] and
>> probably struct folio will need it too.
>>
>> These advanced DMA mapping APIs are needed to calculate IOVA size to
>> allocate it as one chunk and some sort of offset calculations to know
>> which part of IOVA to map.
>>
>> Instead of teaching DMA to know these specific datatypes, let's separate
>> existing DMA mapping routine to two steps and give an option to advanced
>> callers (subsystems) perform all calculations internally in advance and
>> map pages later when it is needed.
> I looked into how this scheme can be applied to DRM subsystem and GPU drivers.
>
> I figured RDMA can apply this scheme because RDMA can calculate the iova size. Per my limited knowledge of rdma, user can register a memory region (the reg_user_mr vfunc) and memory region's sized is used to pre-allocate iova space. And in the RDMA use case, it seems the user registered region can be very big, e.g., 512MiB or even GiB
>
> In GPU driver, we have a few use cases where we need dma-mapping. Just name two:
>
> 1) userptr: it is user malloc'ed/mmap'ed memory and registers to gpu (in Intel's driver it is through a vm_bind api, similar to mmap). A userptr can be of any random size, depending on user malloc size. Today we use dma-map-sg for this use case. The down side of our approach is, during userptr invalidation, even if user only munmap partially of an userptr, we invalidate the whole userptr from gpu page table, because there is no way for us to partially dma-unmap the whole sg list. I think we can try your new API in this case. The main benefit of the new approach is the partial munmap case.
>
> We will have to pre-allocate iova for each userptr, and we have many userptrs of random size... So we might be not as efficient as RDMA case where I assume user register a few big memory regions.
>
> 2) system allocator: it is malloc'ed/mmap'ed memory be used for GPU program directly, without any other extra driver API call. We call this use case system allocator.
>
> For system allocator, driver have no knowledge of which virtual address range is valid in advance. So when GPU access a malloc'ed/mmap'ed address, we have a page fault. We then look up a CPU vma which contains the fault address. I guess we can use the CPU vma size to allocate the iova space of the same size?
>
> But there will be a true difficulty to apply your scheme to this use case. It is related to the STICKY flag. As I understand it, the sticky flag is designed for driver to mark "this page/pfn has been populated, no need to re-populate again", roughly...Unlike userptr and RDMA use cases where the backing store of a buffer is always in system memory, in the system allocator use case, the backing store can be changing b/t system memory and GPU's device private memory. Even worse, we have to assume the data migration b/t system and GPU is dynamic. When data is migrated to GPU, we don't need dma-map. And when migration happens to a pfn with STICKY flag, we still need to repopulate this pfn. So you can see, it is not easy to apply this scheme to this use case. At least I can't see an obvious way.

Not sure if GPU peer-to-peer dma mapping of GPU memory can use this 
scheme or not. If I remember it correctly, Intel Gaudi GPU supports peer 
2 peer dma mapping in GPU Direct RDMA. Not sure if this scheme can be 
applied in that place or not.

Just my 2 cent suggestions.

Zhu Yanjun

>
>
> Oak
>
>
>> In this series, three users are converted and each of such conversion
>> presents different positive gain:
>> 1. RDMA simplifies and speeds up its pagefault handling for
>>     on-demand-paging (ODP) mode.
>> 2. VFIO PCI live migration code saves huge chunk of memory.
>> 3. NVMe PCI avoids intermediate SG table manipulation and operates
>>     directly on BIOs.
>>
>> Thanks
>>
>> [1]
>> https://lore.kernel.org/all/169772852492.5232.17148564580779995849.stgit@
>> klimt.1015granger.net
>>
>> Chaitanya Kulkarni (2):
>>    block: add dma_link_range() based API
>>    nvme-pci: use blk_rq_dma_map() for NVMe SGL
>>
>> Leon Romanovsky (14):
>>    mm/hmm: let users to tag specific PFNs
>>    dma-mapping: provide an interface to allocate IOVA
>>    dma-mapping: provide callbacks to link/unlink pages to specific IOVA
>>    iommu/dma: Provide an interface to allow preallocate IOVA
>>    iommu/dma: Prepare map/unmap page functions to receive IOVA
>>    iommu/dma: Implement link/unlink page callbacks
>>    RDMA/umem: Preallocate and cache IOVA for UMEM ODP
>>    RDMA/umem: Store ODP access mask information in PFN
>>    RDMA/core: Separate DMA mapping to caching IOVA and page linkage
>>    RDMA/umem: Prevent UMEM ODP creation with SWIOTLB
>>    vfio/mlx5: Explicitly use number of pages instead of allocated length
>>    vfio/mlx5: Rewrite create mkey flow to allow better code reuse
>>    vfio/mlx5: Explicitly store page list
>>    vfio/mlx5: Convert vfio to use DMA link API
>>
>>   Documentation/core-api/dma-attributes.rst |   7 +
>>   block/blk-merge.c                         | 156 ++++++++++++++
>>   drivers/infiniband/core/umem_odp.c        | 219 +++++++------------
>>   drivers/infiniband/hw/mlx5/mlx5_ib.h      |   1 +
>>   drivers/infiniband/hw/mlx5/odp.c          |  59 +++--
>>   drivers/iommu/dma-iommu.c                 | 129 ++++++++---
>>   drivers/nvme/host/pci.c                   | 220 +++++--------------
>>   drivers/vfio/pci/mlx5/cmd.c               | 252 ++++++++++++----------
>>   drivers/vfio/pci/mlx5/cmd.h               |  22 +-
>>   drivers/vfio/pci/mlx5/main.c              | 136 +++++-------
>>   include/linux/blk-mq.h                    |   9 +
>>   include/linux/dma-map-ops.h               |  13 ++
>>   include/linux/dma-mapping.h               |  39 ++++
>>   include/linux/hmm.h                       |   3 +
>>   include/rdma/ib_umem_odp.h                |  22 +-
>>   include/rdma/ib_verbs.h                   |  54 +++++
>>   kernel/dma/debug.h                        |   2 +
>>   kernel/dma/direct.h                       |   7 +-
>>   kernel/dma/mapping.c                      |  91 ++++++++
>>   mm/hmm.c                                  |  34 +--
>>   20 files changed, 870 insertions(+), 605 deletions(-)
>>
>> --
>> 2.44.0

-- 
Best Regards,
Yanjun.Zhu



^ permalink raw reply	[relevance 0%]

* RE: [RFC RESEND 00/16] Split IOMMU DMA mapping operation to two steps
    @ 2024-05-02 23:32  0% ` Zeng, Oak
  2024-05-03 11:57  0%   ` Zhu Yanjun
    1 sibling, 2 replies; 200+ results
From: Zeng, Oak @ 2024-05-02 23:32 UTC (permalink / raw)
  To: leon, Christoph Hellwig, Robin Murphy, Marek Szyprowski,
	Joerg Roedel, Will Deacon, Jason Gunthorpe, Chaitanya Kulkarni,
	Brost, Matthew, Hellstrom, Thomas
  Cc: Jonathan Corbet, Jens Axboe, Keith Busch, Sagi Grimberg,
	Yishai Hadas, Shameer Kolothum, Tian, Kevin, Alex Williamson,
	Jérôme Glisse, Andrew Morton, linux-doc, linux-kernel,
	linux-block, linux-rdma, iommu, linux-nvme, kvm, linux-mm,
	Bart Van Assche, Damien Le Moal, Amir Goldstein, josef,
	Martin K. Petersen, daniel, Williams, Dan J, jack,
	Leon Romanovsky, Zhu Yanjun

Hi Leon, Jason

> -----Original Message-----
> From: Leon Romanovsky <leon@kernel.org>
> Sent: Tuesday, March 5, 2024 6:19 AM
> To: Christoph Hellwig <hch@lst.de>; Robin Murphy
> <robin.murphy@arm.com>; Marek Szyprowski
> <m.szyprowski@samsung.com>; Joerg Roedel <joro@8bytes.org>; Will
> Deacon <will@kernel.org>; Jason Gunthorpe <jgg@ziepe.ca>; Chaitanya
> Kulkarni <chaitanyak@nvidia.com>
> Cc: Jonathan Corbet <corbet@lwn.net>; Jens Axboe <axboe@kernel.dk>;
> Keith Busch <kbusch@kernel.org>; Sagi Grimberg <sagi@grimberg.me>;
> Yishai Hadas <yishaih@nvidia.com>; Shameer Kolothum
> <shameerali.kolothum.thodi@huawei.com>; Kevin Tian
> <kevin.tian@intel.com>; Alex Williamson <alex.williamson@redhat.com>;
> Jérôme Glisse <jglisse@redhat.com>; Andrew Morton <akpm@linux-
> foundation.org>; linux-doc@vger.kernel.org; linux-kernel@vger.kernel.org;
> linux-block@vger.kernel.org; linux-rdma@vger.kernel.org;
> iommu@lists.linux.dev; linux-nvme@lists.infradead.org;
> kvm@vger.kernel.org; linux-mm@kvack.org; Bart Van Assche
> <bvanassche@acm.org>; Damien Le Moal
> <damien.lemoal@opensource.wdc.com>; Amir Goldstein
> <amir73il@gmail.com>; josef@toxicpanda.com; Martin K. Petersen
> <martin.petersen@oracle.com>; daniel@iogearbox.net; Dan Williams
> <dan.j.williams@intel.com>; jack@suse.com; Leon Romanovsky
> <leonro@nvidia.com>; Zhu Yanjun <zyjzyj2000@gmail.com>
> Subject: [RFC RESEND 00/16] Split IOMMU DMA mapping operation to two
> steps
> 
> This is complimentary part to the proposed LSF/MM topic.
> https://lore.kernel.org/linux-rdma/22df55f8-cf64-4aa8-8c0b-
> b556c867b926@linux.dev/T/#m85672c860539fdbbc8fe0f5ccabdc05b40269057
> 
> This is posted as RFC to get a feedback on proposed split, but RDMA, VFIO
> and
> DMA patches are ready for review and inclusion, the NVMe patches are still
> in
> progress as they require agreement on API first.
> 
> Thanks
> 
> -------------------------------------------------------------------------------
> The DMA mapping operation performs two steps at one same time: allocates
> IOVA space and actually maps DMA pages to that space. This one shot
> operation works perfectly for non-complex scenarios, where callers use
> that DMA API in control path when they setup hardware.
> 
> However in more complex scenarios, when DMA mapping is needed in data
> path and especially when some sort of specific datatype is involved,
> such one shot approach has its drawbacks.
> 
> That approach pushes developers to introduce new DMA APIs for specific
> datatype. For example existing scatter-gather mapping functions, or
> latest Chuck's RFC series to add biovec related DMA mapping [1] and
> probably struct folio will need it too.
> 
> These advanced DMA mapping APIs are needed to calculate IOVA size to
> allocate it as one chunk and some sort of offset calculations to know
> which part of IOVA to map.
> 
> Instead of teaching DMA to know these specific datatypes, let's separate
> existing DMA mapping routine to two steps and give an option to advanced
> callers (subsystems) perform all calculations internally in advance and
> map pages later when it is needed.

I looked into how this scheme can be applied to DRM subsystem and GPU drivers. 

I figured RDMA can apply this scheme because RDMA can calculate the iova size. Per my limited knowledge of rdma, a user can register a memory region (the reg_user_mr vfunc) and the memory region's size is used to pre-allocate iova space. And in the RDMA use case, it seems the user registered region can be very big, e.g., 512MiB or even GiB

In GPU driver, we have a few use cases where we need dma-mapping. Just name two:

1) userptr: it is user malloc'ed/mmap'ed memory and registers to gpu (in Intel's driver it is through a vm_bind api, similar to mmap). A userptr can be of any random size, depending on user malloc size. Today we use dma-map-sg for this use case. The down side of our approach is, during userptr invalidation, even if user only munmap partially of an userptr, we invalidate the whole userptr from gpu page table, because there is no way for us to partially dma-unmap the whole sg list. I think we can try your new API in this case. The main benefit of the new approach is the partial munmap case.

We will have to pre-allocate iova for each userptr, and we have many userptrs of random size... So we might be not as efficient as RDMA case where I assume user register a few big memory regions.  

2) system allocator: it is malloc'ed/mmap'ed memory being used by GPU programs directly, without any other extra driver API call. We call this use case system allocator.

For system allocator, the driver has no knowledge of which virtual address range is valid in advance. So when the GPU accesses a malloc'ed/mmap'ed address, we have a page fault. We then look up a CPU vma which contains the fault address. I guess we can use the CPU vma size to allocate the iova space of the same size?

But there will be a true difficulty to apply your scheme to this use case. It is related to the STICKY flag. As I understand it, the sticky flag is designed for driver to mark "this page/pfn has been populated, no need to re-populate again", roughly...Unlike userptr and RDMA use cases where the backing store of a buffer is always in system memory, in the system allocator use case, the backing store can be changing b/t system memory and GPU's device private memory. Even worse, we have to assume the data migration b/t system and GPU is dynamic. When data is migrated to GPU, we don't need dma-map. And when migration happens to a pfn with STICKY flag, we still need to repopulate this pfn. So you can see, it is not easy to apply this scheme to this use case. At least I can't see an obvious way.


Oak


> 
> In this series, three users are converted and each of such conversion
> presents different positive gain:
> 1. RDMA simplifies and speeds up its pagefault handling for
>    on-demand-paging (ODP) mode.
> 2. VFIO PCI live migration code saves huge chunk of memory.
> 3. NVMe PCI avoids intermediate SG table manipulation and operates
>    directly on BIOs.
> 
> Thanks
> 
> [1]
> https://lore.kernel.org/all/169772852492.5232.17148564580779995849.stgit@
> klimt.1015granger.net
> 
> Chaitanya Kulkarni (2):
>   block: add dma_link_range() based API
>   nvme-pci: use blk_rq_dma_map() for NVMe SGL
> 
> Leon Romanovsky (14):
>   mm/hmm: let users to tag specific PFNs
>   dma-mapping: provide an interface to allocate IOVA
>   dma-mapping: provide callbacks to link/unlink pages to specific IOVA
>   iommu/dma: Provide an interface to allow preallocate IOVA
>   iommu/dma: Prepare map/unmap page functions to receive IOVA
>   iommu/dma: Implement link/unlink page callbacks
>   RDMA/umem: Preallocate and cache IOVA for UMEM ODP
>   RDMA/umem: Store ODP access mask information in PFN
>   RDMA/core: Separate DMA mapping to caching IOVA and page linkage
>   RDMA/umem: Prevent UMEM ODP creation with SWIOTLB
>   vfio/mlx5: Explicitly use number of pages instead of allocated length
>   vfio/mlx5: Rewrite create mkey flow to allow better code reuse
>   vfio/mlx5: Explicitly store page list
>   vfio/mlx5: Convert vfio to use DMA link API
> 
>  Documentation/core-api/dma-attributes.rst |   7 +
>  block/blk-merge.c                         | 156 ++++++++++++++
>  drivers/infiniband/core/umem_odp.c        | 219 +++++++------------
>  drivers/infiniband/hw/mlx5/mlx5_ib.h      |   1 +
>  drivers/infiniband/hw/mlx5/odp.c          |  59 +++--
>  drivers/iommu/dma-iommu.c                 | 129 ++++++++---
>  drivers/nvme/host/pci.c                   | 220 +++++--------------
>  drivers/vfio/pci/mlx5/cmd.c               | 252 ++++++++++++----------
>  drivers/vfio/pci/mlx5/cmd.h               |  22 +-
>  drivers/vfio/pci/mlx5/main.c              | 136 +++++-------
>  include/linux/blk-mq.h                    |   9 +
>  include/linux/dma-map-ops.h               |  13 ++
>  include/linux/dma-mapping.h               |  39 ++++
>  include/linux/hmm.h                       |   3 +
>  include/rdma/ib_umem_odp.h                |  22 +-
>  include/rdma/ib_verbs.h                   |  54 +++++
>  kernel/dma/debug.h                        |   2 +
>  kernel/dma/direct.h                       |   7 +-
>  kernel/dma/mapping.c                      |  91 ++++++++
>  mm/hmm.c                                  |  34 +--
>  20 files changed, 870 insertions(+), 605 deletions(-)
> 
> --
> 2.44.0



^ permalink raw reply	[relevance 0%]

* [GIT PULL] nvme fixes for Linux 6.9
@ 2024-05-02 13:00  4% Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-05-02 13:00 UTC (permalink / raw)
  To: axboe; +Cc: linux-nvme, hch, sagi

Jens:

I am on travel right now, and I've lost communication with my remote
server with my signing and push keys; therefore this does not have a
tag, much less a signed one. Since nvme is a bit behind on our pull
requests, I would like to get this out before the next rc. You can ping
me on back channel if you want to verify the legitimacy of the pull.

The following changes since commit 013ee5a6234d4c574dedd60c4887a4bcc9ecc749:

  Merge tag 'md-6.9-20240408' of https://git.kernel.org/pub/scm/linux/kernel/git/song/md into block-6.9 (2024-04-08 21:49:27 -0600)

are available in the Git repository at:

  git://git.infradead.org/nvme.git 50abcc179e0c9ca667feb223b26ea406d5c4c556

for you to fetch changes up to 50abcc179e0c9ca667feb223b26ea406d5c4c556:

  nvme-tcp: strict pdu pacing to avoid send stalls on TLS (2024-05-01 02:58:43 -0700)

----------------------------------------------------------------
Hannes Reinecke (1):
      nvme-tcp: strict pdu pacing to avoid send stalls on TLS

Maurizio Lombardi (2):
      nvmet-auth: return the error code to the nvmet_auth_host_hash() callers
      nvmet-auth: replace pr_debug() with pr_err() to report an error.

Nilay Shroff (2):
      nvme: find numa distance only if controller has valid numa id
      nvme: cancel pending I/O if nvme controller is in terminal state

Sagi Grimberg (2):
      nvmet-tcp: fix possible memory leak when tearing down a controller
      nvmet: fix nvme status code when namespace is disabled

Yi Zhang (1):
      nvme: fix warn output about shared namespaces without CONFIG_NVME_MULTIPATH

 drivers/nvme/host/core.c       | 23 +----------------------
 drivers/nvme/host/multipath.c  |  3 ++-
 drivers/nvme/host/nvme.h       | 21 +++++++++++++++++++++
 drivers/nvme/host/pci.c        |  8 +++++++-
 drivers/nvme/host/tcp.c        | 10 ++++++++--
 drivers/nvme/target/auth.c     |  8 ++++----
 drivers/nvme/target/configfs.c | 13 +++++++++++++
 drivers/nvme/target/core.c     |  5 ++++-
 drivers/nvme/target/nvmet.h    |  1 +
 drivers/nvme/target/tcp.c      | 11 ++++-------
 10 files changed, 65 insertions(+), 38 deletions(-)



^ permalink raw reply	[relevance 4%]

* Re: `nvme_disable_ctrl()` takes 411 ms on a Dell XPS 13 with SK hynix PC300 NVMEe
  @ 2024-05-02  8:43  4%             ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-05-02  8:43 UTC (permalink / raw)
  To: Paul Menzel; +Cc: Christoph Hellwig, Jens Axboe, Sagi Grimberg, linux-nvme

On Thu, May 02, 2024 at 08:12:39AM +0200, Paul Menzel wrote:
> > > That doesn't seem too hard to believe to me. A safe shutdown can often
> > > take a while time for an SSD. I've seen other implementations orders of
> > > magnitude worse than what you're showing.
> > 
> > But why? Due to physics or due to "slow" firmware?

Maybe both? The run time metadata doesn't necessarily match the on-disk
format, and constructing that can take a moment. These device's CPUs are
usually the cheapest the vendor could get that satisfies a run-time
performance target, so may be under powered for computational tasks.

And it may also have to flush pending user data from its internal
memory, which could be a few GB.

Lower end devices don't even have memory, so may have to make many round
trips to host memory to retrieve its metadata then manipulate that to
its on-disk format.

Maybe this could be better optimized, but vendors may not consider
shutdown time to be a high priority.

This gets worse as you add more nvme devices to your system because
shutdown is serialized. Some of us have proposed patches parallelizing
this process. I wish I could spend more time on helping see that to
completion, but other priorities get in the way. :(

> > So this confirms the ftrace findings. Excuse my ignorance, so the
> > time-out is in seconds? And how does this relate to the rtd3e value (410
> > ms /= 60 ms /= 5 s(?)?

The driver provides a user tunable parameter to specify the minimum
timeout value, and it defaults to 5 seconds.

  nvme_core.shutdown_timeout=<time_in_seconds>

The driver selects this or the advertised rtd3e, whichever is greater.
We can't trust devices to report this correctly (and NVMe 1.0 didn't
even provide a way for a device to report an expected shutdown time), so
this exists to prevent unsafe shutdowns. Devices are supposed to survive
an unsafe shutdown, but it's best to avoid that path.

The parameter is in granularity of seconds because the NVMe 1.0 spec
said to "wait at least one second" for a shutdown to complete. Not the
most clear wording for a spec, but that's where we started.


^ permalink raw reply	[relevance 4%]

* Re: `nvme_disable_ctrl()` takes 411 ms on a Dell XPS 13 with SK hynix PC300 NVMEe
  @ 2024-05-01 22:03  5%       ` Keith Busch
    0 siblings, 1 reply; 200+ results
From: Keith Busch @ 2024-05-01 22:03 UTC (permalink / raw)
  To: Paul Menzel; +Cc: Christoph Hellwig, Jens Axboe, Sagi Grimberg, linux-nvme

On Wed, May 01, 2024 at 10:58:05PM +0200, Paul Menzel wrote:
> Am 01.05.24 um 09:58 schrieb Keith Busch:
> > Exactly. Unless the device reports a lower D3 entry latency, then this
> > sounds like everything is working-as-designed.
> 
> Maybe according to the spec, but I have a hard time to believe, that disks
> should take longer to shut down than coreboot to initialize a mainboard.

That doesn't seem too hard to believe to me. A safe shutdown can often
take a while for an SSD. I've seen other implementations orders of
magnitude worse than what you're showing. You could do an unsafe
shutdown instead, but the device will just take even more time to enable
on its next power-on.
 
> In the end, in my opinion, users cannot make an informed decision, if these
> things are hidden. If it would be visible somehow in the logs - maybe not
> warning but info level - then even not so technical users could inform
> themselves and factor this in their buying decision.

What good is it to advertise a shutdown time when vendors are clearly
unreliable at reporting an accurate value? If you need to see the driver
report it from empirical testing, then you've already bought the device,
right?

> > You can check your device's advertised shutdown time (assuming your
> > device is nvme0):
> > 
> >    nvme id-ctrl /dev/nvme0 | grep rtd3e
> > 
> > The value is reported in microseconds. If it shows 0, then the device
> > doesn't report an expected shutdown time.
> 
> Thank you for sharing. It's 60 ms:


^ permalink raw reply	[relevance 5%]

* Re: [PATCH v7 0/5] nvme-fabrics: short-circuit connect retries
  @ 2024-05-01 10:13  5% ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-05-01 10:13 UTC (permalink / raw)
  To: Daniel Wagner
  Cc: Christoph Hellwig, Sagi Grimberg, James Smart, Hannes Reinecke,
	linux-nvme, linux-kernel

On Tue, Apr 30, 2024 at 03:19:23PM +0200, Daniel Wagner wrote:
> I've splitted the last patch into the hopefully non controversial part 'do not
> retry when DNR is set' and the 'don't retry auth failures'. I hope we can get at
> least the first few patches in and have a lively discussion on the final patch
> at LSF.

I've dropped the previous series from the nvme-6.9 branch. This series
has been added to the newly created nvme-6.10 branch.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH 01/10] block: set bip_vcnt correctly
  @ 2024-05-01  8:03  5%           ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-05-01  8:03 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Kanchan Joshi, axboe, martin.petersen, brauner, asml.silence, dw,
	io-uring, linux-nvme, linux-block, gost.dev, Anuj Gupta

On Wed, May 01, 2024 at 09:45:44AM +0200, Christoph Hellwig wrote:
> On Sat, Apr 27, 2024 at 08:16:52AM -0600, Keith Busch wrote:
> > > Please add a Fixes tag and submit it separately from the features.
> > > 
> > > I'm actually kinda surprised the direct user mapping of integrity data
> > > survived so far without this.
> > 
> > The only existing use case for user metadata is REQ_OP_DRV_IN/OUT, which
> > never splits, so these initial fixes only really matter after this
> > series adds new usage for generic READ/WRITE.
> 
> Well, it matters to keep our contract up, even if we're not hitting it.
> 
> And apparently another user just came out of the woods in dm land..

But the bug report from dm has nothing to do with user mapped metadata.
That bug existed before that was added, so yeah, patch 5 from this
series (or something like it) should be applied on its own.


^ permalink raw reply	[relevance 5%]

* Re: `nvme_disable_ctrl()` takes 411 ms on a Dell XPS 13 with SK hynix PC300 NVMEe
  @ 2024-05-01  7:58  5%   ` Keith Busch
    0 siblings, 1 reply; 200+ results
From: Keith Busch @ 2024-05-01  7:58 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Paul Menzel, Jens Axboe, Sagi Grimberg, linux-nvme

On Wed, May 01, 2024 at 06:51:45AM +0200, Christoph Hellwig wrote:
> On Wed, May 01, 2024 at 06:29:12AM +0200, Paul Menzel wrote:
> > Could a warning be logged, when this takes more than 50 ms with a hint that 
> > the disk vendor should improve their firmware?
> 
> Not sure why we'd warn about it.  A clean shutdown can definitively take
> some time and there's nothing tha forbids it.  I don't think a linux
> message is going to have any effect on firmware engineering..

Exactly. Unless the device reports a lower D3 entry latency, then this
sounds like everything is working-as-designed.

You can check your device's advertised shutdown time (assuming your
device is nvme0):

  nvme id-ctrl /dev/nvme0 | grep rtd3e

The value is reported in microseconds. If it shows 0, then the device
doesn't report an expected shutdown time.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH 02/10] block: copy bip_max_vcnt vecs instead of bip_vcnt during clone
  @ 2024-04-30  8:25  5%             ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-04-30  8:25 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Kanchan Joshi, axboe, martin.petersen, brauner, asml.silence, dw,
	io-uring, linux-nvme, linux-block, gost.dev, Anuj Gupta

On Mon, Apr 29, 2024 at 07:07:29PM +0200, Christoph Hellwig wrote:
> On Mon, Apr 29, 2024 at 01:04:12PM +0100, Keith Busch wrote:
> > An earlier version added a field in the bip to point to the original
> > bvec from the user address. That extra field wouldn't be used in the far
> > majority of cases, so moving the user bvec to the end of the existing
> > bip_vec is a spatial optimization. The code may look a little more
> > confusing that way, but I think it's better than making the bip bigger.
> 
> I think we need to do something like that - just hiding the bounce
> buffer is not really maintainable once we get multiple levels of stacking
> and other creative bio cloning.

Not sure I follow that. From patches 2-4 here, I think that pretty much
covers it. It's just missing a good code comment, but the implementation
side looks complete for any amount of stacking and splitting.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH] block: change rq_integrity_vec to respect the iterator
  @ 2024-04-30  8:16  5% ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-04-30  8:16 UTC (permalink / raw)
  To: Mikulas Patocka
  Cc: Jens Axboe, Mike Snitzer, linux-block, dm-devel, linux-nvme

On Mon, Apr 29, 2024 at 08:37:26PM +0200, Mikulas Patocka wrote:
> I am changing dm-crypt, so that it can store the autenticated encryption 
> tag directly into the NVMe metadata (without using dm-integrity). This 
> will improve performance significantly, because we can avoid journaling 
> done by dm-integrity. I've got it working, but I've found this bug, so I'm 
> sending a patch for it.

Patch looks fine, but Kanchan sent nearly the same one last week:

  https://lore.kernel.org/linux-block/20240425183943.6319-6-joshi.k@samsung.com/


^ permalink raw reply	[relevance 5%]

* Re: [PATCHv3] nvme-tcp: strict pdu pacing to avoid send stalls on TLS
  @ 2024-04-29 14:09  5% ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-04-29 14:09 UTC (permalink / raw)
  To: Hannes Reinecke; +Cc: Christoph Hellwig, Sagi Grimberg, linux-nvme

On Thu, Apr 18, 2024 at 12:39:45PM +0200, Hannes Reinecke wrote:
> TLS requires a strict pdu pacing via MSG_EOR to signal the end
> of a record and subsequent encryption. If we do not set MSG_EOR
> at the end of a sequence the record won't be closed, encryption
> doesn't start, and we end up with a send stall as the message
> will never be passed on to the TCP layer.
> So do not check for the queue status when TLS is enabled but
> rather make the MSG_MORE setting dependent on the current
> request only.

Thanks, applied to nvme-6.9.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH] nvmet: fix nvme status code when namespace is disabled
  @ 2024-04-29 11:52  5% ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-04-29 11:52 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: linux-nvme, Christoph Hellwig, Chaitanya Kulkarni, jirong.feng

On Sun, Apr 28, 2024 at 12:25:40PM +0300, Sagi Grimberg wrote:
> If the user disabled a nvmet namesapce, it is removed from
> the subsystem namespaces list. When nvmet processes a command
> directed to an nsid that was disabled, it cannot differentiate
> between a nsid that is disabled vs. a non-existent namespace,
> and resorts to return NVME_SC_INVALID_NS with the dnr bit set.
> 
> This translates to a non-retryable status for the host, which
> translates to a user error. We should expect disabled namespaces
> to not cause an I/O error in a multipath environment.
> 
> Address this by searching a configfs item for the namespace nvmet
> failed to find, and if we found one, conclude that the namespace
> is disabled (perhaps temporarily). Return NVME_SC_INTERNAL_PATH_ERROR
> in this case and keep DNR bit cleared.
> 
> Reported-by: Jirong Feng <jirong.feng@easystack.cn>
> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>

Applied with Jirong's "Tested-by" and Christoph's suggestions (spelling,
word wrap, and unnecessary 'else') to nvme-6.9.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH 02/10] block: copy bip_max_vcnt vecs instead of bip_vcnt during clone
  @ 2024-04-29 12:04  5%         ` Keith Busch
    0 siblings, 1 reply; 200+ results
From: Keith Busch @ 2024-04-29 12:04 UTC (permalink / raw)
  To: Kanchan Joshi
  Cc: Christoph Hellwig, axboe, martin.petersen, brauner, asml.silence,
	dw, io-uring, linux-nvme, linux-block, gost.dev, Anuj Gupta

On Mon, Apr 29, 2024 at 04:58:37PM +0530, Kanchan Joshi wrote:
> On 4/27/2024 12:33 PM, Christoph Hellwig wrote:
> >> If bio_integrity_copy_user is used to process the meta buffer, bip_max_vcnt
> >> is one greater than bip_vcnt. In this case bip_max_vcnt vecs needs to be
> >> copied to cloned bip.
> > Can you explain this a bit more?  The clone should only allocate what
> > is actually used, so this leaves be a bit confused.
> > 
> 
> Will expand the commit description.
> 
> Usually the meta buffer is pinned and used directly (say N bio vecs).
> In case kernel has to make a copy (in bio_integrity_copy_user), it 
> factors these N vecs in, and one extra for the bounce buffer.
> So for read IO, bip_max_vcnt is N+1, while bip_vcnt is N.
> 
> The clone bio also needs to be aware of all N+1 vecs, so that we can 
> copy the data from the bounce buffer to pinned user pages correctly 
> during read-completion.

An earlier version added a field in the bip to point to the original
bvec from the user address. That extra field wouldn't be used in the far
majority of cases, so moving the user bvec to the end of the existing
bip_vec is a spatial optimization. The code may look a little more
confusing that way, but I think it's better than making the bip bigger.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH] nvme: cancel pending I/O if nvme controller is in terminal state
  @ 2024-04-29  8:51  5% ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-04-29  8:51 UTC (permalink / raw)
  To: Nilay Shroff
  Cc: sagi, linux-nvme, hch, gjoyce, axboe, Srimannarayana Murthy Maram

On Thu, Apr 25, 2024 at 07:33:00PM +0530, Nilay Shroff wrote:
> While I/O is running, if the pci bus error occurs then
> in-flight I/O can not complete. Worst, if at this time,
> user (logically) hot-unplug the nvme disk then the
> nvme_remove() code path can't forward progress until
> in-flight I/O is cancelled. So these sequence of events
> may potentially hang hot-unplug code path indefinitely.
> This patch helps cancel the pending/in-flight I/O from the
> nvme request timeout handler in case the nvme controller
> is in the terminal (DEAD/DELETING/DELETING_NOIO) state and
> that helps nvme_remove() code path forward progress and
> finish successfully.
> 
> Link: https://lore.kernel.org/all/199be893-5dfa-41e5-b6f2-40ac90ebccc4@linux.ibm.com/
> Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>

Thanks, applied to nvme-6.9.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH] nvmet-tcp: fix possible memory leak when tearing down a controller
  @ 2024-04-29  8:51  5% ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-04-29  8:51 UTC (permalink / raw)
  To: Sagi Grimberg; +Cc: linux-nvme, Christoph Hellwig, Chaitanya Kulkarni, Yi Zhang

On Sun, Apr 28, 2024 at 11:49:49AM +0300, Sagi Grimberg wrote:
> When we teardown the controller, we wait for pending I/Os to complete
> (sq->ref on all queues to drop to zero) and then we go over the commands,
> and free their command buffers in case they are still fetching data from
> the host (e.g. processing nvme writes) and have yet to take a reference
> on the sq.
> 
> However, we may miss the case where commands have failed before executing
> and are queued for sending a response, but will never occur because the
> queue socket is already down. In this case we may miss deallocating command
> buffers.
> 
> Solve this by freeing all commands buffers as nvmet_tcp_free_cmd_buffers is
> idempotent anyways.
> 
> Reported-by: Yi Zhang <yi.zhang@redhat.com>
> Tested-by: Yi Zhang <yi.zhang@redhat.com>
> Signed-off-by: Sagi Grimberg <sagi@grimberg.me>

Thanks, applied to nvme-6.9.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH 01/10] block: set bip_vcnt correctly
  @ 2024-04-27 14:16  5%       ` Keith Busch
    0 siblings, 1 reply; 200+ results
From: Keith Busch @ 2024-04-27 14:16 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Kanchan Joshi, axboe, martin.petersen, brauner, asml.silence, dw,
	io-uring, linux-nvme, linux-block, gost.dev, Anuj Gupta

On Sat, Apr 27, 2024 at 09:02:14AM +0200, Christoph Hellwig wrote:
> On Fri, Apr 26, 2024 at 12:09:34AM +0530, Kanchan Joshi wrote:
> > From: Anuj Gupta <anuj20.g@samsung.com>
> > 
> > Set the bip_vcnt correctly in bio_integrity_init_user and
> > bio_integrity_copy_user. If the bio gets split at a later point,
> > this value is required to set the right bip_vcnt in the cloned bio.
> > 
> > Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
> > Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
> 
> Looks good:
> 
> Reviewed-by: Christoph Hellwig <hch@lst.de>
> 
> Please add a Fixes tag and submit it separately from the features.
> 
> I'm actually kinda surprised the direct user mapping of integrity data
> survived so far without this.

The only existing use case for user metadata is REQ_OP_DRV_IN/OUT, which
never splits, so these initial fixes only really matter after this
series adds new usage for generic READ/WRITE.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH 09/10] block: add support to send meta buffer
  @ 2024-04-26 15:21  5%     ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-04-26 15:21 UTC (permalink / raw)
  To: Kanchan Joshi
  Cc: axboe, martin.petersen, hch, brauner, asml.silence, dw, io-uring,
	linux-nvme, linux-block, gost.dev, Anuj Gupta

On Fri, Apr 26, 2024 at 12:09:42AM +0530, Kanchan Joshi wrote:
> diff --git a/block/fops.c b/block/fops.c
> index 679d9b752fe8..e488fa66dd60 100644
> --- a/block/fops.c
> +++ b/block/fops.c
> @@ -353,6 +353,15 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
>  		task_io_account_write(bio->bi_iter.bi_size);
>  	}
>  
> +	if (unlikely(iocb->ki_flags & IOCB_USE_META)) {
> +		ret = bio_integrity_map_iter(bio, iocb->private);
> +		WRITE_ONCE(iocb->private, NULL);
> +		if (unlikely(ret)) {
> +			bio_put(bio);
> +			return ret;
> +		}
> +	}

Should this also be done for __blkdev_direct_IO and
__blkdev_direct_IO_simple?


^ permalink raw reply	[relevance 5%]

* Re: [PATCH 10/10] nvme: add separate handling for user integrity buffer
  @ 2024-04-25 19:56  5%     ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-04-25 19:56 UTC (permalink / raw)
  To: Kanchan Joshi
  Cc: axboe, martin.petersen, hch, brauner, asml.silence, dw, io-uring,
	linux-nvme, linux-block, gost.dev

On Fri, Apr 26, 2024 at 12:09:43AM +0530, Kanchan Joshi wrote:
> @@ -983,6 +1009,14 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
>  			if (WARN_ON_ONCE(!nvme_ns_has_pi(ns->head)))
>  				return BLK_STS_NOTSUPP;
>  			control |= NVME_RW_PRINFO_PRACT;
> +		} else {
> +			/* process user-created integrity */
> +			if (bio_integrity(req->bio)->bip_flags &
> +					BIP_INTEGRITY_USER) {

Make this an "else if" instead of nesting it an extra level.

> +				nvme_setup_user_integrity(ns, req, cmnd,
> +							  &control);
> +				goto out;
> +			}

And this can be structured a little differently so that you don't need
the "goto"; IMO, goto is good for error unwinding, but using it in a
good path harms readability.

This is getting complex enough that splitting it off in a helper
function, maybe nvme_setup_rw_meta(), might be a good idea.


^ permalink raw reply	[relevance 5%]

* Re: [Bug Report] PCIe errinject and hot-unplug causes nvme driver hang
  @ 2024-04-24 17:36  5%             ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-04-24 17:36 UTC (permalink / raw)
  To: Nilay Shroff
  Cc: Sagi Grimberg, linux-nvme, Christoph Hellwig, axboe,
	Gregory Joyce, Srimannarayana Murthy Maram

On Tue, Apr 23, 2024 at 03:22:46PM +0530, Nilay Shroff wrote:
> > 
> I tested the above patch, however, it doesn't help to solve the issue.
> I tested it for two cases listed below:
> 
> 1. Platform which doesn't support pci-error-recovery:
> -----------------------------------------------------
> On this platform when nvme_timeout() is invoked, it falls through 
> nvme_shoud_reset()
>   -> nvme_warn_reset() 
>     -> goto disable
> 
> When nvme_timeout() jumps to the label disable, it tries setting the
> controller state to RESETTING but that couldn't succeed because the 
> (logical) hot-unplug/nvme_remove() of the disk is started on another 
> thread and hence controller state has already changed to 
> DELETING/DELETING_NOIO. As nvme_timeout() couldn't set the controller 
> state to RESETTING, nvme_timeout() returns BLK_EH_DONE. In summary, 
> as nvme_timeout() couldn't cancel pending IO, the hot-unplug/nvme_remove() 
> couldn't forward progress and it keeps waiting for request queue to be freezed. 
> 
> 2. Platform supporting pci-error-recovery:
> ------------------------------------------
> Similarly, on this platform as explained for the above case, when 
> nvme_timeout() is invoked, it falls through nvme_shoud_reset()
> -> nvme_warn_reset() -> goto disable. In this case as well, 
> nvme_timeout() returns BLK_EH_DONE. Please note that though this 
> platform supports pci-error-recovery, we couldn't get through 
> nvme_error_detected() because the pci-error-recovery thread is pending 
> on acquiring mutex "pci_lock_rescan_remove". This mutex is acquired by 
> hot-unplug thread before it invokes nvme_remove() and nvme_remove() 
> is currently waiting for request queue to be freezed. For reference,
> I have already captured the task hang traces in previous email of this 
> thread where we could observe these hangs (for both pci-error-recovery
> thread as well as hot-unplig/nvme_remove()).
> 
> I understand that we don't want to cancel pending IO from the nvme_remove()
> unconditionally as if the disk is not physically hot-unplug then we still 
> want to  wait for the in-flight IO to be finished. Also looking through 
> the above cases, I think that the nvme_timeout() might be the code path 
> from where we want to cancel in-flight/pending IO if controller is 
> in the terminal state (i.e. DELETING or DELETING_NOIO). Keeping this idea in
> mind, I have worked out the below patch:
> 
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 8e0bb9692685..e45a54d84649 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -1286,6 +1286,9 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
>         u32 csts = readl(dev->bar + NVME_REG_CSTS);
>         u8 opcode;
>  
> +       if (nvme_state_terminal(&dev->ctrl))
> +               goto disable;
> +
>         /* If PCI error recovery process is happening, we cannot reset or
>          * the recovery mechanism will surely fail.
>          */
> @@ -1390,8 +1393,13 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
>         return BLK_EH_RESET_TIMER;
>  
>  disable:
> -       if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING))
> +       if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING)) {
> +               if (nvme_state_terminal(&dev->ctrl)) {
> +                       nvme_dev_disable(dev, false);
> +                       nvme_sync_queues(&dev->ctrl);
> +               }
>                 return BLK_EH_DONE;
> +       }
>  
>         nvme_dev_disable(dev, false);
>         if (nvme_try_sched_reset(&dev->ctrl))
> 
> I have tested the above patch against all possible cases. Please let me know
> if this looks good or if there are any further comments.

This looks okay to me. Just a couple things:

Set nvme_dev_disable's "shutdown" parameter to "true" since we're
restarting the queues again from this state.

Remove "nvme_sync_queues()". I think that would deadlock: sync_queues
waits for the timeout work to complete, but you're calling it within the
timeout work, so this would have it wait for itself.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH] nvme-pci: Add quirk for broken MSIs
  @ 2024-04-22 16:49  5% ` Keith Busch
  2024-05-07 15:02  5% ` Keith Busch
  1 sibling, 0 replies; 200+ results
From: Keith Busch @ 2024-04-22 16:49 UTC (permalink / raw)
  To: Sean Anderson
  Cc: Jens Axboe, Christoph Hellwig, Sagi Grimberg, linux-nvme,
	linux-kernel, stable

On Mon, Apr 22, 2024 at 12:28:23PM -0400, Sean Anderson wrote:
> Sandisk SN530 NVMe drives have broken MSIs. On systems without MSI-X
> support, all commands time out resulting in the following message:
> 
> nvme nvme0: I/O tag 12 (100c) QID 0 timeout, completion polled
> 
> These timeouts cause the boot to take an excessively-long time (over 20
> minutes) while the initial command queue is flushed.
> 
> Address this by adding a quirk for drives with buggy MSIs. The lspci
> output for this device (recorded on a system with MSI-X support) is:

Based on your description, the patch looks good. This will fallback to
legacy emulated pin interrupts, and that's better than timeout polling,
but will still appear sluggish compared to MSI's. Is there an errata
from the vendor on this? I'm just curious if the bug is at the Device ID
level, and not something we could constrain to a particular model or
firmware revision. 
 
> 02:00.0 Non-Volatile memory controller: Sandisk Corp Device 5008 (rev 01) (prog-if 02 [NVM Express])
> 	Subsystem: Sandisk Corp Device 5008
> 	Flags: bus master, fast devsel, latency 0, IRQ 16, NUMA node 0
> 	Memory at f7e00000 (64-bit, non-prefetchable) [size=16K]
> 	Memory at f7e04000 (64-bit, non-prefetchable) [size=256]
> 	Capabilities: [80] Power Management version 3
> 	Capabilities: [90] MSI: Enable- Count=1/32 Maskable- 64bit+
> 	Capabilities: [b0] MSI-X: Enable+ Count=17 Masked-

Interesting, the MSI capability does look weird here. I've never seen
MSI-x count smaller than the MSI's. As long as both work, though, I
think nvme would actually prefer whichever is bigger!


^ permalink raw reply	[relevance 5%]

* Re: [Bug Report] PCIe errinject and hot-unplug causes nvme driver hang
  2024-04-22 13:52  5%       ` Keith Busch
@ 2024-04-22 14:35  5%         ` Keith Busch
    0 siblings, 1 reply; 200+ results
From: Keith Busch @ 2024-04-22 14:35 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Nilay Shroff, linux-nvme, Christoph Hellwig, axboe,
	Gregory Joyce, Srimannarayana Murthy Maram

On Mon, Apr 22, 2024 at 07:52:25AM -0600, Keith Busch wrote:
> On Mon, Apr 22, 2024 at 04:00:54PM +0300, Sagi Grimberg wrote:
> > > pci_rescan_remove_lock then it shall be able to recover the pci error and hence
> > > pending IOs could be finished. Later when hot-unplug task starts, it could
> > > forward progress and cleanup all resources used by the nvme disk.
> > > 
> > > So does it make sense if we unconditionally cancel the pending IOs from
> > > nvme_remove() before it forward progress to remove namespaces?
> > 
> > The driver attempts to allow inflights I/O to complete successfully, if the
> > device
> > is still present in the remove stage. I am not sure we want to
> > unconditionally fail these
> > I/Os.    Keith?
> 
> We have a timeout handler to clean this up, but I think it was another
> PPC specific patch that has the timeout handler do nothing if pcie error
> recovery is in progress. Which seems questionable, we should be able to
> concurrently run error handling and timeouts, but I think the error
> handling just needs to syncronize the request_queue's in the
> "error_detected" path.

This:

---
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 8e0bb9692685d..38d0215fe53fc 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1286,13 +1286,6 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req)
 	u32 csts = readl(dev->bar + NVME_REG_CSTS);
 	u8 opcode;
 
-	/* If PCI error recovery process is happening, we cannot reset or
-	 * the recovery mechanism will surely fail.
-	 */
-	mb();
-	if (pci_channel_offline(to_pci_dev(dev->dev)))
-		return BLK_EH_RESET_TIMER;
-
 	/*
 	 * Reset immediately if the controller is failed
 	 */
@@ -3300,6 +3293,7 @@ static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev,
 			return PCI_ERS_RESULT_DISCONNECT;
 		}
 		nvme_dev_disable(dev, false);
+		nvme_sync_queues(&dev->ctrl);
 		return PCI_ERS_RESULT_NEED_RESET;
 	case pci_channel_io_perm_failure:
 		dev_warn(dev->ctrl.device,
--


^ permalink raw reply related	[relevance 5%]

* Re: [Bug Report] PCIe errinject and hot-unplug causes nvme driver hang
  @ 2024-04-22 13:52  5%       ` Keith Busch
  2024-04-22 14:35  5%         ` Keith Busch
  0 siblings, 1 reply; 200+ results
From: Keith Busch @ 2024-04-22 13:52 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Nilay Shroff, linux-nvme, Christoph Hellwig, axboe,
	Gregory Joyce, Srimannarayana Murthy Maram

On Mon, Apr 22, 2024 at 04:00:54PM +0300, Sagi Grimberg wrote:
> > pci_rescan_remove_lock then it shall be able to recover the pci error and hence
> > pending IOs could be finished. Later when hot-unplug task starts, it could
> > forward progress and cleanup all resources used by the nvme disk.
> > 
> > So does it make sense if we unconditionally cancel the pending IOs from
> > nvme_remove() before it forward progress to remove namespaces?
> 
> The driver attempts to allow inflight I/O to complete successfully, if the
> device
> is still present in the remove stage. I am not sure we want to
> unconditionally fail these
> I/Os.    Keith?

We have a timeout handler to clean this up, but I think it was another
PPC specific patch that has the timeout handler do nothing if pcie error
recovery is in progress. Which seems questionable, we should be able to
concurrently run error handling and timeouts, but I think the error
handling just needs to synchronize the request_queue's in the
"error_detected" path.


^ permalink raw reply	[relevance 5%]

* Re: help re using nvme-cli to sanitize SSD
  @ 2024-04-22  2:49  4% ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-04-22  2:49 UTC (permalink / raw)
  To: Deane Coleman; +Cc: linux-nvme

> I have an Acer laptop containing 1 x internal Samsung NVMe PCIe SSD running
> Win11 OS and I wish to completely sanitize the SSD of all user data,
> including over-provisioned / non-allocated space, whilst leaving vendor boot
> capability intact (if possible) and causing least wear on the SSD. Once
> sanitized, I wish to clean install Win 11 OS on the SSD. This is the
> objective.

What do you mean by "vendor boot capability"? The acer laptop? Or the
SSD bootstrapping itself? If there's something on the SSD that acer
needs to boot, sanitize will likely wipe it out.
 
> My limited use of nvme-cli identifies the SSD has only one controller
> 'nvme0' (with nvme0 containing all SSD data) and one namespace 'nvme0n1',
> however I presume (and please correct me if I'm wrong) the 3 above mentioned
> partitions in Linux = nvme0n1p1 + nvme0n1p2 + nvme0n1p3.
> 
> Based on above info (and presuming that info is sufficient), I respectfully
> ask whether the following procedure will 'smoothly' achieve the objective:
> 
> 1: backup all user data currently on SSD
> 2: have bootable Win11 ISO file prepped on USB drive via Rufus
> 3: boot laptop using Ubuntu Live USB
> 4: Open command terminal and run command - 'nvme sanitize -a 2 /dev/nvme0n1'
> to start block erase sanitize operation
> 5: when sanitize operation is complete, exchange Ubuntu Live USB for Win11
> USB and reboot laptop
> 6: Follow Win11 install procedures
> 7: Install Acer drivers
> 8: Install target apps

I don't know about step 6 there: why would you want to install that OS?

Kidding aside, assuming your device supports sanitize, step 4 will
definitely make all previous data inaccessible on all partitions,
including the partition table itself, essentially providing you a
blank slate storage device. If that's what you want, mission
accomplished.

Some devices don't support the sanitize operation though, in which case
'nvme format' is usually sufficient to permanently remove all previous
user data. You often use sanitize only if you're required to have a
paranoid decomissioning process.
 
> Additional questions arising from above:
> - If any above element won't achieve objective, please clarify what needs
> amending?
> - I currently understand 'nvme sanitize -a 4 /dev/nvme0n1' (crypto erase)
> causes least SSD wear but because all user data on the SSD is currently
> unencrypted, I presume crypto erase is pointless to achieve objective?

It's going to be vendor specific what that does, if anything. An SSD
might transparently generate and persistently store a random pattern and
XOR all user data with that, so a crypto erase could just forget the old
key.

> - I've tried researching the following sanitize options but am currently
> unable to appreciate their significance or relevance for the objective: 'No
> Deallocate After Sanitize' and 'Sanitize Action...001b - Exit Failure Mode'.
> Would you please help me discern whether I need to include either of these
> options to meet the objective and, if so, the correct syntax placement in
> the sanitize command for it/ them.

Don't bother with "no-deallocate", it probably doesn't mean anything to
this SSD anyway.

The "Exit Failure Mode" is how the host acknowledges a previous sanitize
attempt failed. You shouldn't have to worry about that because we're
expecting everything to work.

Once you start your 'sanitize' operation, the only other thing you need
to periodically check with nvme-cli is 'nvme sanitize-log /dev/nvme0'
until the operation is done. You probably don't want to reboot the
machine while the sanitize operation is in progress.


^ permalink raw reply	[relevance 4%]

* Re: nvme-cli spdk plugin
  @ 2024-04-18 14:44  5%       ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-04-18 14:44 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: Jens Axboe, Christoph Hellwig, Daniel Wagner,
	Sebastian Brzezinka, Tomasz Zawadzki, Sagi Grimberg, linux-nvme

On Thu, Apr 18, 2024 at 08:25:45AM +0200, Hannes Reinecke wrote:
> On 4/17/24 16:31, Jens Axboe wrote:
> > On 4/17/24 8:26 AM, Christoph Hellwig wrote:
> > > On Wed, Apr 17, 2024 at 10:33:17AM +0200, Daniel Wagner wrote:
> > > > There is a PR [1] pending, adding a spdk plugin for nvme-cli. The
> > > > problem this new plugin tries to solve is, that with the recent change
> > > > to use sysfs only for discovering the nvme subystem, nvme-cli
> > > > lost support for spdk.
> > > > 
> > > > My question is, should we have a special plugin for 'list' and
> > > > 'list-subsystem' or should we try to get this somehow integrated into
> > > > the existing code? So that 'list' just works?
> > > 
> > > І don't think nvme-cli should deal with anything that is not driven
> > > by the kernel nvme driver.
> > 
> > Exactly, why on earth would we care about spdk at all in the first
> > place, nvme-cli or not.
> > 
> 
> And that depends on the direction of development we want to take.
> Do we want nvme-cli to become a 'general' nvme management tool, then
> we should investigate in having an spdk plugin.
> Or do we want to have nvme-cli as the cli for the linux nvme kernel driver,
> then clearly we wouldn't need an spdk plugin.
> That, I guess, is the real discussion.
> 
> Personally I would vote for the first option. But I'm sure others have
> other opinions.

I don't actively maintain this right now, so I think it's really up to
Daniel. It made sense to me from a design stand point that this utility
should have OS-agnostic abstractions. It's not a priority for me, but if
people are willing to maintain non-linux kernel environments, and as
long as that doesn't interfere with Linux kernel development or break
existing kernel usage, then I don't see why not.


^ permalink raw reply	[relevance 5%]

* Re: [PATCHv3] nvme: find numa distance only if controller has valid numa id
  @ 2024-04-16 16:44  5% ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-04-16 16:44 UTC (permalink / raw)
  To: Nilay Shroff; +Cc: linux-nvme, hch, sagi, gjoyce, axboe, chaitanyak

On Tue, Apr 16, 2024 at 01:49:23PM +0530, Nilay Shroff wrote:
> On system where native nvme multipath is configured and iopolicy
> is set to numa but the nvme controller numa node id is undefined
> or -1 (NUMA_NO_NODE) then avoid calculating node distance for
> finding optimal io path. In such case we may access numa distance
> table with invalid index and that may potentially refer to incorrect
> memory. So this patch ensures that if the nvme controller numa node
> id is -1 then instead of calculating node distance for finding optimal
> io path, we set the numa node distance of such controller to default 10
> (LOCAL_DISTANCE).

Thanks, applied to nvme-6.9.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH 0/2] nvmet-auth: fix some minor bugs
  @ 2024-04-16 16:44  5% ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-04-16 16:44 UTC (permalink / raw)
  To: Maurizio Lombardi; +Cc: sagi, hare, dwagner, linux-nvme

On Wed, Apr 10, 2024 at 11:48:40AM +0200, Maurizio Lombardi wrote:
> First patch modifies nvmet_auth_host_hash() so it correctly returns
> the error code to its caller.
> 
> The second patch replaces a pr_debug() with pr_err().

Thanks, applied to nvme-6.9.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH] nvme: find numa distance only if controller has valid numa id
  @ 2024-04-15 16:56  5%           ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-04-15 16:56 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: Nilay Shroff, Sagi Grimberg, linux-nvme, hch, gjoyce, axboe

On Mon, Apr 15, 2024 at 04:39:45PM +0200, Hannes Reinecke wrote:
> > For calculating the distance between two nodes we invoke the function __node_distance().
> > This function would then access the numa distance table, which is typically an array with
> > valid index starting from 0. So obviously accessing this table with index of -1 would
> > deference incorrect memory location. De-referencing incorrect memory location might have
> > side effects including panic (though I didn't encounter panic). Furthermore in such a case,
> > the calculated node distance could potentially be incorrect and that might cause the nvme
> > multipath to choose a suboptimal IO path.
> > 
> > This patch may not help choosing the optimal IO path (as we assume that the node distance would be
> > LOCAL_DISTANCE in case nvme controller numa node id is -1) but it ensures that we don't access the
> > invalid memory location for calculating node distance.
> > 
> Hmm. One wonders: how does such a system work?
> The systems I know always have the PCI slots attached to the CPU
> sockets, so if the CPU is not present the NVMe device on that
> slot will be non-functional. In fact, it wouldn't be visible at
> all as the PCI lanes are not powered up.
> In your system the PCI lanes clearly are powered up, as the NVMe
> device shows up in the PCI enumeration.
> Which means you are running a rather different PCI configuration.
> Question now is: does the NVMe device _work_?
> If it does, shouldn't the NUMA node continue to be present (some kind of
> memory-less, CPU-less NUMA node ...)?
> As a side-note, we'll need these kind of configuration anyway once
> CXL switches become available...

I recall systems with IO controller attached in a shared manner to all
sockets, so memory is UMA from IO device perspecitve (it may still be
NUMA from CPU). I don't think you need to consider memory-only NUMA
nodes unless there are additional distances to consider (at which point
it's no longer UMA).


^ permalink raw reply	[relevance 5%]

* Re: [PATCH v5 0/6]  nvme-fabrics: short-circuit connect retries
  @ 2024-04-12 15:24  5%     ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-04-12 15:24 UTC (permalink / raw)
  To: Daniel Wagner
  Cc: Christoph Hellwig, Sagi Grimberg, James Smart, Hannes Reinecke,
	linux-nvme, linux-kernel

On Fri, Apr 12, 2024 at 09:24:04AM +0200, Daniel Wagner wrote:
> On Thu, Apr 11, 2024 at 06:35:25PM -0600, Keith Busch wrote:
> > On Tue, Apr 09, 2024 at 11:35:04AM +0200, Daniel Wagner wrote:
> > > The first patch returns only kernel error codes now and avoids overwriting error
> > > codes later. The newly introduced helper for deciding if a reconnect should be
> > > attempted is the only place where we have the logic (and documentation).
> > > 
> > > On the target side I've separated the nvme status from the dhchap status handling
> > > which made it a bit clearer. I was tempted to refactor the code in
> > > nvmet_execute_auth_send to avoid hitting the 80 chars limit but didn't come up
> > > with something nice yet. So let's keep this change at a minimum before any
> > > refactoring attempts.
> > > 
> > > I've tested with blktests and also on real hardware for nvme-fc.
> > 
> > Thanks, series applied to nvme-6.9.
> 
> Thanks! I have an updated version here which addresses some of Sagi's
> feedback, e.g. using only one helper function. Sorry I didn't send out
> it earlier, I got a bit side tracked in testing because of the 'funky'
> results with RDMA.
> 
> Do you want me to send a complete fresh series or patches on top of this
> series? I'm fine either way.

Oh sorry, I didn't notice the discussion carried on after the "review"
tag. Please send me the update, I'll force push.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH v5 4/6] nvme-rdma: short-circuit reconnect retries
  @ 2024-04-12  2:50  5%     ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-04-12  2:50 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Daniel Wagner, Christoph Hellwig, James Smart, Hannes Reinecke,
	linux-nvme, linux-kernel, Chaitanya Kulkarni

On Tue, Apr 09, 2024 at 11:28:04PM +0300, Sagi Grimberg wrote:
> On 09/04/2024 12:35, Daniel Wagner wrote:
> > 
> > Returning an nvme status from nvme_rdma_setup_ctrl() indicates that the
> > association was established and we have received a status from the
> > controller; consequently we should honour
> honor ?

King's English vs. Freedom English? Whichever flavour or color you like,
both are fine here!


^ permalink raw reply	[relevance 5%]

* Re: [PATCH v2] nvme: fix warn output about shared namespaces without CONFIG_NVME_MULTIPATH
  @ 2024-04-12  0:35  5% ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-04-12  0:35 UTC (permalink / raw)
  To: Yi Zhang; +Cc: linux-nvme, hch, chaitanyak, axboe

On Wed, Apr 10, 2024 at 08:57:14AM +0800, Yi Zhang wrote:
> Move the stray '.' that is currently at the end of the line after
> newline '\n' to before newline character which is the right position.
> 
> Fixes: ce8d78616a6b ("nvme: warn about shared namespaces without CONFIG_NVME_MULTIPATH")
> Signed-off-by: Yi Zhang <yi.zhang@redhat.com>

Thanks, patch applied to nvme-6.9.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH v5 0/6]  nvme-fabrics: short-circuit connect retries
    @ 2024-04-12  0:35  5% ` Keith Busch
    1 sibling, 1 reply; 200+ results
From: Keith Busch @ 2024-04-12  0:35 UTC (permalink / raw)
  To: Daniel Wagner
  Cc: Christoph Hellwig, Sagi Grimberg, James Smart, Hannes Reinecke,
	linux-nvme, linux-kernel

On Tue, Apr 09, 2024 at 11:35:04AM +0200, Daniel Wagner wrote:
> The first patch returns only kernel error codes now and avoids overwriting error
> codes later. The newly introduced helper for deciding if a reconnect should be
> attempted is the only place where we have the logic (and documentation).
> 
> On the target side I've separated the nvme status from the dhchap status handling
> which made it a bit clearer. I was tempted to refactor the code in
> nvmet_execute_auth_send to avoid hitting the 80 chars limit but didn't come up
> with something nice yet. So let's keep this change at a minimum before any
> refactoring attempts.
> 
> I've tested with blktests and also on real hardware for nvme-fc.

Thanks, series applied to nvme-6.9.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH v6 02/10] block: Call blkdev_dio_unaligned() from blkdev_direct_IO()
  2024-04-10 22:53  0%   ` Luis Chamberlain
@ 2024-04-11  8:06  0%     ` John Garry
  0 siblings, 0 replies; 200+ results
From: John Garry @ 2024-04-11  8:06 UTC (permalink / raw)
  To: Luis Chamberlain
  Cc: axboe, kbusch, hch, sagi, jejb, martin.petersen, djwong, viro,
	brauner, dchinner, jack, linux-block, linux-kernel, linux-nvme,
	linux-fsdevel, tytso, jbongio, linux-scsi, ojaswin, linux-aio,
	linux-btrfs, io-uring, nilay, ritesh.list, willy

On 10/04/2024 23:53, Luis Chamberlain wrote:
> On Tue, Mar 26, 2024 at 01:38:05PM +0000, John Garry wrote:
>> blkdev_dio_unaligned() is called from __blkdev_direct_IO(),
>> __blkdev_direct_IO_simple(), and __blkdev_direct_IO_async(), and all these
>> are only called from blkdev_direct_IO().
>>
>> Move the blkdev_dio_unaligned() call to the common callsite,
>> blkdev_direct_IO().
>>
>> Pass those functions the bdev pointer from blkdev_direct_IO() as it is non-
>> trivial to calculate.
>>
>> Reviewed-by: Keith Busch<kbusch@kernel.org>
>> Reviewed-by: Christoph Hellwig<hch@lst.de>
>> Signed-off-by: John Garry<john.g.garry@oracle.com>
> Reviewed-by: Luis Chamberlain<mcgrof@kernel.org>
> 

cheers

> I think this patch should just be sent separately already and not part
> of this series.

That just creates a merge dependency, since I have later changes which 
depend on this. I suppose that since we're nearly at rc4, I could do that.

John



^ permalink raw reply	[relevance 0%]

* Re: [PATCH v6 01/10] block: Pass blk_queue_get_max_sectors() a request pointer
  2024-03-26 13:38  3% ` [PATCH v6 01/10] block: Pass blk_queue_get_max_sectors() a request pointer John Garry
@ 2024-04-10 22:58  0%   ` Luis Chamberlain
  0 siblings, 0 replies; 200+ results
From: Luis Chamberlain @ 2024-04-10 22:58 UTC (permalink / raw)
  To: John Garry
  Cc: axboe, kbusch, hch, sagi, jejb, martin.petersen, djwong, viro,
	brauner, dchinner, jack, linux-block, linux-kernel, linux-nvme,
	linux-fsdevel, tytso, jbongio, linux-scsi, ojaswin, linux-aio,
	linux-btrfs, io-uring, nilay, ritesh.list, willy

On Tue, Mar 26, 2024 at 01:38:04PM +0000, John Garry wrote:
> Currently blk_queue_get_max_sectors() is passed a enum req_op. In future
> the value returned from blk_queue_get_max_sectors() may depend on certain
> request flags, so pass a request pointer.
> 
> Reviewed-by: Christoph Hellwig <hch@lst.de>
> Reviewed-by: Keith Busch <kbusch@kernel.org>
> Signed-off-by: John Garry <john.g.garry@oracle.com>

Reviewed-by: Luis Chamberlain <mcgrof@kernel.org>

  Luis


^ permalink raw reply	[relevance 0%]

* Re: [PATCH v6 02/10] block: Call blkdev_dio_unaligned() from blkdev_direct_IO()
  2024-03-26 13:38  3% ` [PATCH v6 02/10] block: Call blkdev_dio_unaligned() from blkdev_direct_IO() John Garry
@ 2024-04-10 22:53  0%   ` Luis Chamberlain
  2024-04-11  8:06  0%     ` John Garry
  0 siblings, 1 reply; 200+ results
From: Luis Chamberlain @ 2024-04-10 22:53 UTC (permalink / raw)
  To: John Garry
  Cc: axboe, kbusch, hch, sagi, jejb, martin.petersen, djwong, viro,
	brauner, dchinner, jack, linux-block, linux-kernel, linux-nvme,
	linux-fsdevel, tytso, jbongio, linux-scsi, ojaswin, linux-aio,
	linux-btrfs, io-uring, nilay, ritesh.list, willy

On Tue, Mar 26, 2024 at 01:38:05PM +0000, John Garry wrote:
> blkdev_dio_unaligned() is called from __blkdev_direct_IO(),
> __blkdev_direct_IO_simple(), and __blkdev_direct_IO_async(), and all these
> are only called from blkdev_direct_IO().
> 
> Move the blkdev_dio_unaligned() call to the common callsite,
> blkdev_direct_IO().
> 
> Pass those functions the bdev pointer from blkdev_direct_IO() as it is non-
> trivial to calculate.
> 
> Reviewed-by: Keith Busch <kbusch@kernel.org>
> Reviewed-by: Christoph Hellwig <hch@lst.de>
> Signed-off-by: John Garry <john.g.garry@oracle.com>

Reviewed-by: Luis Chamberlain <mcgrof@kernel.org>

I think this patch should just be sent separately already and not part
of this series.

  Luis


^ permalink raw reply	[relevance 0%]

* Re: [PATCH] drivers/nvme: Add quirks for device 1cc4:6a14
  @ 2024-04-10  3:08  5%     ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-04-10  3:08 UTC (permalink / raw)
  To: Holger Huo; +Cc: linux-nvme, linux-kernel, Holger Huo

On Wed, Apr 10, 2024 at 10:05:46AM +0800, Holger Huo wrote:
> On Wednesday, April 10, 2024 4:14:42 AM +08 Keith Busch wrote:
> > On Wed, Apr 10, 2024 at 03:28:50AM +0800, Holger Huo wrote:
> > > This commit adds NVME_QUIRK_BOGUS_NID for device [1cc4:6a14], the
> > > Shenzhen Unionmemory Information System Ltd. RPEYJ1T24MKN2QWY PCIe
> > > 4.0 NVMe SSD 1024GB (DRAM-less), which can be found on many Lenovo
> > > notebooks. This SSD produces all-zero nguid.
> > > 
> > > Other SSDs manufactured by Shenzhen Unionmemory Information System
> > > Ltd are likely to face similar issues, but I currently have no
> > > device to test.
> > 
> > Are you using the most recent stable kernel and still need this patch?
> > These quirks shouldn't be necessary anymore unless it's advertising
> > multipath capabilities, and it doesn't sound like this one should be
> > doing that.
> 
> Yes. My current Linux kernel version is 6.8.4 by arch. After further 
> investigation I found these quirks truly had no effects.
> But my nvme drive still freezes with Device not ready; aborting reset, 
> CSTS=0x1 failures and AMD-Vi: Event logged [IO_PAGE_FAULT domain=0x0001 ...] 
> waking from s0 (s2idle) sleep. Are there any other quirks or what that I 
> should try

I've seen the IO_PAGE_FAULT error reported before, but all I've heard is
that it means the device attempts to access a memory address that it is
not allowed to. I don't know why that would happen, though. I've never
first hand experience with this condition, I'm just as confused.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH] drivers/nvme: Add quirks for device 1cc4:6a14
  @ 2024-04-09 20:14  5% ` Keith Busch
    0 siblings, 1 reply; 200+ results
From: Keith Busch @ 2024-04-09 20:14 UTC (permalink / raw)
  To: Holger Huo; +Cc: hch, sagi, axboe, linux-nvme, linux-kernel, Holger Huo

On Wed, Apr 10, 2024 at 03:28:50AM +0800, Holger Huo wrote:
> This commit adds NVME_QUIRK_BOGUS_NID for device [1cc4:6a14], the
> Shenzhen Unionmemory Information System Ltd. RPEYJ1T24MKN2QWY PCIe
> 4.0 NVMe SSD 1024GB (DRAM-less), which can be found on many Lenovo
> notebooks. This SSD produces all-zero nguid.
> 
> Other SSDs manufactured by Shenzhen Unionmemory Information System
> Ltd are likely to face similar issues, but I currently have no
> device to test.

Are you using the most recent stable kernel and still need this patch?
These quirks shouldn't be necessary anymore unless it's advertising
multipath capabilities, and it doesn't sound like this one should be
doing that.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH v5 4/6] nvme-rdma: short-circuit reconnect retries
  @ 2024-04-09 14:19  5%     ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-04-09 14:19 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Daniel Wagner, Sagi Grimberg, James Smart, Hannes Reinecke,
	linux-nvme, linux-kernel, Chaitanya Kulkarni

On Tue, Apr 09, 2024 at 04:00:54PM +0200, Christoph Hellwig wrote:
> On Tue, Apr 09, 2024 at 11:35:08AM +0200, Daniel Wagner wrote:
> > From: Hannes Reinecke <hare@suse.de>
> > 
> > Returning an nvme status from nvme_rdma_setup_ctrl() indicates that the
> 
> Shouldn't this an be an a based on my highschool english.  Or does
> Eeenvme count as a vowel?

It depends on how you hear it when you read it. If you automatically
expand the acronym, "Non-Volatile ...", then it should get the "a"
article.

If you instead try to pronounce "nvme" directly, it sounds like you're
saying "envy me", like commanding everyone to acknowledge your
awesomeness. Not sure if they had that in mind when deciding on the
name, but it's kind of amusing. Anyway, pronounce it that way, it gets
an "an". :)


^ permalink raw reply	[relevance 5%]

* Re: [PATCH for 5.15.y] nvme: fix miss command type check
  2024-04-07  9:15  4% [PATCH for 5.15.y] " Tokunori Ikegami
@ 2024-04-08 11:32  0% ` Greg Kroah-Hartman
  0 siblings, 0 replies; 200+ results
From: Greg Kroah-Hartman @ 2024-04-08 11:32 UTC (permalink / raw)
  To: Tokunori Ikegami
  Cc: linux-nvme, stable, min15.li, Kanchan Joshi, Christoph Hellwig,
	Keith Busch

On Sun, Apr 07, 2024 at 06:15:28PM +0900, Tokunori Ikegami wrote:
> From: "min15.li" <min15.li@samsung.com>
> 
> commit 31a5978243d24d77be4bacca56c78a0fbc43b00d upstream.
> 
> In the function nvme_passthru_end(), only the value of the command
> opcode is checked, without checking the command type (IO command or
> Admin command). When we send a Dataset Management command (The opcode
> of the Dataset Management command is the same as the Set Feature
> command), kernel thinks it is a set feature command, then sets the
> controller's keep alive interval, and calls nvme_keep_alive_work().
> 
> Signed-off-by: min15.li <min15.li@samsung.com>
> Reviewed-by: Kanchan Joshi <joshi.k@samsung.com>
> Reviewed-by: Christoph Hellwig <hch@lst.de>
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> Fixes: b58da2d270db ("nvme: update keep alive interval when kato is modified")
> Signed-off-by: Tokunori Ikegami <ikegami.t@gmail.com>
> ---
>  drivers/nvme/host/core.c | 6 ++++--
>  1 file changed, 4 insertions(+), 2 deletions(-)

Both now queued up, thanks.

greg k-h


^ permalink raw reply	[relevance 0%]

* [PATCH for 5.15.y] nvme: fix miss command type check
@ 2024-04-07  9:15  4% Tokunori Ikegami
  2024-04-08 11:32  0% ` Greg Kroah-Hartman
  0 siblings, 1 reply; 200+ results
From: Tokunori Ikegami @ 2024-04-07  9:15 UTC (permalink / raw)
  To: Greg Kroah-Hartman
  Cc: linux-nvme, stable, min15.li, Kanchan Joshi, Christoph Hellwig,
	Keith Busch, Tokunori Ikegami

From: "min15.li" <min15.li@samsung.com>

commit 31a5978243d24d77be4bacca56c78a0fbc43b00d upstream.

In the function nvme_passthru_end(), only the value of the command
opcode is checked, without checking the command type (IO command or
Admin command). When we send a Dataset Management command (The opcode
of the Dataset Management command is the same as the Set Feature
command), kernel thinks it is a set feature command, then sets the
controller's keep alive interval, and calls nvme_keep_alive_work().

Signed-off-by: min15.li <min15.li@samsung.com>
Reviewed-by: Kanchan Joshi <joshi.k@samsung.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Fixes: b58da2d270db ("nvme: update keep alive interval when kato is modified")
Signed-off-by: Tokunori Ikegami <ikegami.t@gmail.com>
---
 drivers/nvme/host/core.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 8f06e5c1706b..960a31e3307a 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1185,7 +1185,7 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	return effects;
 }
 
-static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects,
+static void nvme_passthru_end(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u32 effects,
 			      struct nvme_command *cmd, int status)
 {
 	if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
@@ -1201,6 +1201,8 @@ static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects,
 		nvme_queue_scan(ctrl);
 		flush_work(&ctrl->scan_work);
 	}
+	if (ns)
+		return;
 
 	switch (cmd->common.opcode) {
 	case nvme_admin_set_features:
@@ -1235,7 +1237,7 @@ int nvme_execute_passthru_rq(struct request *rq)
 	effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode);
 	ret = nvme_execute_rq(disk, rq, false);
 	if (effects) /* nothing to be done for zero cmd effects */
-		nvme_passthru_end(ctrl, effects, cmd, ret);
+		nvme_passthru_end(ctrl, ns, effects, cmd, ret);
 
 	return ret;
 }
-- 
2.40.1



^ permalink raw reply related	[relevance 4%]

* [PATCH for 6.1.y] nvme: fix miss command type check
@ 2024-04-07  2:28  3% Tokunori Ikegami
  0 siblings, 0 replies; 200+ results
From: Tokunori Ikegami @ 2024-04-07  2:28 UTC (permalink / raw)
  To: Greg Kroah-Hartman
  Cc: linux-nvme, stable, min15.li, Kanchan Joshi, Christoph Hellwig,
	Keith Busch, Tokunori Ikegami

From: "min15.li" <min15.li@samsung.com>

commit 31a5978243d24d77be4bacca56c78a0fbc43b00d upstream.

In the function nvme_passthru_end(), only the value of the command
opcode is checked, without checking the command type (IO command or
Admin command). When we send a Dataset Management command (The opcode
of the Dataset Management command is the same as the Set Feature
command), kernel thinks it is a set feature command, then sets the
controller's keep alive interval, and calls nvme_keep_alive_work().

Signed-off-by: min15.li <min15.li@samsung.com>
Reviewed-by: Kanchan Joshi <joshi.k@samsung.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Fixes: b58da2d270db ("nvme: update keep alive interval when kato is modified")
Signed-off-by: Tokunori Ikegami <ikegami.t@gmail.com>
---
 drivers/nvme/host/core.c       | 4 +++-
 drivers/nvme/host/ioctl.c      | 3 ++-
 drivers/nvme/host/nvme.h       | 2 +-
 drivers/nvme/target/passthru.c | 3 ++-
 4 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index d7516e99275b..20160683e868 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1151,7 +1151,7 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	return effects;
 }
 
-void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects,
+void nvme_passthru_end(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u32 effects,
 		       struct nvme_command *cmd, int status)
 {
 	if (effects & NVME_CMD_EFFECTS_CSE_MASK) {
@@ -1167,6 +1167,8 @@ void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects,
 		nvme_queue_scan(ctrl);
 		flush_work(&ctrl->scan_work);
 	}
+	if (ns)
+		return;
 
 	switch (cmd->common.opcode) {
 	case nvme_admin_set_features:
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index 91e6d0347579..b3e322e4ade3 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -147,6 +147,7 @@ static int nvme_submit_user_cmd(struct request_queue *q,
 		unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
 		u32 meta_seed, u64 *result, unsigned timeout, bool vec)
 {
+	struct nvme_ns *ns = q->queuedata;
 	struct nvme_ctrl *ctrl;
 	struct request *req;
 	void *meta = NULL;
@@ -181,7 +182,7 @@ static int nvme_submit_user_cmd(struct request_queue *q,
 	blk_mq_free_request(req);
 
 	if (effects)
-		nvme_passthru_end(ctrl, effects, cmd, ret);
+		nvme_passthru_end(ctrl, ns, effects, cmd, ret);
 
 	return ret;
 }
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index a892d679e338..8e28d2de45c0 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -1063,7 +1063,7 @@ static inline void nvme_auth_free(struct nvme_ctrl *ctrl) {};
 u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 			 u8 opcode);
 int nvme_execute_passthru_rq(struct request *rq, u32 *effects);
-void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects,
+void nvme_passthru_end(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u32 effects,
 		       struct nvme_command *cmd, int status);
 struct nvme_ctrl *nvme_ctrl_from_file(struct file *file);
 struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid);
diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c
index adc0958755d6..a0a292d49588 100644
--- a/drivers/nvme/target/passthru.c
+++ b/drivers/nvme/target/passthru.c
@@ -216,6 +216,7 @@ static void nvmet_passthru_execute_cmd_work(struct work_struct *w)
 	struct nvmet_req *req = container_of(w, struct nvmet_req, p.work);
 	struct request *rq = req->p.rq;
 	struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl;
+	struct nvme_ns *ns = rq->q->queuedata;
 	u32 effects;
 	int status;
 
@@ -242,7 +243,7 @@ static void nvmet_passthru_execute_cmd_work(struct work_struct *w)
 	blk_mq_free_request(rq);
 
 	if (effects)
-		nvme_passthru_end(ctrl, effects, req->cmd, status);
+		nvme_passthru_end(ctrl, ns, effects, req->cmd, status);
 }
 
 static enum rq_end_io_ret nvmet_passthru_req_done(struct request *rq,
-- 
2.40.1



^ permalink raw reply related	[relevance 3%]

* Re: [PATCHv2 0/2] block,nvme: latency-based I/O scheduler
  @ 2024-04-05 15:03  5%     ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-04-05 15:03 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: Hannes Reinecke, Christoph Hellwig, Sagi Grimberg, Jens Axboe,
	linux-nvme, linux-block

On Fri, Apr 05, 2024 at 08:21:14AM +0200, Hannes Reinecke wrote:
> On 4/4/24 23:14, Keith Busch wrote:
> > On Wed, Apr 03, 2024 at 04:17:54PM +0200, Hannes Reinecke wrote:
> > > Hi all,
> > > 
> > > there had been several attempts to implement a latency-based I/O
> > > scheduler for native nvme multipath, all of which had its issues.
> > > 
> > > So time to start afresh, this time using the QoS framework
> > > already present in the block layer.
> > > It consists of two parts:
> > > - a new 'blk-nlatency' QoS module, which is just a simple per-node
> > >    latency tracker
> > > - a 'latency' nvme I/O policy
> > Whatever happened with the io-depth based path selector? That should
> > naturally align with the lower latency path, and that metric is cheaper
> > to track.
> 
> Turns out that tracking queue depth (on the NVMe level) always requires
> an atomic, and with that a performance impact.
> The qos/blk-stat framework is already present, and as the numbers show
> actually leads to a performance improvement.
> 
> So I'm not quite sure what the argument 'cheaper to track' buys us here...

I was considering the blk_stat framework compared to those atomic
operations. I usually don't enable stats because all the extra
ktime_get_ns() and indirect calls are relatively costly. If you're
enabling stats anyway though, then yeah, I guess I don't really have a
point and your idea here seems pretty reasonable.


^ permalink raw reply	[relevance 5%]

* Re: [PATCHv2 0/2] block,nvme: latency-based I/O scheduler
  @ 2024-04-04 21:14  5% ` Keith Busch
    0 siblings, 1 reply; 200+ results
From: Keith Busch @ 2024-04-04 21:14 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: Christoph Hellwig, Sagi Grimberg, Jens Axboe, linux-nvme, linux-block

On Wed, Apr 03, 2024 at 04:17:54PM +0200, Hannes Reinecke wrote:
> Hi all,
> 
> there had been several attempts to implement a latency-based I/O
> scheduler for native nvme multipath, all of which had its issues.
> 
> So time to start afresh, this time using the QoS framework
> already present in the block layer.
> It consists of two parts:
> - a new 'blk-nlatency' QoS module, which is just a simple per-node
>   latency tracker
> - a 'latency' nvme I/O policy
 
Whatever happened with the io-depth based path selector? That should
naturally align with the lower latency path, and that metric is cheaper
to track.


^ permalink raw reply	[relevance 5%]

* [GIT PULL] nvme fixes for Linux 6.9
@ 2024-04-04 16:59  5% Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-04-04 16:59 UTC (permalink / raw)
  To: axboe; +Cc: hch, sagi, linux-nvme

[resend for the mailing list]
The following changes since commit 22d24a544b0d49bbcbd61c8c0eaf77d3c9297155:

  block: fix overflow in blk_ioctl_discard() (2024-04-02 07:43:24 -0600)

are available in the Git repository at:

  git://git.infradead.org/nvme.git tags/nvme-6.9-2024-04-04

for you to fetch changes up to 205fb5fa6fde1b5b426015eb1ff69f2ff25ef5bb:

  nvme-fc: rename free_ctrl callback to match name pattern (2024-04-04 08:47:56 -0700)

----------------------------------------------------------------
nvme fixes for Linux 6.9

 - Atomic queue limits fixes (Christoph)
 - Fabrics fixes (Hannes, Daniel)

----------------------------------------------------------------
Christoph Hellwig (3):
      nvme-multipath: don't inherit LBA-related fields for the multipath node
      nvme: split nvme_update_zone_info
      nvme: don't create a multipath node for zero capacity devices

Daniel Wagner (2):
      nvmet-fc: move RCU read lock to nvmet_fc_assoc_exists
      nvme-fc: rename free_ctrl callback to match name pattern

Hannes Reinecke (1):
      nvmet: implement unique discovery NQN

 drivers/nvme/host/core.c       | 41 ++++++++++++++++++++++++++++--------
 drivers/nvme/host/fc.c         |  4 ++--
 drivers/nvme/host/nvme.h       | 12 +++++++++--
 drivers/nvme/host/zns.c        | 33 +++++++++++++++++------------
 drivers/nvme/target/configfs.c | 47 ++++++++++++++++++++++++++++++++++++++++++
 drivers/nvme/target/core.c     |  7 +++++++
 drivers/nvme/target/fc.c       | 17 ++++++++-------
 7 files changed, 128 insertions(+), 33 deletions(-)


^ permalink raw reply	[relevance 5%]

* Re: [PATCHv5] nvmet: implement unique discovery NQN
  @ 2024-04-04 15:58  5% ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-04-04 15:58 UTC (permalink / raw)
  To: Hannes Reinecke; +Cc: Christoph Hellwig, Sagi Grimberg, linux-nvme

On Wed, Apr 03, 2024 at 01:31:14PM +0200, Hannes Reinecke wrote:
> Unique discovery NQNs allow to differentiate between discovery
> services from (typically physically separate) NVMe-oF subsystems.
> This is required for establishing secured connections as otherwise
> the credentials won't be unique and the integrity of the connection
> cannot be guaranteed.
> This patch adds a configfs attribute 'discovery_nqn' in the 'nvmet'
> configfs directory to specify the unique discovery NQN.

Thanks, applied to nvme-6.9.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH] nvme: don't create a multipath node for zero capacity devices
  @ 2024-04-04 15:58  5% ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-04-04 15:58 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: sagi, linux-nvme, Nilay Shroff

On Wed, Apr 03, 2024 at 02:47:17PM +0200, Christoph Hellwig wrote:
> Apparently there are nvme controllers around that report namespaces
> in the namespace list which have zero capacity.  Return -ENXIO instead
> of -ENODEV from nvme_update_ns_info_block so we don't create a hidden
> multipath node for these namespaces but entirely ignore them.

Thanks, applied to nvme-6.9.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH v2 0/2] nvme(t)-fc: couple of fixes/cleanups
  @ 2024-04-04 15:57  5% ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-04-04 15:57 UTC (permalink / raw)
  To: Daniel Wagner
  Cc: James Smart, Christoph Hellwig, Sagi Grimberg, Hannes Reinecke,
	linux-nvme, linux-kernel

On Thu, Apr 04, 2024 at 04:41:29PM +0200, Daniel Wagner wrote:
> Both patches are just a rebased to the current nvme-6.9 head.
> 
> Daniel Wagner (2):
>   nvmet-fc: move RCU read lock to nvmet_fc_assoc_exists
>   nvme-fc: rename free_ctrl callback to match name pattern

Thanks, applied to nvme-6.9.


^ permalink raw reply	[relevance 5%]

* [PATCH AUTOSEL 5.10 7/8] drivers/nvme: Add quirks for device 126f:2262
       [not found]     <20240403172006.353022-1-sashal@kernel.org>
@ 2024-04-03 17:20  4% ` Sasha Levin
  0 siblings, 0 replies; 200+ results
From: Sasha Levin @ 2024-04-03 17:20 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Jiawei Fu (iBug),
	Christoph Hellwig, Sagi Grimberg, Keith Busch, Sasha Levin,
	linux-nvme

From: "Jiawei Fu (iBug)" <i@ibugone.com>

[ Upstream commit e89086c43f0500bc7c4ce225495b73b8ce234c1f ]

This commit adds NVME_QUIRK_NO_DEEPEST_PS and NVME_QUIRK_BOGUS_NID for
device [126f:2262], which appears to be a generic VID:PID pair used for
many SSDs based on the Silicon Motion SM2262/SM2262EN controller.

Two of my SSDs with this VID:PID pair exhibit the same behavior:

  * They frequently have trouble exiting the deepest power state (5),
    resulting in the entire disk unresponsive.
    Verified by setting nvme_core.default_ps_max_latency_us=10000 and
    observing them behaving normally.
  * They produce all-zero nguid and eui64 with `nvme id-ns` command.

The offending products are:

  * HP SSD EX950 1TB
  * HIKVISION C2000Pro 2TB

Signed-off-by: Jiawei Fu <i@ibugone.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/nvme/host/pci.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 970a1b374a669..5242feda5471a 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -3199,6 +3199,9 @@ static const struct pci_device_id nvme_id_table[] = {
 				NVME_QUIRK_BOGUS_NID, },
 	{ PCI_VDEVICE(REDHAT, 0x0010),	/* Qemu emulated controller */
 		.driver_data = NVME_QUIRK_BOGUS_NID, },
+	{ PCI_DEVICE(0x126f, 0x2262),	/* Silicon Motion generic */
+		.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
+				NVME_QUIRK_BOGUS_NID, },
 	{ PCI_DEVICE(0x126f, 0x2263),	/* Silicon Motion unidentified */
 		.driver_data = NVME_QUIRK_NO_NS_DESC_LIST, },
 	{ PCI_DEVICE(0x1bb1, 0x0100),   /* Seagate Nytro Flash Storage */
-- 
2.43.0



^ permalink raw reply related	[relevance 4%]

* [PATCH AUTOSEL 5.15 7/8] drivers/nvme: Add quirks for device 126f:2262
       [not found]     <20240403171945.350716-1-sashal@kernel.org>
@ 2024-04-03 17:19  4% ` Sasha Levin
  0 siblings, 0 replies; 200+ results
From: Sasha Levin @ 2024-04-03 17:19 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Jiawei Fu (iBug),
	Christoph Hellwig, Sagi Grimberg, Keith Busch, Sasha Levin,
	linux-nvme

From: "Jiawei Fu (iBug)" <i@ibugone.com>

[ Upstream commit e89086c43f0500bc7c4ce225495b73b8ce234c1f ]

This commit adds NVME_QUIRK_NO_DEEPEST_PS and NVME_QUIRK_BOGUS_NID for
device [126f:2262], which appears to be a generic VID:PID pair used for
many SSDs based on the Silicon Motion SM2262/SM2262EN controller.

Two of my SSDs with this VID:PID pair exhibit the same behavior:

  * They frequently have trouble exiting the deepest power state (5),
    resulting in the entire disk unresponsive.
    Verified by setting nvme_core.default_ps_max_latency_us=10000 and
    observing them behaving normally.
  * They produce all-zero nguid and eui64 with `nvme id-ns` command.

The offending products are:

  * HP SSD EX950 1TB
  * HIKVISION C2000Pro 2TB

Signed-off-by: Jiawei Fu <i@ibugone.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/nvme/host/pci.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index fd20f3fdb1592..7bb74112fef37 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -3339,6 +3339,9 @@ static const struct pci_device_id nvme_id_table[] = {
 				NVME_QUIRK_BOGUS_NID, },
 	{ PCI_VDEVICE(REDHAT, 0x0010),	/* Qemu emulated controller */
 		.driver_data = NVME_QUIRK_BOGUS_NID, },
+	{ PCI_DEVICE(0x126f, 0x2262),	/* Silicon Motion generic */
+		.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
+				NVME_QUIRK_BOGUS_NID, },
 	{ PCI_DEVICE(0x126f, 0x2263),	/* Silicon Motion unidentified */
 		.driver_data = NVME_QUIRK_NO_NS_DESC_LIST, },
 	{ PCI_DEVICE(0x1bb1, 0x0100),   /* Seagate Nytro Flash Storage */
-- 
2.43.0



^ permalink raw reply related	[relevance 4%]

* [PATCH AUTOSEL 6.1 14/15] drivers/nvme: Add quirks for device 126f:2262
       [not found]     <20240403171909.345570-1-sashal@kernel.org>
@ 2024-04-03 17:18  4% ` Sasha Levin
  0 siblings, 0 replies; 200+ results
From: Sasha Levin @ 2024-04-03 17:18 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Jiawei Fu (iBug),
	Christoph Hellwig, Sagi Grimberg, Keith Busch, Sasha Levin,
	linux-nvme

From: "Jiawei Fu (iBug)" <i@ibugone.com>

[ Upstream commit e89086c43f0500bc7c4ce225495b73b8ce234c1f ]

This commit adds NVME_QUIRK_NO_DEEPEST_PS and NVME_QUIRK_BOGUS_NID for
device [126f:2262], which appears to be a generic VID:PID pair used for
many SSDs based on the Silicon Motion SM2262/SM2262EN controller.

Two of my SSDs with this VID:PID pair exhibit the same behavior:

  * They frequently have trouble exiting the deepest power state (5),
    resulting in the entire disk unresponsive.
    Verified by setting nvme_core.default_ps_max_latency_us=10000 and
    observing them behaving normally.
  * They produce all-zero nguid and eui64 with `nvme id-ns` command.

The offending products are:

  * HP SSD EX950 1TB
  * HIKVISION C2000Pro 2TB

Signed-off-by: Jiawei Fu <i@ibugone.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/nvme/host/pci.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 3d01290994d89..5ff09f2cacab7 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -3471,6 +3471,9 @@ static const struct pci_device_id nvme_id_table[] = {
 				NVME_QUIRK_BOGUS_NID, },
 	{ PCI_VDEVICE(REDHAT, 0x0010),	/* Qemu emulated controller */
 		.driver_data = NVME_QUIRK_BOGUS_NID, },
+	{ PCI_DEVICE(0x126f, 0x2262),	/* Silicon Motion generic */
+		.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
+				NVME_QUIRK_BOGUS_NID, },
 	{ PCI_DEVICE(0x126f, 0x2263),	/* Silicon Motion unidentified */
 		.driver_data = NVME_QUIRK_NO_NS_DESC_LIST |
 				NVME_QUIRK_BOGUS_NID, },
-- 
2.43.0



^ permalink raw reply related	[relevance 4%]

* [PATCH AUTOSEL 6.6 17/20] drivers/nvme: Add quirks for device 126f:2262
       [not found]     <20240403171815.342668-1-sashal@kernel.org>
@ 2024-04-03 17:17  4% ` Sasha Levin
  0 siblings, 0 replies; 200+ results
From: Sasha Levin @ 2024-04-03 17:17 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Jiawei Fu (iBug),
	Christoph Hellwig, Sagi Grimberg, Keith Busch, Sasha Levin,
	linux-nvme

From: "Jiawei Fu (iBug)" <i@ibugone.com>

[ Upstream commit e89086c43f0500bc7c4ce225495b73b8ce234c1f ]

This commit adds NVME_QUIRK_NO_DEEPEST_PS and NVME_QUIRK_BOGUS_NID for
device [126f:2262], which appears to be a generic VID:PID pair used for
many SSDs based on the Silicon Motion SM2262/SM2262EN controller.

Two of my SSDs with this VID:PID pair exhibit the same behavior:

  * They frequently have trouble exiting the deepest power state (5),
    resulting in the entire disk unresponsive.
    Verified by setting nvme_core.default_ps_max_latency_us=10000 and
    observing them behaving normally.
  * They produce all-zero nguid and eui64 with `nvme id-ns` command.

The offending products are:

  * HP SSD EX950 1TB
  * HIKVISION C2000Pro 2TB

Signed-off-by: Jiawei Fu <i@ibugone.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/nvme/host/pci.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index f8e92404a6591..b985142fb84b9 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -3361,6 +3361,9 @@ static const struct pci_device_id nvme_id_table[] = {
 				NVME_QUIRK_BOGUS_NID, },
 	{ PCI_VDEVICE(REDHAT, 0x0010),	/* Qemu emulated controller */
 		.driver_data = NVME_QUIRK_BOGUS_NID, },
+	{ PCI_DEVICE(0x126f, 0x2262),	/* Silicon Motion generic */
+		.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
+				NVME_QUIRK_BOGUS_NID, },
 	{ PCI_DEVICE(0x126f, 0x2263),	/* Silicon Motion unidentified */
 		.driver_data = NVME_QUIRK_NO_NS_DESC_LIST |
 				NVME_QUIRK_BOGUS_NID, },
-- 
2.43.0



^ permalink raw reply related	[relevance 4%]

* [PATCH AUTOSEL 6.8 24/28] drivers/nvme: Add quirks for device 126f:2262
       [not found]     <20240403171656.335224-1-sashal@kernel.org>
@ 2024-04-03 17:16  4% ` Sasha Levin
  0 siblings, 0 replies; 200+ results
From: Sasha Levin @ 2024-04-03 17:16 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Jiawei Fu (iBug),
	Christoph Hellwig, Sagi Grimberg, Keith Busch, Sasha Levin,
	linux-nvme

From: "Jiawei Fu (iBug)" <i@ibugone.com>

[ Upstream commit e89086c43f0500bc7c4ce225495b73b8ce234c1f ]

This commit adds NVME_QUIRK_NO_DEEPEST_PS and NVME_QUIRK_BOGUS_NID for
device [126f:2262], which appears to be a generic VID:PID pair used for
many SSDs based on the Silicon Motion SM2262/SM2262EN controller.

Two of my SSDs with this VID:PID pair exhibit the same behavior:

  * They frequently have trouble exiting the deepest power state (5),
    resulting in the entire disk unresponsive.
    Verified by setting nvme_core.default_ps_max_latency_us=10000 and
    observing them behaving normally.
  * They produce all-zero nguid and eui64 with `nvme id-ns` command.

The offending products are:

  * HP SSD EX950 1TB
  * HIKVISION C2000Pro 2TB

Signed-off-by: Jiawei Fu <i@ibugone.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/nvme/host/pci.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index e6267a6aa3801..8e0bb9692685d 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -3363,6 +3363,9 @@ static const struct pci_device_id nvme_id_table[] = {
 				NVME_QUIRK_BOGUS_NID, },
 	{ PCI_VDEVICE(REDHAT, 0x0010),	/* Qemu emulated controller */
 		.driver_data = NVME_QUIRK_BOGUS_NID, },
+	{ PCI_DEVICE(0x126f, 0x2262),	/* Silicon Motion generic */
+		.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
+				NVME_QUIRK_BOGUS_NID, },
 	{ PCI_DEVICE(0x126f, 0x2263),	/* Silicon Motion unidentified */
 		.driver_data = NVME_QUIRK_NO_NS_DESC_LIST |
 				NVME_QUIRK_BOGUS_NID, },
-- 
2.43.0



^ permalink raw reply related	[relevance 4%]

* RE: [PATCH] fabrics : allow host to create duplicate connections to target
  @ 2024-04-03 16:43  4%   ` Engel, Amit
  0 siblings, 0 replies; 200+ results
From: Engel, Amit @ 2024-04-03 16:43 UTC (permalink / raw)
  To: Daniel Wagner, linux-nvme, Nilay Shroff; +Cc: kbusch, gjoyce

Looks good, thank you!


Internal Use - Confidential
-----Original Message-----
From: Daniel Wagner <dwagner@suse.de>
Sent: Wednesday, 3 April 2024 16:52
To: linux-nvme@lists.infradead.org; Nilay Shroff <nilay@linux.ibm.com>
Cc: Daniel Wagner <dwagner@suse.de>; Engel, Amit <Amit.Engel@Dell.com>; kbusch@kernel.org; gjoyce@ibm.com; Engel, Amit <Amit.Engel@Dell.com>
Subject: Re: [PATCH] fabrics : allow host to create duplicate connections to target


[EXTERNAL EMAIL]


On Wed, 03 Apr 2024 19:04:16 +0530, Nilay Shroff wrote:
> Apparently the nvmf connect code doesn't allow creating duplicate
> connections from a host to the same target even though user specifies
> option "--duplicate-connect" or "-D".
> This patch help fix this issue.
>
>

Applied, thanks!

[1/1] fabrics : allow host to create duplicate connections to target
      commit: fd8ecb054ce53acae504e54666bec6c9976a82dd

Best regards,
--
Daniel Wagner <dwagner@suse.de>


^ permalink raw reply	[relevance 4%]

* Re: [Bug Report] nvme-cli commands fails to open head disk node and print error
  2024-04-02 22:07  4%   ` Kamaljit Singh
@ 2024-04-03  3:07  5%     ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-04-03  3:07 UTC (permalink / raw)
  To: Kamaljit Singh
  Cc: Daniel Wagner, Christoph Hellwig, linux-nvme, linux-block, axboe,
	Gregory Joyce, Nilay Shroff

On Tue, Apr 02, 2024 at 10:07:25PM +0000, Kamaljit Singh wrote:
> 
> Hi Daniel,
> Your question about the nvme-cli version makes me wonder if there is a
> version compatibility matrix (nvme-cli vs kernel) somewhere you could
> point me to? I didn't see such info in the nvme-cli release notes.

I don't believe there's ever been an intentional incompatibility for
nvme-cli vs. kernel versions. Most of the incompatibility problems come
from sysfs dependencies, but those should not be necessary for the core
passthrough commands on any version pairing.

And yeah, there should be sane fallbacks for older kernels in case a new
feature introduces a regression, but it's not always perfect. We try to
fix them as we learn about them, so bug reports on the github are useful
for tracking that.

> For example, I've seen issues with newer than nvme-cli v1.16 on Ubuntu
> 22.04 (stock & newer kernels). From a compatibility perspective I do
> wonder whether circumventing a distro's package manager and directly
> installing newer nvme-cli versions might be a bad idea. This could
> possibly become dire if there were intentional version dependencies
> across the stack.

The struggle is real, isn't it? New protocol features are added upstream
faster than distro package updates provide their users. On the other
hand, distros may be cautious to potential instability.


^ permalink raw reply	[relevance 5%]

* Re: WQ_UNBOUND workqueue warnings from multiple drivers
  @ 2024-04-02 23:50  5%     ` Kamaljit Singh
    0 siblings, 1 reply; 200+ results
From: Kamaljit Singh @ 2024-04-02 23:50 UTC (permalink / raw)
  To: Chaitanya Kulkarni, Sagi Grimberg; +Cc: kbusch, linux-kernel, linux-nvme

Sagi, Chaitanya,
 
Sorry for the delay, found your replies in the junk folder :(
 
> Was the test you were running read-heavy?
No, most of the failing fio tests were doing heavy writes. All were with 8 Controllers and 32 NS each. io-specs are below.

[1] bs=16k, iodepth=16, rwmixread=0, numjobs=16
Failed in ~1 min

Some others were:
[2] bs=8k, iodepth=16, rwmixread=5, numjobs=16 
[3] bs=8k, iodepth=16, rwmixread=50, numjobs=16 
 
Thanks,
Kamaljit
 
From: Chaitanya Kulkarni <chaitanyak@nvidia.com>
Date: Thursday, March 21, 2024 at 10:36
To: Sagi Grimberg <sagi@grimberg.me>, Kamaljit Singh <Kamaljit.Singh1@wdc.com>
Cc: kbusch@kernel.org <kbusch@kernel.org>, linux-kernel@vger.kernel.org <linux-kernel@vger.kernel.org>, linux-nvme@lists.infradead.org <linux-nvme@lists.infradead.org>
Subject: Re: WQ_UNBOUND workqueue warnings from multiple drivers
CAUTION: This email originated from outside of Western Digital. Do not click on links or open attachments unless you recognize the sender and know that the content is safe.


On 3/20/24 02:11, Sagi Grimberg wrote:
>
>
> On 19/03/2024 0:33, Kamaljit Singh wrote:
>> Hello,
>>
>> After switching from Kernel v6.6.2 to v6.6.21 we're now seeing these
>> workqueue
>> warnings. I found a discussion thread about the the Intel drm driver
>> here
>> https://lore.kernel.org/lkml/ZO-BkaGuVCgdr3wc@slm.duckdns.org/T/
>>
>> and this related bug report
>> https://gitlab.freedesktop.org/drm/intel/-/issues/9245
>> but that that drm fix isn't merged into v6.6.21. It appears that we
>> may need the same
>> WQ_UNBOUND change to the nvme host tcp driver among others.
>>   [Fri Mar 15 22:30:06 2024] workqueue: nvme_tcp_io_work [nvme_tcp]
>> hogged CPU for >10000us 4 times, consider switching to WQ_UNBOUND
>> [Fri Mar 15 23:44:58 2024] workqueue: drain_vmap_area_work hogged CPU
>> for >10000us 4 times, consider switching to WQ_UNBOUND
>> [Sat Mar 16 09:55:27 2024] workqueue: drain_vmap_area_work hogged CPU
>> for >10000us 8 times, consider switching to WQ_UNBOUND
>> [Sat Mar 16 17:51:18 2024] workqueue: nvme_tcp_io_work [nvme_tcp]
>> hogged CPU for >10000us 8 times, consider switching to WQ_UNBOUND
>> [Sat Mar 16 23:04:14 2024] workqueue: nvme_tcp_io_work [nvme_tcp]
>> hogged CPU for >10000us 16 times, consider switching to WQ_UNBOUND
>> [Sun Mar 17 21:35:46 2024] perf: interrupt took too long (2707 >
>> 2500), lowering kernel.perf_event_max_sample_rate to 73750
>> [Sun Mar 17 21:49:34 2024] workqueue: drain_vmap_area_work hogged CPU
>> for >10000us 16 times, consider switching to WQ_UNBOUND
>> ...
>> workqueue: drm_fb_helper_damage_work [drm_kms_helper] hogged CPU for
>> >10000us 32 times, consider switching to WQ_UNBOUND
>
> Hey Kamaljit,
>
> It's interesting that this happens because nvme_tcp_io_work is bound to
> 1 jiffie.
> Although in theory we do not stop receiving from a socket once we
> started, so
> I guess this can happen in some extreme cases. Was the test you were
> running
> read-heavy?
>
> I was thinking that we may want to optionally move the recv path to
> softirq instead to
> get some latency improvements, although I don't know if that would
> improve the situation
> if we end up spending a lot of time in soft-irq...
>
>>     Thanks,
>> Kamaljit Singh
>
>

we need a regular test for this in blktests as it doesn't look like we
caught this in
regular testing ...

Kamaljit, can you please provide details of the tests you are running so
we can
reproduce ?

-ck



^ permalink raw reply	[relevance 5%]

* Re: [Bug Report] nvme-cli commands fails to open head disk node and print error
  @ 2024-04-02 22:07  4%   ` Kamaljit Singh
  2024-04-03  3:07  5%     ` Keith Busch
  0 siblings, 1 reply; 200+ results
From: Kamaljit Singh @ 2024-04-02 22:07 UTC (permalink / raw)
  To: Daniel Wagner
  Cc: Christoph Hellwig, Keith Busch, linux-nvme, linux-block, axboe,
	Gregory Joyce, Nilay Shroff


Hi Daniel,
Your question about the nvme-cli version makes me wonder if there is a version compatibility matrix (nvme-cli vs kernel) somewhere you could point me to? I didn't see such info in the nvme-cli release notes.

For example, I've seen issues with newer than nvme-cli v1.16 on Ubuntu 22.04 (stock & newer kernels). From a compatibility perspective I do wonder whether circumventing a distro's package manager and directly installing newer nvme-cli versions might be a bad idea. This could possibly become dire if there were intentional version dependencies across the stack.
 
Thanks,
Kamaljit
 

From: Linux-nvme <linux-nvme-bounces@lists.infradead.org> on behalf of Daniel Wagner <dwagner@suse.de>
Date: Thursday, March 28, 2024 at 01:46
To: Nilay Shroff <nilay@linux.ibm.com>
Cc: Christoph Hellwig <hch@lst.de>, Keith Busch <kbusch@kernel.org>, linux-nvme@lists.infradead.org <linux-nvme@lists.infradead.org>, linux-block@vger.kernel.org <linux-block@vger.kernel.org>, axboe@fb.com <axboe@fb.com>, Gregory Joyce <gjoyce@ibm.com>
Subject: Re: [Bug Report] nvme-cli commands fails to open head disk node and print error
CAUTION: This email originated from outside of Western Digital. Do not click on links or open attachments unless you recognize the sender and know that the content is safe.


On Thu, Mar 28, 2024 at 12:00:07PM +0530, Nilay Shroff wrote:
> From the above output it's evident that nvme-cli attempts to open the disk node /dev/nvme0n3
> however that entry doesn't exist. Apparently, on 6.9-rc1 kernel though head disk node /dev/nvme0n3
> doesn't exit, the relevant entries /sys/block/nvme0c0n3 and /sys/block/nvme0n3 are present.

I assume you are not using the latest version of nvme-cli/libnvme. The
latest version does not try to open any block devices when scanning the
sysfs topology.

What does `nvme version` say?

^ permalink raw reply	[relevance 4%]

* Re: [PATCH v2] nvme: split nvme_update_zone_info
    2024-04-02 15:12  5% ` Keith Busch
@ 2024-04-02 19:58  5% ` Keith Busch
  1 sibling, 0 replies; 200+ results
From: Keith Busch @ 2024-04-02 19:58 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: sagi, linux-nvme, Kanchan Joshi

On Tue, Apr 02, 2024 at 04:47:54PM +0200, Christoph Hellwig wrote:
> +	lim->chunk_sectors = ns->head->zsze =
> +		nvme_lba_to_sect(ns->head, zi->zone_size);

Hey, since chunk_sectors is only 32 bits and zone_size is 64, do we need
to worry about massive zones breaking this limit? Unlikely as we are to
see a multi-terabyte zone, it's easy enough to check. We never have
before though, so I guess we're confident it won't happen?


^ permalink raw reply	[relevance 5%]

* Re: [PATCH] nvme-pci: Add sleep quirk for Samsung 990 Evo
  @ 2024-04-02 16:08  5%     ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-04-02 16:08 UTC (permalink / raw)
  To: Georg Gottleuber
  Cc: Christoph Hellwig, Werner Sembach, Jens Axboe, Sagi Grimberg,
	Georg Gottleuber, linux-nvme, linux-kernel

On Tue, Apr 02, 2024 at 05:13:48PM +0200, Georg Gottleuber wrote:
> Am 02.04.24 um 15:16 schrieb Christoph Hellwig:
> > On Thu, Mar 28, 2024 at 02:09:22PM +0100, Werner Sembach wrote:
> > > From: Georg Gottleuber <ggo@tuxedocomputers.com>
> > > 
> > > On some TUXEDO platforms, a Samsung 990 Evo NVMe leads to a high
> > > power consumption in s2idle sleep (2-3 watts).
> > > 
> > > This patch applies 'Force No Simple Suspend' quirk to achieve a
> > > sleep with a lower power consumption, typically around 0.5 watts.
> > 
> > Does this only apply to a specific SSD or all SSDs on this platform?
> > How do these platforms even get into the conditional?  Probably
> > through acpi_storage_d3 setting, which probably is set incorrectly
> > for the platform?  Any chance to just fix that?
> 
> Yes, this only apply to a specific SSD. I tested these SSDs (on
> PH4PRX1_PH6PRX1):
> * Kingston NV1, SNVS250G
> * Samsung 980, MZ-V8V500
> * Samsung 970 Evo, S46DNX0K900454D
> * Samsung 980 Pro, S69ENX0T709932L
> 
> S2idle consumes around 0.4 watts with these SSDs. But with a Samsung 990 Evo
> s2idle on this platform consumes 3.7 to 4.4 watts (6.8 vs 6.5 kernel).

For all these different SSDs you tested in this platform, do you see the
"platform quirk: setting simple suspend" in the dmesg? I just want to
confirm if the platform is changing the reported acpi_storage_d3 value
for different SSD models or if they're all the same.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH v2] nvme: split nvme_update_zone_info
  @ 2024-04-02 15:12  5% ` Keith Busch
  2024-04-02 19:58  5% ` Keith Busch
  1 sibling, 0 replies; 200+ results
From: Keith Busch @ 2024-04-02 15:12 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: sagi, linux-nvme, Kanchan Joshi

On Tue, Apr 02, 2024 at 04:47:54PM +0200, Christoph Hellwig wrote:
> nvme_update_zone_info does (admin queue) I/O to the device and can fail.
> We fail to abort the queue limits update if that happen, but really
> should avoid with the frozen I/O queue as much as possible anyway.
> 
> Split the logic into a helper to query the information that can be
> called on an unfrozen queue and one to apply it to the queue limits.
> 
> Fixes: 9b130d681443 ("nvme: use the atomic queue limits update API")
> Reported-by: Kanchan Joshi <joshi.k@samsung.com>
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Thanks, applied to nvme-6.9.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH] nvme-multipath: don't inherit LBA-related fields for the multipath node
  @ 2024-04-02 14:15  5%     ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-04-02 14:15 UTC (permalink / raw)
  To: Nilay Shroff; +Cc: Christoph Hellwig, sagi, linux-nvme

On Tue, Apr 02, 2024 at 06:51:02PM +0530, Nilay Shroff wrote:
> I found this one couldn't made into 6.9-rc2. So are we planning to queue this fix
> for the next rc?

Yes, it'll get in. I send pull requests on Thursday's but some unplanned
travel events got in the way last week.


^ permalink raw reply	[relevance 5%]

* [Bug Report] nvme-cli commands fails to open head disk node and print error
@ 2024-03-28  6:30  3% Nilay Shroff
    0 siblings, 1 reply; 200+ results
From: Nilay Shroff @ 2024-03-28  6:30 UTC (permalink / raw)
  To: Christoph Hellwig, Keith Busch
  Cc: linux-nvme, linux-block, axboe, Gregory Joyce

Hi,

We observed that nvme-cli commands (nvme list, nvme list-subsys, nvme show topology etc.) print error message prior to printing the actual output.

Notes and observations:
=======================
This issue is observed on the latest linus kernel tree (v6.9-rc1). This was working well in kernel v6.8.

Test details:
=============
I have an NVMe disk which has two controllers, two namespaces and it's multipath capable:

# nvme list-ns /dev/nvme0 
[   0]:0x1
[   1]:0x3

One of namespaces has zero disk capacity:

# nvme id-ns /dev/nvme0 -n 0x3
NVME Identify Namespace 3:
nsze    : 0
ncap    : 0
nuse    : 0
nsfeat  : 0x14
nlbaf   : 4
flbas   : 0
<snip>

Another namespace has non-zero disk capacity:

# nvme id-ns /dev/nvme0 -n 0x1 
NVME Identify Namespace 1:
nsze    : 0x156d56
ncap    : 0x156d56
nuse    : 0
nsfeat  : 0x14
nlbaf   : 4
flbas   : 0
<snip>
 
6.8 kernel:
----------

# nvme list -v 

Subsystem        Subsystem-NQN                                                                                    Controllers
---------------- ------------------------------------------------------------------------------------------------ ----------------
nvme-subsys0     nqn.2019-10.com.kioxia:KCM7DRUG1T92:3D60A04906N1                                                 nvme0, nvme2

Device   SN                   MN                                       FR       TxPort Address        Slot   Subsystem    Namespaces      
-------- -------------------- ---------------------------------------- -------- ------ -------------- ------ ------------ ----------------
nvme0    3D60A04906N1         1.6TB NVMe Gen4 U.2 SSD IV               REV.CAS2 pcie   0524:28:00.0          nvme-subsys0 nvme0n1
nvme2    3D60A04906N1         1.6TB NVMe Gen4 U.2 SSD IV               REV.CAS2 pcie   0584:28:00.0          nvme-subsys0 

Device       Generic      NSID       Usage                      Format           Controllers     
------------ ------------ ---------- -------------------------- ---------------- ----------------
/dev/nvme0n1 /dev/ng0n1   0x1          0.00   B /   5.75  GB      4 KiB +  0 B   nvme0

As we can see above the namespace (0x3) with zero disk capacity is not listed in the output.
Furthermore, we don't create head disk node (i.e. /dev/nvmeXnY) for a namespace with zero
disk capacity and also we don't have any entry for such disk under /sys/block/.  

6.9-rc1 kernel:
---------------

# nvme list -v 

Failed to open ns nvme0n3, errno 2 <== error is printed first followed by output

Subsystem        Subsystem-NQN                                                                                    Controllers
---------------- ------------------------------------------------------------------------------------------------ ----------------
nvme-subsys0     nqn.2019-10.com.kioxia:KCM7DRUG1T92:3D60A04906N1                                                 nvme0, nvme2

Device   SN                   MN                                       FR       TxPort Address        Slot   Subsystem    Namespaces      
-------- -------------------- ---------------------------------------- -------- ------ -------------- ------ ------------ ----------------
nvme0    3D60A04906N1         1.6TB NVMe Gen4 U.2 SSD IV               REV.CAS2 pcie   0524:28:00.0          nvme-subsys0 nvme0n1
nvme2    3D60A04906N1         1.6TB NVMe Gen4 U.2 SSD IV               REV.CAS2 pcie   0584:28:00.0          nvme-subsys0 

Device       Generic      NSID       Usage                      Format           Controllers     
------------ ------------ ---------- -------------------------- ---------------- ----------------
/dev/nvme0n1 /dev/ng0n1   0x1          0.00   B /   5.75  GB      4 KiB +  0 B   nvme0


# nvme list-subsys 

Failed to open ns nvme0n3, errno 2 <== error is printed first followed by output

nvme-subsys0 - NQN=nqn.2019-10.com.kioxia:KCM7DRUG1T92:3D60A04906N1
               hostnqn=nqn.2014-08.org.nvmexpress:uuid:41528538-e8ad-4eaf-84a7-9c552917d988
               iopolicy=numa
\
 +- nvme2 pcie 0584:28:00.0 live
 +- nvme0 pcie 0524:28:00.0 live

# nvme show-topology

Failed to open ns nvme0n3, errno 2 <== error is printed first followed by output

nvme-subsys0 - NQN=nqn.2019-10.com.kioxia:KCM7DRUG1T92:3D60A04906N1
               hostnqn=nqn.2014-08.org.nvmexpress:uuid:41528538-e8ad-4eaf-84a7-9c552917d988
               iopolicy=numa
\
 +- ns 1
 \
  +- nvme0 pcie 0524:28:00.0 live optimized

From the above output it's evident that nvme-cli attempts to open the disk node /dev/nvme0n3 
however that entry doesn't exist. Apparently, on 6.9-rc1 kernel though head disk node /dev/nvme0n3
doesn't exist, the relevant entries /sys/block/nvme0c0n3 and /sys/block/nvme0n3 are present. 

As I understand, typically the nvme-cli command builds the nvme subsystem topology first before 
printing the output. Here in this case, nvme-cli could find nvme0c0n3 and nvme0n3 under 
/sys/block and so it assumes that a corresponding disk node entry /dev/nvme0n3
should be present; however, when nvme-cli attempts to open /dev/nvme0n3 it fails, causing the 
observed symptom. 

Git bisect:
===========
The git bisect points to the below commit:

commit 46e7422cda8482aa3074c9caf4c224cf2fb74d71 (HEAD)
Author: Christoph Hellwig <hch@lst.de>
Date:   Mon Mar 4 07:04:54 2024 -0700

    nvme: move common logic into nvme_update_ns_info
    
    nvme_update_ns_info_generic and nvme_update_ns_info_block share a
    fair amount of logic related to not fully supported namespace
    formats and updating the multipath information.  Move this logic
    into the common caller.
    
    Signed-off-by: Christoph Hellwig <hch@lst.de>
    Signed-off-by: Keith Busch <kbusch@kernel.org>


In 6.9-rc1, it seems that with the above code restructuring, we would now hide the head disk 
node nvmeXnY showing up under /dev, however the relevant disk names nvmeXcYnZ and nvmeXnY do 
exist under /sys/block/. On 6.8 kernel, we don't create any disk node under /dev, nor
the corresponding disk folders under /sys/block, if the disk capacity is zero. 

Thanks,
--Nilay






^ permalink raw reply	[relevance 3%]

* Re: [PATCH 2/2] nvme: cancel the queue limit update when nvme_update_zone_info fails
  @ 2024-03-27 18:01  5%   ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-03-27 18:01 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: axboe, sagi, linux-nvme, linux-block, Kanchan Joshi

On Wed, Mar 27, 2024 at 06:21:45PM +0100, Christoph Hellwig wrote:
> @@ -2115,6 +2115,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
>  	    ns->head->ids.csi == NVME_CSI_ZNS) {
>  		ret = nvme_update_zone_info(ns, lbaf, &lim);
>  		if (ret) {
> +			queue_limits_cancel_update(ns->disk->queue);

Could you instead move nvme_update_zone_info() outside the
queue_limits_start_update()? That way we wouldn't need to "cancel" the
update. You'd just need to save a copy of "mor" and "mar" in the ns
instead of writing these directly to the queue_limits.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH] nvme: initialize identify ns data to NULL
  @ 2024-03-26 15:37  5% ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-03-26 15:37 UTC (permalink / raw)
  To: Tokunori Ikegami; +Cc: linux-nvme

On Tue, Mar 26, 2024 at 12:45:03AM +0900, Tokunori Ikegami wrote:
>  static int ns_head_update_nuse(struct nvme_ns_head *head)
>  {
> -	struct nvme_id_ns *id;
> +	struct nvme_id_ns *id = NULL;
>  	struct nvme_ns *ns;
>  	int srcu_idx, ret = -EWOULDBLOCK;

This is a redundant setting. The first thing that happens to "id" is
reference passed to nvme_identify_ns, and the first thing it does is
this:

	*id = kmalloc(sizeof(**id), GFP_KERNEL);

So either kmalloc succeeds and overwrites your NULL setting, or malloc
fails and sets it to NULL again.


^ permalink raw reply	[relevance 5%]

* [PATCH v6 10/10] nvme: Atomic write support
    2024-03-26 13:38  3% ` [PATCH v6 01/10] block: Pass blk_queue_get_max_sectors() a request pointer John Garry
  2024-03-26 13:38  3% ` [PATCH v6 02/10] block: Call blkdev_dio_unaligned() from blkdev_direct_IO() John Garry
@ 2024-03-26 13:38  2% ` John Garry
  2 siblings, 0 replies; 200+ results
From: John Garry @ 2024-03-26 13:38 UTC (permalink / raw)
  To: axboe, kbusch, hch, sagi, jejb, martin.petersen, djwong, viro,
	brauner, dchinner, jack
  Cc: linux-block, linux-kernel, linux-nvme, linux-fsdevel, tytso,
	jbongio, linux-scsi, ojaswin, linux-aio, linux-btrfs, io-uring,
	nilay, ritesh.list, willy, Alan Adamson, John Garry

From: Alan Adamson <alan.adamson@oracle.com>

Add support to set block layer request_queue atomic write limits. The
limits will be derived from either the namespace or controller atomic
parameters.

NVMe atomic-related parameters are grouped into "normal" and "power-fail"
(or PF) class of parameter. For atomic write support, only PF parameters
are of interest. The "normal" parameters are concerned with racing reads
and writes (which also applies to PF). See NVM Command Set Specification
Revision 1.0d section 2.1.4 for reference.

Whether to use per namespace or controller atomic parameters is decided by
NSFEAT bit 1 - see Figure 97: Identify – Identify Namespace Data
Structure, NVM Command Set.

NVMe namespaces may define an atomic boundary, whereby no atomic guarantees
are provided for a write which straddles this per-lba space boundary. The
block layer merging policy is such that no merges may occur in which the
resultant request would straddle such a boundary.

Unlike SCSI, NVMe specifies no granularity or alignment rules, apart from
atomic boundary rule. In addition, again unlike SCSI, there is no
dedicated atomic write command - a write which adheres to the atomic size
limit and boundary is implicitly atomic.

If NSFEAT bit 1 is set, the following parameters are of interest:
- NAWUPF (Namespace Atomic Write Unit Power Fail)
- NABSPF (Namespace Atomic Boundary Size Power Fail)
- NABO (Namespace Atomic Boundary Offset)

and we set request_queue limits as follows:
- atomic_write_unit_max = rounddown_pow_of_two(NAWUPF)
- atomic_write_max_bytes = NAWUPF
- atomic_write_boundary = NABSPF

If in the unlikely scenario that NABO is non-zero, then atomic writes will
not be supported at all as dealing with this adds extra complexity. This
policy may change in future.

In all cases, atomic_write_unit_min is set to the logical block size.

If NSFEAT bit 1 is unset, the following parameter is of interest:
- AWUPF (Atomic Write Unit Power Fail)

and we set request_queue limits as follows:
- atomic_write_unit_max = rounddown_pow_of_two(AWUPF)
- atomic_write_max_bytes = AWUPF
- atomic_write_boundary = 0

A new function, nvme_valid_atomic_write(), is also called from submission
path to verify that a request has been submitted to the driver will
actually be executed atomically. As mentioned, there is no dedicated NVMe
atomic write command (which may error for a command which exceeds the
controller atomic write limits).

Note on NABSPF:
There seems to be some vagueness in the spec as to whether NABSPF applies
for NSFEAT bit 1 being unset. Figure 97 does not explicitly mention NABSPF
and how it is affected by bit 1. However Figure 4 does tell to check Figure
97 for info about per-namespace parameters, which NABSPF is, so it is
implied. However currently nvme_update_disk_info() does check namespace
parameter NABO regardless of this bit.

Signed-off-by: Alan Adamson <alan.adamson@oracle.com>
Reviewed-by: Keith Busch <kbusch@kernel.org>
jpg: total rewrite
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
 drivers/nvme/host/core.c | 49 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 943d72bdd794..7d3247be5cb9 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -943,6 +943,30 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
 	return BLK_STS_OK;
 }
 
+static bool nvme_valid_atomic_write(struct request *req)
+{
+	struct request_queue *q = req->q;
+	u32 boundary_bytes = queue_atomic_write_boundary_bytes(q);
+
+	if (blk_rq_bytes(req) > queue_atomic_write_unit_max_bytes(q))
+		return false;
+
+	if (boundary_bytes) {
+		u64 mask = boundary_bytes - 1, imask = ~mask;
+		u64 start = blk_rq_pos(req) << SECTOR_SHIFT;
+		u64 end = start + blk_rq_bytes(req) - 1;
+
+		/* If greater then must be crossing a boundary */
+		if (blk_rq_bytes(req) > boundary_bytes)
+			return false;
+
+		if ((start & imask) != (end & imask))
+			return false;
+	}
+
+	return true;
+}
+
 static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
 		struct request *req, struct nvme_command *cmnd,
 		enum nvme_opcode op)
@@ -957,6 +981,12 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns,
 
 	if (req->cmd_flags & REQ_RAHEAD)
 		dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH;
+	/*
+	 * Ensure that nothing has been sent which cannot be executed
+	 * atomically.
+	 */
+	if (req->cmd_flags & REQ_ATOMIC && !nvme_valid_atomic_write(req))
+		return BLK_STS_INVAL;
 
 	cmnd->rw.opcode = op;
 	cmnd->rw.flags = 0;
@@ -1937,6 +1967,23 @@ static void nvme_configure_metadata(struct nvme_ctrl *ctrl,
 	}
 }
 
+
+static void nvme_update_atomic_write_disk_info(struct nvme_ns *ns,
+			struct nvme_id_ns *id, struct queue_limits *lim,
+			u32 bs, u32 atomic_bs)
+{
+	unsigned int boundary = 0;
+
+	if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf) {
+		if (le16_to_cpu(id->nabspf))
+			boundary = (le16_to_cpu(id->nabspf) + 1) * bs;
+	}
+	lim->atomic_write_hw_max = atomic_bs;
+	lim->atomic_write_hw_boundary = boundary;
+	lim->atomic_write_hw_unit_min = bs;
+	lim->atomic_write_hw_unit_max = rounddown_pow_of_two(atomic_bs);
+}
+
 static u32 nvme_max_drv_segments(struct nvme_ctrl *ctrl)
 {
 	return ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> SECTOR_SHIFT) + 1;
@@ -1983,6 +2030,8 @@ static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id,
 			atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
 		else
 			atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs;
+
+		nvme_update_atomic_write_disk_info(ns, id, lim, bs, atomic_bs);
 	}
 
 	if (id->nsfeat & NVME_NS_FEAT_IO_OPT) {
-- 
2.31.1



^ permalink raw reply related	[relevance 2%]

* [PATCH v6 02/10] block: Call blkdev_dio_unaligned() from blkdev_direct_IO()
    2024-03-26 13:38  3% ` [PATCH v6 01/10] block: Pass blk_queue_get_max_sectors() a request pointer John Garry
@ 2024-03-26 13:38  3% ` John Garry
  2024-04-10 22:53  0%   ` Luis Chamberlain
  2024-03-26 13:38  2% ` [PATCH v6 10/10] nvme: Atomic write support John Garry
  2 siblings, 1 reply; 200+ results
From: John Garry @ 2024-03-26 13:38 UTC (permalink / raw)
  To: axboe, kbusch, hch, sagi, jejb, martin.petersen, djwong, viro,
	brauner, dchinner, jack
  Cc: linux-block, linux-kernel, linux-nvme, linux-fsdevel, tytso,
	jbongio, linux-scsi, ojaswin, linux-aio, linux-btrfs, io-uring,
	nilay, ritesh.list, willy, John Garry

blkdev_dio_unaligned() is called from __blkdev_direct_IO(),
__blkdev_direct_IO_simple(), and __blkdev_direct_IO_async(), and all these
are only called from blkdev_direct_IO().

Move the blkdev_dio_unaligned() call to the common callsite,
blkdev_direct_IO().

Pass those functions the bdev pointer from blkdev_direct_IO() as it is non-
trivial to calculate.

Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
 block/fops.c | 29 ++++++++++++-----------------
 1 file changed, 12 insertions(+), 17 deletions(-)

diff --git a/block/fops.c b/block/fops.c
index 679d9b752fe8..c091ea43bca3 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -44,18 +44,15 @@ static bool blkdev_dio_unaligned(struct block_device *bdev, loff_t pos,
 #define DIO_INLINE_BIO_VECS 4
 
 static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
-		struct iov_iter *iter, unsigned int nr_pages)
+		struct iov_iter *iter, struct block_device *bdev,
+		unsigned int nr_pages)
 {
-	struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
 	struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs;
 	loff_t pos = iocb->ki_pos;
 	bool should_dirty = false;
 	struct bio bio;
 	ssize_t ret;
 
-	if (blkdev_dio_unaligned(bdev, pos, iter))
-		return -EINVAL;
-
 	if (nr_pages <= DIO_INLINE_BIO_VECS)
 		vecs = inline_vecs;
 	else {
@@ -161,9 +158,8 @@ static void blkdev_bio_end_io(struct bio *bio)
 }
 
 static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-		unsigned int nr_pages)
+		struct block_device *bdev, unsigned int nr_pages)
 {
-	struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
 	struct blk_plug plug;
 	struct blkdev_dio *dio;
 	struct bio *bio;
@@ -172,9 +168,6 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	loff_t pos = iocb->ki_pos;
 	int ret = 0;
 
-	if (blkdev_dio_unaligned(bdev, pos, iter))
-		return -EINVAL;
-
 	if (iocb->ki_flags & IOCB_ALLOC_CACHE)
 		opf |= REQ_ALLOC_CACHE;
 	bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL,
@@ -302,9 +295,9 @@ static void blkdev_bio_end_io_async(struct bio *bio)
 
 static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
 					struct iov_iter *iter,
+					struct block_device *bdev,
 					unsigned int nr_pages)
 {
-	struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
 	bool is_read = iov_iter_rw(iter) == READ;
 	blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
 	struct blkdev_dio *dio;
@@ -312,9 +305,6 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
 	loff_t pos = iocb->ki_pos;
 	int ret = 0;
 
-	if (blkdev_dio_unaligned(bdev, pos, iter))
-		return -EINVAL;
-
 	if (iocb->ki_flags & IOCB_ALLOC_CACHE)
 		opf |= REQ_ALLOC_CACHE;
 	bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL,
@@ -368,18 +358,23 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
 
 static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
+	struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
 	unsigned int nr_pages;
 
 	if (!iov_iter_count(iter))
 		return 0;
 
+	if (blkdev_dio_unaligned(bdev, iocb->ki_pos, iter))
+		return -EINVAL;
+
 	nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
 	if (likely(nr_pages <= BIO_MAX_VECS)) {
 		if (is_sync_kiocb(iocb))
-			return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
-		return __blkdev_direct_IO_async(iocb, iter, nr_pages);
+			return __blkdev_direct_IO_simple(iocb, iter, bdev,
+							nr_pages);
+		return __blkdev_direct_IO_async(iocb, iter, bdev, nr_pages);
 	}
-	return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
+	return __blkdev_direct_IO(iocb, iter, bdev, bio_max_segs(nr_pages));
 }
 
 static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
-- 
2.31.1



^ permalink raw reply related	[relevance 3%]

* [PATCH v6 01/10] block: Pass blk_queue_get_max_sectors() a request pointer
  @ 2024-03-26 13:38  3% ` John Garry
  2024-04-10 22:58  0%   ` Luis Chamberlain
  2024-03-26 13:38  3% ` [PATCH v6 02/10] block: Call blkdev_dio_unaligned() from blkdev_direct_IO() John Garry
  2024-03-26 13:38  2% ` [PATCH v6 10/10] nvme: Atomic write support John Garry
  2 siblings, 1 reply; 200+ results
From: John Garry @ 2024-03-26 13:38 UTC (permalink / raw)
  To: axboe, kbusch, hch, sagi, jejb, martin.petersen, djwong, viro,
	brauner, dchinner, jack
  Cc: linux-block, linux-kernel, linux-nvme, linux-fsdevel, tytso,
	jbongio, linux-scsi, ojaswin, linux-aio, linux-btrfs, io-uring,
	nilay, ritesh.list, willy, John Garry

Currently blk_queue_get_max_sectors() is passed a enum req_op. In future
the value returned from blk_queue_get_max_sectors() may depend on certain
request flags, so pass a request pointer.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
 block/blk-merge.c | 3 ++-
 block/blk-mq.c    | 2 +-
 block/blk.h       | 6 ++++--
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 2a06fd33039d..6f9d9ca7922b 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -592,7 +592,8 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq,
 	if (blk_rq_is_passthrough(rq))
 		return q->limits.max_hw_sectors;
 
-	max_sectors = blk_queue_get_max_sectors(q, req_op(rq));
+	max_sectors = blk_queue_get_max_sectors(rq);
+
 	if (!q->limits.chunk_sectors ||
 	    req_op(rq) == REQ_OP_DISCARD ||
 	    req_op(rq) == REQ_OP_SECURE_ERASE)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 555ada922cf0..5428ca17add8 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3042,7 +3042,7 @@ void blk_mq_submit_bio(struct bio *bio)
 blk_status_t blk_insert_cloned_request(struct request *rq)
 {
 	struct request_queue *q = rq->q;
-	unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq));
+	unsigned int max_sectors = blk_queue_get_max_sectors(rq);
 	unsigned int max_segments = blk_rq_get_max_segments(rq);
 	blk_status_t ret;
 
diff --git a/block/blk.h b/block/blk.h
index 5cac4e29ae17..dc2fa6f88adc 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -180,9 +180,11 @@ static inline unsigned int blk_rq_get_max_segments(struct request *rq)
 	return queue_max_segments(rq->q);
 }
 
-static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
-						     enum req_op op)
+static inline unsigned int blk_queue_get_max_sectors(struct request *rq)
 {
+	struct request_queue *q = rq->q;
+	enum req_op op = req_op(rq);
+
 	if (unlikely(op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE))
 		return min(q->limits.max_discard_sectors,
 			   UINT_MAX >> SECTOR_SHIFT);
-- 
2.31.1



^ permalink raw reply related	[relevance 3%]

* [PATCH AUTOSEL 5.4 3/3] nvme: clear caller pointer on identify failure
       [not found]     <20240324170735.546735-1-sashal@kernel.org>
@ 2024-03-24 17:07  5% ` Sasha Levin
  0 siblings, 0 replies; 200+ results
From: Sasha Levin @ 2024-03-24 17:07 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Keith Busch, Christoph Hellwig, Sasha Levin, sagi, linux-nvme

From: Keith Busch <kbusch@kernel.org>

[ Upstream commit 7e80eb792bd7377a20f204943ac31c77d859be89 ]

The memory allocated for the identification is freed on failure. Set
it to NULL so the caller doesn't have a pointer to that freed address.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/nvme/host/core.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 9144ed14b0741..0676637e1eab6 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1080,8 +1080,10 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
 
 	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
 			sizeof(struct nvme_id_ctrl));
-	if (error)
+	if (error) {
 		kfree(*id);
+		*id = NULL;
+	}
 	return error;
 }
 
@@ -1193,6 +1195,7 @@ static int nvme_identify_ns(struct nvme_ctrl *ctrl,
 	if (error) {
 		dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error);
 		kfree(*id);
+		*id = NULL;
 	}
 
 	return error;
-- 
2.43.0



^ permalink raw reply related	[relevance 5%]

* [PATCH AUTOSEL 6.1 7/7] nvme: clear caller pointer on identify failure
       [not found]     <20240324170709.546465-1-sashal@kernel.org>
@ 2024-03-24 17:07  5% ` Sasha Levin
  0 siblings, 0 replies; 200+ results
From: Sasha Levin @ 2024-03-24 17:07 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Keith Busch, Christoph Hellwig, Sasha Levin, sagi, linux-nvme

From: Keith Busch <kbusch@kernel.org>

[ Upstream commit 7e80eb792bd7377a20f204943ac31c77d859be89 ]

The memory allocated for the identification is freed on failure. Set
it to NULL so the caller doesn't have a pointer to that freed address.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/nvme/host/core.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 0c088db944706..20c79cc67ce54 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1363,8 +1363,10 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
 
 	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
 			sizeof(struct nvme_id_ctrl));
-	if (error)
+	if (error) {
 		kfree(*id);
+		*id = NULL;
+	}
 	return error;
 }
 
@@ -1493,6 +1495,7 @@ static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
 	if (error) {
 		dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error);
 		kfree(*id);
+		*id = NULL;
 	}
 	return error;
 }
-- 
2.43.0



^ permalink raw reply related	[relevance 5%]

* [PATCH AUTOSEL 6.6 11/11] nvme: clear caller pointer on identify failure
       [not found]     <20240324170645.546220-1-sashal@kernel.org>
@ 2024-03-24 17:06  5% ` Sasha Levin
  0 siblings, 0 replies; 200+ results
From: Sasha Levin @ 2024-03-24 17:06 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Keith Busch, Christoph Hellwig, Sasha Levin, sagi, linux-nvme

From: Keith Busch <kbusch@kernel.org>

[ Upstream commit 7e80eb792bd7377a20f204943ac31c77d859be89 ]

The memory allocated for the identification is freed on failure. Set
it to NULL so the caller doesn't have a pointer to that freed address.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/nvme/host/core.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index d4564a2517eb5..63d9ec076792a 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1333,8 +1333,10 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
 
 	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
 			sizeof(struct nvme_id_ctrl));
-	if (error)
+	if (error) {
 		kfree(*id);
+		*id = NULL;
+	}
 	return error;
 }
 
@@ -1463,6 +1465,7 @@ static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
 	if (error) {
 		dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error);
 		kfree(*id);
+		*id = NULL;
 	}
 	return error;
 }
-- 
2.43.0



^ permalink raw reply related	[relevance 5%]

* [PATCH AUTOSEL 6.7 11/11] nvme: clear caller pointer on identify failure
       [not found]     <20240324170619.545975-1-sashal@kernel.org>
@ 2024-03-24 17:06  5% ` Sasha Levin
  0 siblings, 0 replies; 200+ results
From: Sasha Levin @ 2024-03-24 17:06 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Keith Busch, Christoph Hellwig, Sasha Levin, sagi, linux-nvme

From: Keith Busch <kbusch@kernel.org>

[ Upstream commit 7e80eb792bd7377a20f204943ac31c77d859be89 ]

The memory allocated for the identification is freed on failure. Set
it to NULL so the caller doesn't have a pointer to that freed address.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/nvme/host/core.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 86149275ccb8e..ab90b8a118351 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1341,8 +1341,10 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
 
 	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
 			sizeof(struct nvme_id_ctrl));
-	if (error)
+	if (error) {
 		kfree(*id);
+		*id = NULL;
+	}
 	return error;
 }
 
@@ -1471,6 +1473,7 @@ static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
 	if (error) {
 		dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error);
 		kfree(*id);
+		*id = NULL;
 	}
 	return error;
 }
-- 
2.43.0



^ permalink raw reply related	[relevance 5%]

* [PATCH AUTOSEL 6.8 11/11] nvme: clear caller pointer on identify failure
       [not found]     <20240324170552.545730-1-sashal@kernel.org>
@ 2024-03-24 17:05  5% ` Sasha Levin
  0 siblings, 0 replies; 200+ results
From: Sasha Levin @ 2024-03-24 17:05 UTC (permalink / raw)
  To: linux-kernel, stable
  Cc: Keith Busch, Christoph Hellwig, Sasha Levin, sagi, linux-nvme

From: Keith Busch <kbusch@kernel.org>

[ Upstream commit 7e80eb792bd7377a20f204943ac31c77d859be89 ]

The memory allocated for the identification is freed on failure. Set
it to NULL so the caller doesn't have a pointer to that freed address.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 drivers/nvme/host/core.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 0a96362912ced..39ee3036bd516 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1398,8 +1398,10 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
 
 	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
 			sizeof(struct nvme_id_ctrl));
-	if (error)
+	if (error) {
 		kfree(*id);
+		*id = NULL;
+	}
 	return error;
 }
 
@@ -1528,6 +1530,7 @@ int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
 	if (error) {
 		dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error);
 		kfree(*id);
+		*id = NULL;
 	}
 	return error;
 }
-- 
2.43.0



^ permalink raw reply related	[relevance 5%]

* Re: [PATCH] nvme-multipath: don't inherit LBA-related fields for the multipath node
  @ 2024-03-22 16:22  5% ` Keith Busch
    0 siblings, 1 reply; 200+ results
From: Keith Busch @ 2024-03-22 16:22 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: sagi, linux-nvme, Nilay Shroff

On Fri, Mar 22, 2024 at 07:08:19AM +1000, Christoph Hellwig wrote:
> Linux 6.9 made the nvme multipath nodes not properly pick up changes when
> the LBA size goes smaller after an nvme format.  This is because we now
> try to inherit the queue settings for the multipath node entirely from
> the individual paths.  That is the right thing to do for I/O size
> limitations, which make up most of the queue limits, but it is wrong for
> changes to the namespace configuration, where we do want to pick up the
> new format, which will eventually show up on all paths once they are
> re-queried.
> 
> Fix this by not inheriting the block size and related fields and always
> for updating them.
> 
> Fixes: 8f03cfa117e0 ("nvme: don't use nvme_update_disk_info for the multipath disk")

Applied to nvme-6.9. This just missed yesterday's pull, but at least
it's queued up for the next one.


^ permalink raw reply	[relevance 5%]

* [GIT PULL] nvme updates for Linux 6.9
@ 2024-03-21 18:15  4% Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-03-21 18:15 UTC (permalink / raw)
  To: axboe; +Cc: hch, sagi, linux-nvme

The following changes since commit 0dc31b98d7200a0046de5c760feb0aaff6c4b53c:

  cdrom: gdrom: Convert to platform remove callback returning void (2024-03-07 11:53:30 -0700)

are available in the Git repository at:

  git://git.infradead.org/nvme.git tags/nvme-6.9-2024-03-21

for you to fetch changes up to 910934da9444dbb102294796481ab05e4419d311:

  nvmet-rdma: remove NVMET_RDMA_REQ_INVALIDATE_RKEY flag (2024-03-21 10:46:53 -0700)

----------------------------------------------------------------
nvme updates for Linux 6.9

 - Make an informative message less ominous (Keith)
 - Enhanced trace decoding (Guixin)
 - TCP updates (Hannes, Li)
 - Fabrics connect deadlock fix (Chunguang)
 - Platform API migration update (Uwe)
 - A new device quirk (Jiawei)

----------------------------------------------------------------
Chunguang Xu (1):
      nvme: fix reconnection fail due to reserved tag allocation

Guixin Liu (8):
      nvmet: add tracing of authentication commands
      nvmet: add tracing of zns commands
      nvme: use nvme_disk_is_ns_head helper
      nvme: parse zns command's zsa and zrasf to string
      nvme: add tracing of reservation commands
      nvme: parse format command's lbafu when tracing
      nvme: remove redundant BUILD_BUG_ON check
      nvmet-rdma: remove NVMET_RDMA_REQ_INVALIDATE_RKEY flag

Hannes Reinecke (1):
      nvmet-tcp: do not continue for invalid icreq

Jiawei Fu (iBug) (1):
      drivers/nvme: Add quirks for device 126f:2262

Keith Busch (1):
      nvme: change shutdown timeout setting message

Li Feng (2):
      nvme-tcp: Export the nvme_tcp_wq to sysfs
      nvme/tcp: Add wq_unbound modparam for nvme_tcp_wq

Uwe Kleine-König (1):
      nvme-apple: Convert to platform remove callback returning void

 drivers/nvme/host/apple.c   |   6 +--
 drivers/nvme/host/core.c    |  11 +++--
 drivers/nvme/host/fabrics.h |   7 ---
 drivers/nvme/host/pci.c     |   3 ++
 drivers/nvme/host/pr.c      |   3 +-
 drivers/nvme/host/sysfs.c   |   3 +-
 drivers/nvme/host/tcp.c     |  21 +++++++--
 drivers/nvme/host/trace.c   | 105 ++++++++++++++++++++++++++++++++++++++++++--
 drivers/nvme/target/rdma.c  |   8 ++--
 drivers/nvme/target/tcp.c   |   1 +
 drivers/nvme/target/trace.c |  98 +++++++++++++++++++++++++++++++++++++++++
 11 files changed, 233 insertions(+), 33 deletions(-)



^ permalink raw reply	[relevance 4%]

* Re: [PATCH v2 1/2] nvme-tcp: Export the nvme_tcp_wq to sysfs
    @ 2024-03-18 20:41  5% ` Keith Busch
  1 sibling, 0 replies; 200+ results
From: Keith Busch @ 2024-03-18 20:41 UTC (permalink / raw)
  To: Li Feng
  Cc: Jens Axboe, Christoph Hellwig, Sagi Grimberg,
	open list:NVM EXPRESS DRIVER, open list, Anton.Gavriliuk

On Wed, Mar 13, 2024 at 08:38:09PM +0800, Li Feng wrote:
> Make the workqueue userspace visible for easy viewing and configuration.

Thanks, applied to nvme-6.9.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH] drivers/nvme: Add quirks for device 126f:2262
  @ 2024-03-18 20:38  5% ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-03-18 20:38 UTC (permalink / raw)
  To: iBug; +Cc: axboe, hch, sagi, linux-nvme, linux-kernel

On Sat, Mar 16, 2024 at 03:27:49AM +0800, iBug wrote:
> From: "Jiawei Fu (iBug)" <i@ibugone.com>
> 
> This commit adds NVME_QUIRK_NO_DEEPEST_PS and NVME_QUIRK_BOGUS_NID for
> device [126f:2262], which appears to be a generic VID:PID pair used for
> many SSDs based on the Silicon Motion SM2262/SM2262EN controller.

Thanks, applied to nvme-6.9.


^ permalink raw reply	[relevance 5%]

* [Bug Report] nvme-cli fails re-formatting NVMe namespace
@ 2024-03-15 14:31  3% Nilay Shroff
  0 siblings, 0 replies; 200+ results
From: Nilay Shroff @ 2024-03-15 14:31 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Keith Busch, axboe, linux-block, linux-nvme, Gregory Joyce

Hi,

We found that "nvme format ..." command fails to format nvme disk with block-size set to 512.

Notes and observations:
====================== 
This is observed on the latest linus kernel tree. This was working well on kernel v6.8.

Test details:
=============
At system boot or when nvme is hot plugin, the nvme block size is 4096 and later if we try format
it with the block-size of 512 (lbaf=2) then it fails. Interestingly, if we start with the nvme block
size of 512 and later if we try format it with block-size of 4096 (lbaf=0) then it doesn't fail. 
Please note that CONFIG_NVME_MULTIPATH is enabled.
 
Please find below further details:

# lspci 
0018:01:00.0 Non-Volatile memory controller: Samsung Electronics Co Ltd NVMe SSD Controller PM173X

# nvme list 
Node                  Generic               SN                   Model                                    Namespace  Usage                      Format           FW Rev  
--------------------- --------------------- -------------------- ---------------------------------------- ---------- -------------------------- ---------------- --------
/dev/nvme0n1          /dev/ng0n1            S6EUNA0R500358       1.6TB NVMe Gen4 U.2 SSD                  0x1          1.60  TB /   1.60  TB    512   B +  0 B   REV.SN49

# nvme id-ns /dev/nvme0n1 -H 
NVME Identify Namespace 1:
nsze    : 0xba4d4ab0
ncap    : 0xba4d4ab0
nuse    : 0xba4d4ab0

<snip>
<snip>

nlbaf   : 4
flbas   : 0
  [6:5] : 0	Most significant 2 bits of Current LBA Format Selected
  [4:4] : 0	Metadata Transferred in Separate Contiguous Buffer
  [3:0] : 0	Least significant 4 bits of Current LBA Format Selected
  
<snip>
<snip>  

LBA Format  0 : Metadata Size: 0   bytes - Data Size: 4096 bytes - Relative Performance: 0 Best (in use)
LBA Format  1 : Metadata Size: 8   bytes - Data Size: 4096 bytes - Relative Performance: 0x2 Good 
LBA Format  2 : Metadata Size: 0   bytes - Data Size: 512 bytes - Relative Performance: 0x1 Better 
LBA Format  3 : Metadata Size: 8   bytes - Data Size: 512 bytes - Relative Performance: 0x3 Degraded 
LBA Format  4 : Metadata Size: 64  bytes - Data Size: 4096 bytes - Relative Performance: 0x3 Degraded 

# lsblk -t /dev/nvme0n1 
NAME    ALIGNMENT MIN-IO OPT-IO PHY-SEC LOG-SEC ROTA SCHED RQ-SIZE  RA WSAME
nvme0n1         0   4096      0    4096    4096    0               128    0B
                                   ^^^     ^^^ 	

!!!! FAILING TO FORMAT with 512 bytes of block size !!!!

# nvme format /dev/nvme0n1 --lbaf=2 --pil=0 --ms=0 --pi=0 -f 
Success formatting namespace:1
failed to set block size to 512
^^^

# lsblk -t /dev/nvme0n1 
NAME    ALIGNMENT MIN-IO OPT-IO PHY-SEC LOG-SEC ROTA SCHED RQ-SIZE  RA WSAME
nvme0n1         0   4096      0    4096    4096    0               128    0B
                                   ^^^     ^^^
# cat /sys/block/nvme0n1/queue/logical_block_size:4096
# cat /sys/block/nvme0n1/queue/physical_block_size:4096

# cat /sys/block/nvme0c0n1/queue/logical_block_size:512
# cat /sys/block/nvme0c0n1/queue/physical_block_size:512


# nvme id-ns /dev/nvme0n1 -H 
NVME Identify Namespace 1:
nsze    : 0xba4d4ab0
ncap    : 0xba4d4ab0
nuse    : 0xba4d4ab0
<snip>
<snip>
nlbaf   : 4
flbas   : 0x2
  [6:5] : 0	Most significant 2 bits of Current LBA Format Selected
  [4:4] : 0	Metadata Transferred in Separate Contiguous Buffer
  [3:0] : 0x2	Least significant 4 bits of Current LBA Format Selected
<snip>
<snip>

LBA Format  0 : Metadata Size: 0   bytes - Data Size: 4096 bytes - Relative Performance: 0 Best 
LBA Format  1 : Metadata Size: 8   bytes - Data Size: 4096 bytes - Relative Performance: 0x2 Good 
LBA Format  2 : Metadata Size: 0   bytes - Data Size: 512 bytes - Relative Performance: 0x1 Better (in use)
LBA Format  3 : Metadata Size: 8   bytes - Data Size: 512 bytes - Relative Performance: 0x3 Degraded 
LBA Format  4 : Metadata Size: 64  bytes - Data Size: 4096 bytes - Relative Performance: 0x3 Degraded 


Note: We can see above that the NVMe namespace is indeed formatted with lbaf 2 (block size 512). However,
the block queue limits are not correctly updated.

Git bisect:
==========
Git bisect reveals the following commit as bad commit:

8f03cfa117e06bd2d3ba7ed8bba70a3dda310cae is the first bad commit
commit 8f03cfa117e06bd2d3ba7ed8bba70a3dda310cae
Author: Christoph Hellwig <hch@lst.de>
Date:   Mon Mar 4 07:04:51 2024 -0700

    nvme: don't use nvme_update_disk_info for the multipath disk
    
    Currently nvme_update_ns_info_block calls nvme_update_disk_info both for
    the namespace attached disk, and the multipath one (if it exists).  This
    is very different from how other stacking drivers work, and leads to
    a lot of complexity.
    
    Switch to setting the disk capacity and initializing the integrity
    profile, and let blk_stack_limits which already is called just below
    deal with updating the other limits.
    
    Signed-off-by: Christoph Hellwig <hch@lst.de>
    Signed-off-by: Keith Busch <kbusch@kernel.org>

 drivers/nvme/host/core.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)


The above commit is part of the new atomic queue limit updates patch series. For 
NVMe device if multipath config is enabled then we rely on blk_stack_limits to 
update the queue limits for the stacked device. For updating the logical/physical
queue limit of the top (nvme%dn%d) device, the blk_stack_limits() uses the max of 
top and bottom limit:

	t->logical_block_size = max(t->logical_block_size,
				    b->logical_block_size);

	t->physical_block_size = max(t->physical_block_size,
				     b->physical_block_size);

When we try formatting the nvme disk with block-size of 512, the value of 
t->logical_block_size would be 4096 (as this is the initial block-size) however the
value of b->logical_block_size would be 512 (the block size of the bottom device is first 
updated in nvme_update_ns_info_block()).

I think we may want to update the queue limits of both top and bottom devices in the
nvme_update_ns_info_block(). Or if there's some other way?

Let me know if you need any further information.

Thanks,
--Nilay








^ permalink raw reply	[relevance 3%]

* Re: [PATCH] nvme: parse zns command's zsa and zrasf to string
  @ 2024-03-14 18:43  5% ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-03-14 18:43 UTC (permalink / raw)
  To: Guixin Liu; +Cc: axboe, hch, sagi, linux-nvme

On Tue, Mar 12, 2024 at 03:52:43PM +0800, Guixin Liu wrote:
> Parse zone mgmt send command's zsa and receive command's
> zrasf to string to make the trace log more human-readable.
> 
> Signed-off-by: Guixin Liu <kanie@linux.alibaba.com>

Applied to nvme-6.9, thanks.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH v2] nvme: use nvme_disk_is_ns_head helper
  @ 2024-03-14 18:43  5% ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-03-14 18:43 UTC (permalink / raw)
  To: Guixin Liu; +Cc: axboe, hch, sagi, linux-nvme

On Wed, Mar 13, 2024 at 10:29:05AM +0800, Guixin Liu wrote:
> Use nvme_disk_is_ns_head helper instead of check fops directly,
> and also drop CONFIG_NVME_MULTIPATH check.

Applied to nvme-6.9, thanks.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH v4] nvme: fix reconnection fail due to reserved tag allocation
  @ 2024-03-14 18:42  5% ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-03-14 18:42 UTC (permalink / raw)
  To: brookxu.cn; +Cc: axboe, hch, sagi, kch, linux-nvme, linux-kernel

I've replaced your v3 with this one in nvme-6.9. Thanks.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH v2 0/3] *** nvme: add some commands tracing ***
  @ 2024-03-14 18:41  5%   ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-03-14 18:41 UTC (permalink / raw)
  To: Guixin Liu; +Cc: axboe, hch, sagi, linux-nvme

On Wed, Mar 13, 2024 at 11:51:49AM +0800, Guixin Liu wrote:
> Hi Keith,
> 
> This series of patches has not been combined yet, could you please
> 
> take a look?

Added to nvme-6.9, thanks.


^ permalink raw reply	[relevance 5%]

* RE: [PATCH v2 2/2] nvme/tcp: Add wq_unbound modparam for nvme_tcp_wq
  @ 2024-03-13 21:07  4%   ` Anton Gavriliuk
  0 siblings, 0 replies; 200+ results
From: Anton Gavriliuk @ 2024-03-13 21:07 UTC (permalink / raw)
  To: Li Feng, Keith Busch, Jens Axboe, Christoph Hellwig,
	Sagi Grimberg, open list:NVM EXPRESS DRIVER, open list

Thanks, it works.

Will it be added by default to the 6.9 mainline ?

Anton

-----Original Message-----
From: Li Feng <fengli@smartx.com>
Sent: Wednesday, March 13, 2024 2:38 PM
To: Keith Busch <kbusch@kernel.org>; Jens Axboe <axboe@kernel.dk>; Christoph Hellwig <hch@lst.de>; Sagi Grimberg <sagi@grimberg.me>; open list:NVM EXPRESS DRIVER <linux-nvme@lists.infradead.org>; open list <linux-kernel@vger.kernel.org>
Cc: Anton Gavriliuk <Anton.Gavriliuk@hpe.ua>; Li Feng <fengli@smartx.com>
Subject: [PATCH v2 2/2] nvme/tcp: Add wq_unbound modparam for nvme_tcp_wq

The default nvme_tcp_wq will use all CPUs to process tasks. Sometimes it is necessary to set CPU affinity to improve performance.

A new module parameter wq_unbound is added here. If set to true, users can configure cpu affinity through /sys/devices/virtual/workqueue/nvme_tcp_wq/cpumask.

Signed-off-by: Li Feng <fengli@smartx.com>
---
 drivers/nvme/host/tcp.c | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 2ec1186db0a3..34a882b2ec53 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -36,6 +36,14 @@ static int so_priority;  module_param(so_priority, int, 0644);  MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority");

+/*
+ * Use the unbound workqueue for nvme_tcp_wq, then we can set the cpu
+affinity
+ * from sysfs.
+ */
+static bool wq_unbound;
+module_param(wq_unbound, bool, 0644);
+MODULE_PARM_DESC(wq_unbound, "Use unbound workqueue for nvme-tcp IO
+context (default false)");
+
 /*
  * TLS handshake timeout
  */
@@ -1551,7 +1559,10 @@ static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
        else if (nvme_tcp_poll_queue(queue))
                n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
                                ctrl->io_queues[HCTX_TYPE_READ] - 1;
-       queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
+       if (wq_unbound)
+               queue->io_cpu = WORK_CPU_UNBOUND;
+       else
+               queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
 }

 static void nvme_tcp_tls_done(void *data, int status, key_serial_t pskid) @@ -2790,6 +2801,8 @@ static struct nvmf_transport_ops nvme_tcp_transport = {

 static int __init nvme_tcp_init_module(void)  {
+       unsigned int wq_flags = WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_SYSFS;
+
        BUILD_BUG_ON(sizeof(struct nvme_tcp_hdr) != 8);
        BUILD_BUG_ON(sizeof(struct nvme_tcp_cmd_pdu) != 72);
        BUILD_BUG_ON(sizeof(struct nvme_tcp_data_pdu) != 24); @@ -2799,8 +2812,10 @@ static int __init nvme_tcp_init_module(void)
        BUILD_BUG_ON(sizeof(struct nvme_tcp_icresp_pdu) != 128);
        BUILD_BUG_ON(sizeof(struct nvme_tcp_term_pdu) != 24);

-       nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq",
-                       WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_SYSFS, 0);
+       if (wq_unbound)
+               wq_flags |= WQ_UNBOUND;
+
+       nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq", wq_flags, 0);
        if (!nvme_tcp_wq)
                return -ENOMEM;

--
2.44.0

Anton

-----Original Message-----
From: Li Feng <fengli@smartx.com>
Sent: Wednesday, March 13, 2024 2:38 PM
To: Keith Busch <kbusch@kernel.org>; Jens Axboe <axboe@kernel.dk>; Christoph Hellwig <hch@lst.de>; Sagi Grimberg <sagi@grimberg.me>; open list:NVM EXPRESS DRIVER <linux-nvme@lists.infradead.org>; open list <linux-kernel@vger.kernel.org>
Cc: Anton Gavriliuk <Anton.Gavriliuk@hpe.ua>; Li Feng <fengli@smartx.com>
Subject: [PATCH v2 2/2] nvme/tcp: Add wq_unbound modparam for nvme_tcp_wq

The default nvme_tcp_wq will use all CPUs to process tasks. Sometimes it is necessary to set CPU affinity to improve performance.

A new module parameter wq_unbound is added here. If set to true, users can configure cpu affinity through /sys/devices/virtual/workqueue/nvme_tcp_wq/cpumask.

Signed-off-by: Li Feng <fengli@smartx.com>
---
 drivers/nvme/host/tcp.c | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 2ec1186db0a3..34a882b2ec53 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -36,6 +36,14 @@ static int so_priority;  module_param(so_priority, int, 0644);  MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority");

+/*
+ * Use the unbound workqueue for nvme_tcp_wq, then we can set the cpu
+affinity
+ * from sysfs.
+ */
+static bool wq_unbound;
+module_param(wq_unbound, bool, 0644);
+MODULE_PARM_DESC(wq_unbound, "Use unbound workqueue for nvme-tcp IO
+context (default false)");
+
 /*
  * TLS handshake timeout
  */
@@ -1551,7 +1559,10 @@ static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
        else if (nvme_tcp_poll_queue(queue))
                n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
                                ctrl->io_queues[HCTX_TYPE_READ] - 1;
-       queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
+       if (wq_unbound)
+               queue->io_cpu = WORK_CPU_UNBOUND;
+       else
+               queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
 }

 static void nvme_tcp_tls_done(void *data, int status, key_serial_t pskid) @@ -2790,6 +2801,8 @@ static struct nvmf_transport_ops nvme_tcp_transport = {

 static int __init nvme_tcp_init_module(void)  {
+       unsigned int wq_flags = WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_SYSFS;
+
        BUILD_BUG_ON(sizeof(struct nvme_tcp_hdr) != 8);
        BUILD_BUG_ON(sizeof(struct nvme_tcp_cmd_pdu) != 72);
        BUILD_BUG_ON(sizeof(struct nvme_tcp_data_pdu) != 24); @@ -2799,8 +2812,10 @@ static int __init nvme_tcp_init_module(void)
        BUILD_BUG_ON(sizeof(struct nvme_tcp_icresp_pdu) != 128);
        BUILD_BUG_ON(sizeof(struct nvme_tcp_term_pdu) != 24);

-       nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq",
-                       WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_SYSFS, 0);
+       if (wq_unbound)
+               wq_flags |= WQ_UNBOUND;
+
+       nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq", wq_flags, 0);
        if (!nvme_tcp_wq)
                return -ENOMEM;

--
2.44.0



^ permalink raw reply	[relevance 4%]

* Re: [PATCH 1/5] block: move discard checks into the ioctl handler
    2024-03-12 22:12  5%   ` Keith Busch
@ 2024-03-13 15:40  5%   ` Keith Busch
  1 sibling, 0 replies; 200+ results
From: Keith Busch @ 2024-03-13 15:40 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Chandan Babu R, linux-block, linux-nvme, linux-xfs

On Tue, Mar 12, 2024 at 08:45:27AM -0600, Christoph Hellwig wrote:
> @@ -95,6 +95,8 @@ static int compat_blkpg_ioctl(struct block_device *bdev,
>  static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
>  		unsigned long arg)
>  {
> +	sector_t bs_mask = (bdev_logical_block_size(bdev) >> SECTOR_SHIFT) - 1;
> +	sector_t sector, nr_sects;
>  	uint64_t range[2];
>  	uint64_t start, len;
>  	struct inode *inode = bdev->bd_inode;
> @@ -105,18 +107,21 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
>  
>  	if (!bdev_max_discard_sectors(bdev))
>  		return -EOPNOTSUPP;
> +	if (bdev_read_only(bdev))
> +		return -EPERM;
>  
>  	if (copy_from_user(range, (void __user *)arg, sizeof(range)))
>  		return -EFAULT;
>  
>  	start = range[0];
>  	len = range[1];
> +	sector = start >> SECTOR_SHIFT;
> +	nr_sects = len >> SECTOR_SHIFT;
>  
> -	if (start & 511)
> +	if (!nr_sects)
>  		return -EINVAL;
> -	if (len & 511)
> +	if ((sector | nr_sects) & bs_mask)
>  		return -EINVAL;
> -
>  	if (start + len > bdev_nr_bytes(bdev))
>  		return -EINVAL;
>  
> @@ -124,7 +129,7 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
>  	err = truncate_bdev_range(bdev, mode, start, start + len - 1);
>  	if (err)
>  		goto fail;
> -	err = blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
> +	err = blkdev_issue_discard(bdev, sector, nr_sects, GFP_KERNEL);
>  fail:
>  	filemap_invalidate_unlock(inode->i_mapping);
>  	return err;
> -- 

The incremental change I think you want atop this patch to keep the
previous behavior:

-- >8 --
diff --git b/block/ioctl.c a/block/ioctl.c
index 57c8171fda93c..e14388548ab97 100644
--- b/block/ioctl.c
+++ a/block/ioctl.c
@@ -95,7 +95,7 @@ static int compat_blkpg_ioctl(struct block_device *bdev,
 static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
 		unsigned long arg)
 {
-	sector_t bs_mask = (bdev_logical_block_size(bdev) >> SECTOR_SHIFT) - 1;
+	sector_t mask = bdev_logical_block_size(bdev) - 1;
 	sector_t sector, nr_sects;
 	uint64_t range[2];
 	uint64_t start, len;
@@ -120,7 +120,7 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
 
 	if (!nr_sects)
 		return -EINVAL;
-	if ((sector | nr_sects) & bs_mask)
+	if ((start | len) & mask)
 		return -EINVAL;
 	if (start + len > bdev_nr_bytes(bdev))
 		return -EINVAL;


^ permalink raw reply related	[relevance 5%]

* Re: [PATCH 1/5] block: move discard checks into the ioctl handler
  @ 2024-03-13  1:22  5%       ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-03-13  1:22 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Chandan Babu R, linux-block, linux-nvme, linux-xfs

On Tue, Mar 12, 2024 at 11:31:31PM +0100, Christoph Hellwig wrote:
> On Tue, Mar 12, 2024 at 04:12:54PM -0600, Keith Busch wrote:
> > > +	if (!nr_sects)
> > >  		return -EINVAL;
> > > +	if ((sector | nr_sects) & bs_mask)
> > >  		return -EINVAL;
> > > -
> > >  	if (start + len > bdev_nr_bytes(bdev))
> > >  		return -EINVAL;
> > 
> > Maybe you want to shift lower bytes out of consideration, but it is
> > different, right? For example, if I call this ioctl with start=5 and
> > len=555, it would return EINVAL, but your change would let it succeed
> > the same as if start=0, len=512.
> 
> We did the same before, just down in __blkdev_issue_discard instead of
> in the ioctl handler.

Here's an example program demonstrating the difference:

discard-test.c:
---
#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <linux/fs.h>
#include <sys/ioctl.h>

int main(int argc, char **argv)
{
	uint64_t range[2];
	int fd;

	if (argc < 2)
	        return -1;

	fd = open(argv[1], O_RDWR);
	if (fd < 0)
	        return fd;

	range[0] = 5;
	range[1] = 555;
	ioctl(fd, BLKDISCARD, &range);
	perror("BLKDISCARD");

	return 0;
}
--

Before:

 # ./discard-test /dev/nvme0n1
 BLKDISCARD: Invalid argument

After:

 # ./discard-test /dev/nvme0n1
 BLKDISCARD: Success


^ permalink raw reply	[relevance 5%]

* Re: [PATCH 1/5] block: move discard checks into the ioctl handler
  @ 2024-03-12 22:12  5%   ` Keith Busch
    2024-03-13 15:40  5%   ` Keith Busch
  1 sibling, 1 reply; 200+ results
From: Keith Busch @ 2024-03-12 22:12 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Chandan Babu R, linux-block, linux-nvme, linux-xfs

On Tue, Mar 12, 2024 at 08:45:27AM -0600, Christoph Hellwig wrote:
> @@ -95,6 +95,8 @@ static int compat_blkpg_ioctl(struct block_device *bdev,
>  static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
>  		unsigned long arg)
>  {
> +	sector_t bs_mask = (bdev_logical_block_size(bdev) >> SECTOR_SHIFT) - 1;
> +	sector_t sector, nr_sects;
>  	uint64_t range[2];
>  	uint64_t start, len;
>  	struct inode *inode = bdev->bd_inode;
> @@ -105,18 +107,21 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
>  
>  	if (!bdev_max_discard_sectors(bdev))
>  		return -EOPNOTSUPP;
> +	if (bdev_read_only(bdev))
> +		return -EPERM;
>  
>  	if (copy_from_user(range, (void __user *)arg, sizeof(range)))
>  		return -EFAULT;
>  
>  	start = range[0];
>  	len = range[1];
> +	sector = start >> SECTOR_SHIFT;
> +	nr_sects = len >> SECTOR_SHIFT;
>  
> -	if (start & 511)
> +	if (!nr_sects)
>  		return -EINVAL;
> -	if (len & 511)
> +	if ((sector | nr_sects) & bs_mask)
>  		return -EINVAL;
> -
>  	if (start + len > bdev_nr_bytes(bdev))
>  		return -EINVAL;

Maybe you want to shift lower bytes out of consideration, but it is
different, right? For example, if I call this ioctl with start=5 and
len=555, it would return EINVAL, but your change would let it succeed
the same as if start=0, len=512.


^ permalink raw reply	[relevance 5%]

* RE: [PATCH v3 0/1] nvme: Fix problem when booting from NVMe drive was leading to a hang.
  2024-03-12 14:38  5% ` Keith Busch
@ 2024-03-12 15:17  4%   ` Michael Kropaczek
  0 siblings, 0 replies; 200+ results
From: Michael Kropaczek @ 2024-03-12 15:17 UTC (permalink / raw)
  To: Keith Busch; +Cc: linux-nvme, Jens Axboe, Christoph Hellwig, Sagi Grimberg

Thank you, Keith,

Understood,

  Michael 

-----Original Message-----
From: Keith Busch <kbusch@kernel.org> 
Sent: Tuesday, March 12, 2024 7:39 AM
To: Michael Kropaczek <Michael.Kropaczek@solidigm.com>
Cc: linux-nvme@lists.infradead.org; Jens Axboe <axboe@fb.com>; Christoph Hellwig <hch@lst.de>; Sagi Grimberg <sagi@grimberg.me>
Subject: Re: [PATCH v3 0/1] nvme: Fix problem when booting from NVMe drive was leading to a hang.

Caution: External Email


On Mon, Mar 11, 2024 at 03:08:11PM -0700, Michael Kropaczek wrote:
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 
> e6267a6aa380..bda7cf5ff674 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -2593,6 +2593,9 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
>                       nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
>       }
>
> +     if (shutdown && dev->hmb)
> +             nvme_set_host_mem(dev, 0);
> +

As I said last time, this deadlocks if the controller doesn't respond to the command. While that shouldn't happen with a properly functioning controller, we can't count on that to prevent a deadlock.

>       nvme_quiesce_io_queues(&dev->ctrl);
>
>       if (!dead && dev->ctrl.queue_count > 0) {
> --


^ permalink raw reply	[relevance 4%]

* Re: [PATCH 2/5] block: add a bio_chain_and_submit helper
  @ 2024-03-12 14:51  5%   ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-03-12 14:51 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Chandan Babu R, linux-block, linux-nvme, linux-xfs

On Tue, Mar 12, 2024 at 08:45:28AM -0600, Christoph Hellwig wrote:
> +struct bio *bio_chain_and_submit(struct bio *prev, struct bio *new)
>  {
> -	struct bio *new = bio_alloc(bdev, nr_pages, opf, gfp);
> -
> -	if (bio) {
> -		bio_chain(bio, new);
> -		submit_bio(bio);
> +	if (prev) {
> +		bio_chain(prev, new);
> +		submit_bio(prev);
>  	}
> -
>  	return new;
>  }
> +EXPORT_SYMBOL_GPL(bio_chain_and_submit);
> +
> +struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev,
> +		unsigned int nr_pages, blk_opf_t opf, gfp_t gfp)
> +{
> +	return bio_chain_and_submit(bio, bio_alloc(bdev, nr_pages, opf, gfp));
> +}

I realize you're not changing any behavior here, but I want to ask, is
bio_alloc() always guaranteed to return a valid bio? It sure looks like
it can return NULL under some uncommon conditions, but I can't find
anyone checking the result. So I guess it's safe?


^ permalink raw reply	[relevance 5%]

* Re: [PATCH v3 0/1] nvme: Fix problem when booting from NVMe drive was leading to a hang.
  2024-03-11 22:08  3% [PATCH v3 0/1] nvme: Fix problem when booting from NVMe drive was leading to a hang Michael Kropaczek
@ 2024-03-12 14:38  5% ` Keith Busch
  2024-03-12 15:17  4%   ` Michael Kropaczek
  0 siblings, 1 reply; 200+ results
From: Keith Busch @ 2024-03-12 14:38 UTC (permalink / raw)
  To: Michael Kropaczek
  Cc: linux-nvme, Jens Axboe, Christoph Hellwig, Sagi Grimberg

On Mon, Mar 11, 2024 at 03:08:11PM -0700, Michael Kropaczek wrote:
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index e6267a6aa380..bda7cf5ff674 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -2593,6 +2593,9 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
>  			nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
>  	}
>  
> +	if (shutdown && dev->hmb)
> +		nvme_set_host_mem(dev, 0);
> +

As I said last time, this deadlocks if the controller doesn't respond to
the command. While that shouldn't happen with a properly functioning
controller, we can't count on that to prevent a deadlock.

>  	nvme_quiesce_io_queues(&dev->ctrl);
>  
>  	if (!dead && dev->ctrl.queue_count > 0) {
> -- 


^ permalink raw reply	[relevance 5%]

* Re: [PATCH RESEND] nvme-pci: Fix EEH failure on ppc after subsystem reset
  @ 2024-03-12 14:30  5%             ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-03-12 14:30 UTC (permalink / raw)
  To: Nilay Shroff; +Cc: linux-nvme, axboe, hch, sagi, linux-block, gjoyce

On Mon, Mar 11, 2024 at 06:28:21PM +0530, Nilay Shroff wrote:
> @@ -3295,10 +3304,13 @@ static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev,
>         case pci_channel_io_frozen:
>                 dev_warn(dev->ctrl.device,
>                         "frozen state error detected, reset controller\n");
> -               if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING)) {
> -                       nvme_dev_disable(dev, true);
> -                       return PCI_ERS_RESULT_DISCONNECT;
> +               if (nvme_ctrl_state(&dev->ctrl) != NVME_CTRL_RESETTING) {
> +                       if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING)) {
> +                               nvme_dev_disable(dev, true);
> +                               return PCI_ERS_RESULT_DISCONNECT;
> +                       }
>                 }
> +               flush_work(&dev->ctrl.reset_work);

I was messing with a similar idea. I wasn't sure if EEH calls the error
handler inline with the error, in which case this would try to flush the
work within the same work, which obviously doesn't work. As long as its
called from a different thread, then this should be fine.

>                 nvme_dev_disable(dev, false);
>                 return PCI_ERS_RESULT_NEED_RESET;
>         case pci_channel_io_perm_failure:
> 
> The flush_work() would ensure that we don't disable the ctrl if reset_work 
> is running. If the rest_work is *not* running currently then flush_work() should
> return immediately. Moreover, if reset_work is scheduled or start running after
> flush_work() returns then reset_work should not be able to get upto the CONNECTING
> state because pci recovery is in progress and so it should fail early.
> 
> On the reset_work side other than detecting pci error recovery, I think we also 
> need one another change where in case the ctrl state is set to CONNECTING and we 
> detect the pci error recovery in progress then before returning from the reset_work
> we set the ctrl state to RESETTING so that error_detected() could forward progress.
> The changes should be something as below:
> 
> @@ -2776,6 +2776,16 @@ static void nvme_reset_work(struct work_struct *work)
>   out_unlock:
>         mutex_unlock(&dev->shutdown_lock);
>   out:
> +       /*
> +        * If PCI recovery is ongoing then let it finish first
> +        */
> +       if (pci_channel_offline(to_pci_dev(dev->dev))) {
> +               dev_warn(dev->ctrl.device, "PCI recovery is ongoing so let it finish\n");
> +               if (nvme_ctrl_state(&dev->ctrl) != NVME_CTRL_RESETTING)
> +                       WRITE_ONCE(dev->ctrl.state, NVME_CTRL_RESETTING);

This may break the state machine, like if the device was hot removed
during all this error handling. This will force the state back to
RESETTING when it should be DEAD.

I think what you need is just allow a controller to reset from a
connecting state. Have to be careful that wouldn't break any other
expectations, though.


^ permalink raw reply	[relevance 5%]

* [PATCH v3 0/1] nvme: Fix problem when booting from NVMe drive was leading to a hang.
@ 2024-03-11 22:08  3% Michael Kropaczek
  2024-03-12 14:38  5% ` Keith Busch
  0 siblings, 1 reply; 200+ results
From: Michael Kropaczek @ 2024-03-11 22:08 UTC (permalink / raw)
  To: linux-nvme
  Cc: Michael Kropaczek, Keith Busch, Jens Axboe, Christoph Hellwig,
	Sagi Grimberg

Description:

During an endurance test, when a system was rebooted from an NVMe drive,
the boot process hung occasionally. The number of reboot cycles was set
to 1000, with an interval of 120s. The hang occurred after ~300 reboot
cycles. After investigating the cause, it was established that the NVMe
driver did not disable host memory during shutdown, leaving the NVMe
controller in a state preventing proper initialization in the BIOS
pre-boot stage. Adding the call to nvme_set_host_mem(dev, 0), when in
shutdown, fixed the issue.
In this version redundant comments were removed.

Michael Kropaczek (1):
  nvme: Fix problem when booting from NVMe drive was leading to a hang.

 drivers/nvme/host/pci.c | 3 +++
 1 file changed, 3 insertions(+)


base-commit: 8d30528a170905ede9ab6ab81f229e441808590b
-- 
2.34.1

From 054a4e846d967b0b37d0f398166c1f56ef536a72 Mon Sep 17 00:00:00 2001
From: Michael Kropaczek <michael.kropaczek@solidigm.com>
Date: Thu, 7 Mar 2024 14:51:30 -0800
Subject: [PATCH v3 1/1] nvme: Fix problem when booting from NVMe drive was
 leading to a hang.
To: linux-nvme@lists.infradead.org
Cc: Keith Busch <kbusch@kernel.org>,
    Jens Axboe <axboe@fb.com>,
    Christoph Hellwig <hch@lst.de>,
    Sagi Grimberg <sagi@grimberg.me>,
    Michael Kropaczek <michael.kropaczek@solidigm.com>

On certain host architectures/HW, DRAM was keeping memory contents over reboot
cycles. Certain NVMe controllers were accessing host memory after startup which
led to undefined state, preventing proper initialization in BIOS boot stage.
Freeing host memory during host's shutdown prevents the problem from occurring.

Signed-off-by: Michael Kropaczek <michael.kropaczek@solidigm.com>
---
 drivers/nvme/host/pci.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index e6267a6aa380..bda7cf5ff674 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2593,6 +2593,9 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 			nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
 	}
 
+	if (shutdown && dev->hmb)
+		nvme_set_host_mem(dev, 0);
+
 	nvme_quiesce_io_queues(&dev->ctrl);
 
 	if (!dead && dev->ctrl.queue_count > 0) {
-- 
2.34.1



^ permalink raw reply related	[relevance 3%]

* Re: [PATCH RESEND] nvme-pci: Fix EEH failure on ppc after subsystem reset
  @ 2024-03-11  4:41  5%         ` Keith Busch
    0 siblings, 1 reply; 200+ results
From: Keith Busch @ 2024-03-11  4:41 UTC (permalink / raw)
  To: Nilay Shroff; +Cc: linux-nvme, axboe, hch, sagi, linux-block, gjoyce

On Sun, Mar 10, 2024 at 12:35:06AM +0530, Nilay Shroff wrote:
> On 3/9/24 21:14, Keith Busch wrote:
> > Your patch may observe a ctrl in "RESETTING" state from
> > error_detected(), then disable the controller, which quiesces the admin
> > queue. Meanwhile, reset_work may proceed to CONNECTING state and try
> > nvme_submit_sync_cmd(), which blocks forever because no one is going to
> > unquiesce that admin queue.
> > 
> OK I think I got your point. However, it seems that even without my patch
> the above mentioned deadlock could still be possible. 

I sure hope not. The current design should guarantee forward progress on
devices that failed initialization.

> Without my patch, if error_detcted() observe a ctrl in "RESETTING" state then 
> it still invokes nvme_dev_disable(). The only difference with my patch is that 
> error_detected() returns the PCI_ERS_RESULT_NEED_RESET instead of PCI_ERS_RESULT_DISCONNECT.

There's one more subtle difference: that condition disables with the
'shutdown' parameter set to 'true' which accomplishes a couple things:
all entered requests are flushed to their demise via the final
unquiesce, and all request_queue's are killed which forces error returns
for all new request allocations. No thread will be left waiting for
something that won't happen.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH RESEND] nvme-pci: Fix EEH failure on ppc after subsystem reset
  @ 2024-03-09 15:44  5%     ` Keith Busch
    0 siblings, 1 reply; 200+ results
From: Keith Busch @ 2024-03-09 15:44 UTC (permalink / raw)
  To: Nilay Shroff; +Cc: linux-nvme

On Sat, Mar 09, 2024 at 07:59:11PM +0530, Nilay Shroff wrote:
> On 3/8/24 21:11, Keith Busch wrote:
> > On Fri, Feb 09, 2024 at 10:32:16AM +0530, Nilay Shroff wrote:
> >> @@ -2776,6 +2776,14 @@ static void nvme_reset_work(struct work_struct *work)
> >>   out_unlock:
> >>  	mutex_unlock(&dev->shutdown_lock);
> >>   out:
> >> +	/*
> >> +	 * If PCI recovery is ongoing then let it finish first
> >> +	 */
> >> +	if (pci_channel_offline(to_pci_dev(dev->dev))) {
> >> +		dev_warn(dev->ctrl.device, "PCI recovery is ongoing so let it finish\n");
> >> +		return;
> >> +	}
> >> +
> >>  	/*
> >>  	 * Set state to deleting now to avoid blocking nvme_wait_reset(), which
> >>  	 * may be holding this pci_dev's device lock.
> >> @@ -3295,9 +3303,11 @@ static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev,
> >>  	case pci_channel_io_frozen:
> >>  		dev_warn(dev->ctrl.device,
> >>  			"frozen state error detected, reset controller\n");
> >> -		if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING)) {
> >> -			nvme_dev_disable(dev, true);
> >> -			return PCI_ERS_RESULT_DISCONNECT;
> >> +		if (nvme_ctrl_state(&dev->ctrl) != NVME_CTRL_RESETTING) {
> >> +			if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING)) {
> >> +				nvme_dev_disable(dev, true);
> >> +				return PCI_ERS_RESULT_DISCONNECT;
> >> +			}
> >>  		}
> >>  		nvme_dev_disable(dev, false);
> >>  		return PCI_ERS_RESULT_NEED_RESET;
> > 
> > I get what you're trying to do, but it looks racy. The reset_work may
> > finish before pci sets channel offline, or the error handling work
> > happens to see RESETTING state, but then transitions to CONNECTING state
> > after and deadlocks on the '.resume()' side. You are counting on a very
> > specific sequence tied to the PCIe error handling module, and maybe you
> > are able to count on that sequence for your platform in this unique
> > scenario, but these link errors could happen anytime.
> > 
> I am not sure about the deadlock in '.resume()' side you mentioned above.
> Did you mean that deadlock occur due to someone holding this pci_dev's device lock?
> Or deadlock occur due to the flush_work() from nvme_error_resume() would never 
> return?

Your patch may observe a ctrl in "RESETTING" state from
error_detected(), then disable the controller, which quiesces the admin
queue. Meanwhile, reset_work may proceed to CONNECTING state and try
nvme_submit_sync_cmd(), which blocks forever because no one is going to
unquiesce that admin queue.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH v3] nvme: fix reconnection fail due to reserved tag allocation
  2024-03-08 16:47  5%         ` Keith Busch
@ 2024-03-08 16:54  4%           ` 许春光
  0 siblings, 0 replies; 200+ results
From: 许春光 @ 2024-03-08 16:54 UTC (permalink / raw)
  To: Keith Busch; +Cc: Christoph Hellwig, axboe, sagi, linux-nvme, linux-kernel

Keith Busch <kbusch@kernel.org> 于2024年3月9日周六 00:48写道:
>
> On Sat, Mar 09, 2024 at 12:43:12AM +0800, 许春光 wrote:
> > Sorry for delay to reply, I found the patch have applied just about 10
> > minutes ago.
> > According what you plan to do, I think as-is maybe fine, But anyway if
> > need, I will
> > send another patch to cleanup, thanks.
>
> The next pull request will be late next week, so I'll back out the
> commit if you want to submit something else before then. Or you can just
> submit an incremental improvement against the current tree, and that's
> also fine.

Got, I will send V4 according to the suggestion of Christoph, thanks.


^ permalink raw reply	[relevance 4%]

* Re: [PATCH v3] nvme: fix reconnection fail due to reserved tag allocation
  @ 2024-03-08 16:47  5%         ` Keith Busch
  2024-03-08 16:54  4%           ` 许春光
  0 siblings, 1 reply; 200+ results
From: Keith Busch @ 2024-03-08 16:47 UTC (permalink / raw)
  To: 许春光
  Cc: Christoph Hellwig, axboe, sagi, linux-nvme, linux-kernel

On Sat, Mar 09, 2024 at 12:43:12AM +0800, 许春光 wrote:
> Sorry for delay to reply, I found the patch have applied just about 10
> minutes ago.
> According what you plan to do, I think as-is maybe fine, But anyway if
> need, I will
> send another patch to cleanup, thanks.

The next pull request will be late next week, so I'll back out the
commit if you want to submit something else before then. Or you can just
submit an incremental improvement against the current tree, and that's
also fine.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH] nvme-apple: Convert to platform remove callback returning void
  @ 2024-03-08 16:21  5% ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-03-08 16:21 UTC (permalink / raw)
  To: Uwe Kleine-König
  Cc: Hector Martin, Sven Peter, Jens Axboe, Christoph Hellwig,
	Sagi Grimberg, Alyssa Rosenzweig, asahi, linux-arm-kernel,
	linux-nvme, kernel

Thanks, applied to nvme-6.9.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH v3 0/2] nvmet: add some commands tracing
  @ 2024-03-08 16:21  5% ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-03-08 16:21 UTC (permalink / raw)
  To: Guixin Liu; +Cc: hch, sagi, kch, linux-nvme

Thanks, applied to nvme-6.9.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH] nvmet-tcp: do not continue for invalid icreq
  @ 2024-03-08 16:20  5% ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-03-08 16:20 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: Christoph Hellwig, Sagi Grimberg, linux-nvme, Hannes Reinecke

Thanks, applied to nvme-6.9.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH v3] nvme: fix reconnection fail due to reserved tag allocation
    @ 2024-03-08 16:20  5% ` Keith Busch
  1 sibling, 0 replies; 200+ results
From: Keith Busch @ 2024-03-08 16:20 UTC (permalink / raw)
  To: brookxu.cn; +Cc: axboe, hch, sagi, linux-nvme, linux-kernel

Thanks, applied to nvme-6.9.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH RESEND] nvme-pci: Fix EEH failure on ppc after subsystem reset
    @ 2024-03-08 15:41  4% ` Keith Busch
    1 sibling, 1 reply; 200+ results
From: Keith Busch @ 2024-03-08 15:41 UTC (permalink / raw)
  To: Nilay Shroff; +Cc: axboe, hch, sagi, linux-nvme, linux-block, gjoyce

On Fri, Feb 09, 2024 at 10:32:16AM +0530, Nilay Shroff wrote:
> @@ -2776,6 +2776,14 @@ static void nvme_reset_work(struct work_struct *work)
>   out_unlock:
>  	mutex_unlock(&dev->shutdown_lock);
>   out:
> +	/*
> +	 * If PCI recovery is ongoing then let it finish first
> +	 */
> +	if (pci_channel_offline(to_pci_dev(dev->dev))) {
> +		dev_warn(dev->ctrl.device, "PCI recovery is ongoing so let it finish\n");
> +		return;
> +	}
> +
>  	/*
>  	 * Set state to deleting now to avoid blocking nvme_wait_reset(), which
>  	 * may be holding this pci_dev's device lock.
> @@ -3295,9 +3303,11 @@ static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev,
>  	case pci_channel_io_frozen:
>  		dev_warn(dev->ctrl.device,
>  			"frozen state error detected, reset controller\n");
> -		if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING)) {
> -			nvme_dev_disable(dev, true);
> -			return PCI_ERS_RESULT_DISCONNECT;
> +		if (nvme_ctrl_state(&dev->ctrl) != NVME_CTRL_RESETTING) {
> +			if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING)) {
> +				nvme_dev_disable(dev, true);
> +				return PCI_ERS_RESULT_DISCONNECT;
> +			}
>  		}
>  		nvme_dev_disable(dev, false);
>  		return PCI_ERS_RESULT_NEED_RESET;

I get what you're trying to do, but it looks racy. The reset_work may
finish before pci sets channel offline, or the error handling work
happens to see RESETTING state, but then transitions to CONNECTING state
after and deadlocks on the '.resume()' side. You are counting on a very
specific sequence tied to the PCIe error handling module, and maybe you
are able to count on that sequence for your platform in this unique
scenario, but these link errors could happen anytime.

And nvme subsystem reset is just odd, it's not clear how it was intended
to be handled. It takes the links down so seems like it requires
re-enumeration from a pcie hotplug driver, and that's kind of how it was
expected to work here, but your platform has a special way to contain
the link event and bring things back up the way they were before. And
the fact you *require* IO to be in flight just so the timeout handler
can dispatch a non-posted transaction 30 seconds later to trigger EEH is
also odd. Why can't EEH just detect the link down event directly?

This driver unfortunately doesn't handle errors during a reset well.
Trying to handle that has been problematic, so the driver just bails if
anything goes wrong at this critical initialization point. Maybe we need
to address the reset/initialization failure handling more generically
and delegate the teardown or retry decision to something else. Messing
with that is pretty fragile right now, though.

Or you could just re-enumerate the slot.

I don't know, sorry my message is not really helping much to get this
fixed.


^ permalink raw reply	[relevance 4%]

* Re: [PATCH] nvme: Re-word D3 Entry Latency message
  2024-03-07 19:17  4% [PATCH] nvme: Re-word D3 Entry Latency message Len Brown
  2024-03-07 19:51  5% ` Keith Busch
@ 2024-03-08  0:11  0% ` Chaitanya Kulkarni
  1 sibling, 0 replies; 200+ results
From: Chaitanya Kulkarni @ 2024-03-08  0:11 UTC (permalink / raw)
  To: Len Brown, kbusch, linux-nvme
  Cc: Max Gurtovoy, axboe, hch, sagi, linux-kernel, Len Brown

On 3/7/24 11:17, Len Brown wrote:
> From: Len Brown <len.brown@intel.com>
>
> Some words are alarming in routine kernel messages.
> "timeout" is one of them, so avoid using it.
>
> Fixes: 1a3838d732ea ("nvme: modify the debug level for setting shutdown timeout")
>
> Suggested-by: Keith Busch <kbusch@kernel.org>
> Signed-off-by: Len Brown <len.brown@intel.com>
> ---
>   drivers/nvme/host/core.c | 2 +-
>

not sure which one will get in, hence ...

Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>

-ck



^ permalink raw reply	[relevance 0%]

* Re: [PATCH] nvme: change shutdown timeout setting message
  2024-03-07 16:59  5% [PATCH] nvme: change shutdown timeout setting message Keith Busch
@ 2024-03-08  0:11  0% ` Chaitanya Kulkarni
  0 siblings, 0 replies; 200+ results
From: Chaitanya Kulkarni @ 2024-03-08  0:11 UTC (permalink / raw)
  To: Keith Busch, linux-nvme; +Cc: hch, sagi, Keith Busch, Len Brown

On 3/7/24 08:59, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
>
> User visible messages containing the word "timeout" can be alarming.
> This one nvme is just reporting a potentially informative device
> configuration, and everything is working as designed. Change the text to
> report the less concerning "D3 entry latency", which is where this value
> comes from anyway.
>
> Reported-by: Len Brown <lenb@kernel.org>
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---


Looks good.

Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>

-ck



^ permalink raw reply	[relevance 0%]

* Re: RFC: untangle and fix __blkdev_issue_discard
    @ 2024-03-07 21:05  5% ` Keith Busch
  1 sibling, 0 replies; 200+ results
From: Keith Busch @ 2024-03-07 21:05 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Chandan Babu R, linux-block, linux-nvme, linux-xfs

On Thu, Mar 07, 2024 at 08:11:47AM -0700, Christoph Hellwig wrote:
> this tries to address the block for-next oops Chandan reported on XFS.
> I can't actually reproduce it unfortunately, but this series should
> sort it out by movign the fatal_signal_pending check out of all but
> the ioctl path.

The last patch moves the fatal_signal_pending check to the
blkdev_issue_discard path, which has more users than just the ioctl path.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH] nvme: Re-word D3 Entry Latency message
  2024-03-07 19:17  4% [PATCH] nvme: Re-word D3 Entry Latency message Len Brown
@ 2024-03-07 19:51  5% ` Keith Busch
  2024-03-08  0:11  0% ` Chaitanya Kulkarni
  1 sibling, 0 replies; 200+ results
From: Keith Busch @ 2024-03-07 19:51 UTC (permalink / raw)
  To: Len Brown; +Cc: linux-nvme, maxg, axboe, hch, sagi, linux-kernel, Len Brown

On Thu, Mar 07, 2024 at 02:17:34PM -0500, Len Brown wrote:
> From: Len Brown <len.brown@intel.com>
> 
> Some words are alarming in routine kernel messages.
> "timeout" is one of them, so avoid using it.
> 
> Fixes: 1a3838d732ea ("nvme: modify the debug level for setting shutdown timeout")
> 
> Suggested-by: Keith Busch <kbusch@kernel.org>
> Signed-off-by: Len Brown <len.brown@intel.com>

Our messages must have crossed:

  https://lore.kernel.org/linux-nvme/20240307165933.3718589-1-kbusch@meta.com/T/#u

I haven't been receiving random messages from the list lately either. Or
maybe I cc'ed the wrong email.

Anyway, we just sent the nvme-6.9 pull request today, so I'll wait a few
more days before starting the next batch.


^ permalink raw reply	[relevance 5%]

* [PATCH] nvme: Re-word D3 Entry Latency message
@ 2024-03-07 19:17  4% Len Brown
  2024-03-07 19:51  5% ` Keith Busch
  2024-03-08  0:11  0% ` Chaitanya Kulkarni
  0 siblings, 2 replies; 200+ results
From: Len Brown @ 2024-03-07 19:17 UTC (permalink / raw)
  To: kbusch, linux-nvme; +Cc: maxg, axboe, hch, sagi, linux-kernel, Len Brown

From: Len Brown <len.brown@intel.com>

Some words are alarming in routine kernel messages.
"timeout" is one of them, so avoid using it.

Fixes: 1a3838d732ea ("nvme: modify the debug level for setting shutdown timeout")

Suggested-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 drivers/nvme/host/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 0a96362912ce..2601dc1c4f42 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3206,7 +3206,7 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl)
 
 		if (ctrl->shutdown_timeout != shutdown_timeout)
 			dev_info(ctrl->device,
-				 "Shutdown timeout set to %u seconds\n",
+				 "D3 entry latency set to %u seconds\n",
 				 ctrl->shutdown_timeout);
 	} else
 		ctrl->shutdown_timeout = shutdown_timeout;
-- 
2.40.1



^ permalink raw reply related	[relevance 4%]

* [PATCH] nvme: change shutdown timeout setting message
@ 2024-03-07 16:59  5% Keith Busch
  2024-03-08  0:11  0% ` Chaitanya Kulkarni
  0 siblings, 1 reply; 200+ results
From: Keith Busch @ 2024-03-07 16:59 UTC (permalink / raw)
  To: linux-nvme; +Cc: hch, sagi, Keith Busch, Len Brown

From: Keith Busch <kbusch@kernel.org>

User visible messages containing the word "timeout" can be alarming.
This one nvme is just reporting a potentially informative device
configuration, and everything is working as designed. Change the text to
report the less concerning "D3 entry latency", which is where this value
comes from anyway.

Reported-by: Len Brown <lenb@kernel.org>
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 2baf5786a92fe..0dcaf3973dc49 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3233,7 +3233,7 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl)
 
 		if (ctrl->shutdown_timeout != shutdown_timeout)
 			dev_info(ctrl->device,
-				 "Shutdown timeout set to %u seconds\n",
+				 "D3 entry latency set to %u seconds\n",
 				 ctrl->shutdown_timeout);
 	} else
 		ctrl->shutdown_timeout = shutdown_timeout;
-- 
2.34.1



^ permalink raw reply related	[relevance 5%]

* Re: [PATCH 1/1] nvme: Use pr_dbg, not pr_info, when setting shutdown timeout
  2024-03-07 15:23  5%       ` Keith Busch
@ 2024-03-07 16:25  4%         ` Len Brown
  0 siblings, 0 replies; 200+ results
From: Len Brown @ 2024-03-07 16:25 UTC (permalink / raw)
  To: Keith Busch
  Cc: Max Gurtovoy, linux-nvme, maxg, axboe, hch, sagi, linux-kernel,
	Len Brown

On Thu, Mar 7, 2024 at 10:23 AM Keith Busch <kbusch@kernel.org> wrote:

> Or maybe we can make the print less scary: how about changing "shutdown
> timeout" to "D3 entry latency"?

Works for me!

(The problem with the word "timeout" is that is is usually something a
human should know about, along with "error" "fail" "abort" etc)

you sending a patch, or shall I?

thanks,
Len Brown, Intel


^ permalink raw reply	[relevance 4%]

* Re: [PATCH 06/10] ext4: switch to using blk_next_discard_bio directly
  @ 2024-03-07 16:13  5%   ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-03-07 16:13 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Chandan Babu R, linux-block, linux-nvme, linux-xfs

On Thu, Mar 07, 2024 at 08:11:53AM -0700, Christoph Hellwig wrote:
> @@ -3840,12 +3840,16 @@ static inline int ext4_issue_discard(struct super_block *sb,
>  	trace_ext4_discard_blocks(sb,
>  			(unsigned long long) discard_block, count);
>  	if (biop) {

Does this 'if' case even need to exist? It looks unreachable since there
are only two callers of ext4_issue_discard(), and they both set 'biop'
to NULL. It looks like the last remaining caller using 'biop' was
removed with 55cdd0af2bc5ffc ("ext4: get discard out of jbd2 commit
kthread contex")

> -		return __blkdev_issue_discard(sb->s_bdev,
> -			(sector_t)discard_block << (sb->s_blocksize_bits - 9),
> -			(sector_t)count << (sb->s_blocksize_bits - 9),
> -			GFP_NOFS, biop);
> -	} else
> -		return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
> +		unsigned int sshift = (sb->s_blocksize_bits - SECTOR_SHIFT);
> +		sector_t sector = (sector_t)discard_block << sshift;
> +		sector_t nr_sects = (sector_t)count << sshift;
> +
> +		while (blk_next_discard_bio(sb->s_bdev, biop, &sector,
> +				&nr_sects, GFP_NOFS))
> +			;

This pattern is repeated often in this series, so perhaps a helper
function for this common use case.


^ permalink raw reply	[relevance 5%]

* [GIT PULL] nvme updates for Linux 6.9
@ 2024-03-07 15:51  4% Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-03-07 15:51 UTC (permalink / raw)
  To: axboe; +Cc: hch, sagi, linux-nvme

[resending; first one did not CC the list]

This is our first nvme pull request for the upcoming 6.9 merge window.

The following changes since commit 268283244c0f018dec8bf4a9c69ce50684561f46:

  nbd: use the atomic queue limits API in nbd_set_size (2024-03-01 09:08:22 -0700)

are available in the Git repository at:

  git://git.infradead.org/nvme.git tags/nvme-6.9-2024-03-07

for you to fetch changes up to 7e80eb792bd7377a20f204943ac31c77d859be89:

  nvme: clear caller pointer on identify failure (2024-03-06 06:29:01 -0800)

----------------------------------------------------------------
nvme updates for Linux 6.9

 - RDMA target enhancements (Max)
 - Fabrics fixes (Max, Guixin, Hannes)
 - Atomic queue_limits usage (Christoph)
 - Const use for class_register (Ricardo)
 - Identification error handling fixes (Shin'ichiro, Keith)

----------------------------------------------------------------
Christoph Hellwig (16):
      nvme: set max_hw_sectors unconditionally
      nvme: move NVME_QUIRK_DEALLOCATE_ZEROES out of nvme_config_discard
      nvme: remove nvme_revalidate_zones
      nvme: move max_integrity_segments handling out of nvme_init_integrity
      nvme: cleanup the nvme_init_integrity calling conventions
      nvme: move blk_integrity_unregister into nvme_init_integrity
      nvme: don't use nvme_update_disk_info for the multipath disk
      nvme: move a few things out of nvme_update_disk_info
      nvme: move setting the write cache flags out of nvme_set_queue_limits
      nvme: move common logic into nvme_update_ns_info
      nvme: split out a nvme_identify_ns_nvm helper
      nvme: don't query identify data in configure_metadata
      nvme: cleanup nvme_configure_metadata
      nvme: use the atomic queue limits update API
      nvme-multipath: pass queue_limits to blk_alloc_disk
      nvme-multipath: use atomic queue limits API for stacking limits

Guixin Liu (1):
      nvme-fabrics: check max outstanding commands

Hannes Reinecke (1):
      nvme-fabrics: typo in nvmf_parse_key()

Keith Busch (1):
      nvme: clear caller pointer on identify failure

Max Gurtovoy (8):
      nvme-rdma: move NVME_RDMA_IP_PORT from common file
      nvmet: compare mqes and sqsize only for IO SQ
      nvmet: set maxcmd to be per controller
      nvmet: set ctrl pi_support cap before initializing cap reg
      nvme-rdma: introduce NVME_RDMA_MAX_METADATA_QUEUE_SIZE definition
      nvme-rdma: clamp queue size according to ctrl cap
      nvmet: introduce new max queue size configuration entry
      nvmet-rdma: set max_queue_size for RDMA transport

Ricardo B. Marliere (3):
      nvme: core: constify struct class usage
      nvme: fabrics: make nvmf_class constant
      nvme: fcloop: make fcloop_class constant

Shin'ichiro Kawasaki (1):
      nvme: host: fix double-free of struct nvme_id_ns in ns_update_nuse()

 drivers/nvme/host/core.c          | 456 ++++++++++++++++++++------------------
 drivers/nvme/host/fabrics.c       |  22 +-
 drivers/nvme/host/multipath.c     |  13 +-
 drivers/nvme/host/nvme.h          |  11 +-
 drivers/nvme/host/rdma.c          |  14 +-
 drivers/nvme/host/sysfs.c         |   7 +-
 drivers/nvme/host/zns.c           |  24 +-
 drivers/nvme/target/admin-cmd.c   |   2 +-
 drivers/nvme/target/configfs.c    |  28 +++
 drivers/nvme/target/core.c        |  18 +-
 drivers/nvme/target/discovery.c   |   2 +-
 drivers/nvme/target/fabrics-cmd.c |   5 +-
 drivers/nvme/target/fcloop.c      |  17 +-
 drivers/nvme/target/nvmet.h       |   6 +-
 drivers/nvme/target/passthru.c    |   2 +-
 drivers/nvme/target/rdma.c        |  10 +
 include/linux/nvme-rdma.h         |   6 +-
 include/linux/nvme.h              |   2 -
 18 files changed, 356 insertions(+), 289 deletions(-)


^ permalink raw reply	[relevance 4%]

* Re: [PATCH 1/1] nvme: Use pr_dbg, not pr_info, when setting shutdown timeout
  2024-03-07 15:17  5%     ` Keith Busch
@ 2024-03-07 15:23  5%       ` Keith Busch
  2024-03-07 16:25  4%         ` Len Brown
  0 siblings, 1 reply; 200+ results
From: Keith Busch @ 2024-03-07 15:23 UTC (permalink / raw)
  To: Len Brown
  Cc: Max Gurtovoy, linux-nvme, maxg, axboe, hch, sagi, linux-kernel,
	Len Brown

On Thu, Mar 07, 2024 at 08:17:05AM -0700, Keith Busch wrote:
> But personally, I don't find this print very useful anymore, so I don't
> care if it gets removed.

Or maybe we can make the print less scary: how about changing "shutdown
timeout" to "D3 entry latency"?


^ permalink raw reply	[relevance 5%]

* Re: [PATCH 1/1] nvme: Use pr_dbg, not pr_info, when setting shutdown timeout
  @ 2024-03-07 15:17  5%     ` Keith Busch
  2024-03-07 15:23  5%       ` Keith Busch
  0 siblings, 1 reply; 200+ results
From: Keith Busch @ 2024-03-07 15:17 UTC (permalink / raw)
  To: Len Brown
  Cc: Max Gurtovoy, linux-nvme, maxg, axboe, hch, sagi, linux-kernel,
	Len Brown

On Thu, Mar 07, 2024 at 09:27:21AM -0500, Len Brown wrote:
> On Thu, Mar 7, 2024 at 4:29 AM Max Gurtovoy <mgurtovoy@nvidia.com> wrote:
> 
> > > Some words are alarming in routine kernel messages.
> > > "timeout" is one of them...
>
> > > Here NVME is routinely setting a timeout value,
> > > rather than reporting that a timeout has occurred.
> >
> > No.
> > see the original commit message
> >
> > "When an NVMe controller reports RTD3 Entry Latency larger than the
> > value of shutdown_timeout module parameter, we update the
> > shutdown_timeout accordingly to honor RTD3 Entry Latency. Use an
> > informational debug level instead of a warning level for it."
> >
> > So this is not a routine flow. This informs users about using a
> > different value than the module param they set.
> 
> I have machines in automated testing.
> Those machines have zero module params.
> This message appears in their dmesg 100% of the time,
> and our dmesg scanner complains about them 100% of the time.
> 
> Is this a bug in the NVME hardware or software?
> 
> If yes, I'll be happy to help  debug it.
> 
> If no, then exactly what action is the informed user supposed to take
> upon seeing this message?
> 
> If none, then the message serves no purpose and should be deleted entirely.

It lets you know that your device takes longer to safely power off than
the module's default tolerance. System low power transitions may take a
long time, and at one point, people wanted to know about that since it
may affect their power management decisions.

This print was partly from when NVMe protocol did not provide a way to
advertise an appropriate shutdown time, and we had no idea what devices
in the wild actually needed. We often just get a dmesg with bug reports,
and knowing device's shutdown timings was helpful at one point with
suspend and power off issues.

You can make the print go away by adding param

  nvme_core.shutdown_timeout=<Largest Observed Value>

But personally, I don't find this print very useful anymore, so I don't
care if it gets removed.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH] nvme: clear caller pointer on identify failure
  2024-03-06 14:25  5% [PATCH] nvme: clear caller pointer on identify failure Keith Busch
@ 2024-03-06 17:45  0% ` Chaitanya Kulkarni
  0 siblings, 0 replies; 200+ results
From: Chaitanya Kulkarni @ 2024-03-06 17:45 UTC (permalink / raw)
  To: Keith Busch, linux-nvme; +Cc: hch, shinichiro.kawasaki, Keith Busch

On 3/6/2024 6:25 AM, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> The memory allocated for the identifification is freed on failure. Set
> it to NULL so the caller doesn't have a pointer to that freed address.
> 
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---

LGTM.

Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com>

-ck



^ permalink raw reply	[relevance 0%]

* Re: [PATCH RESEND] nvme-pci: Fix EEH failure on ppc after subsystem reset
  @ 2024-03-06 15:19  5%     ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-03-06 15:19 UTC (permalink / raw)
  To: Nilay Shroff; +Cc: hch, axboe, sagi, linux-nvme, linux-block, gjoyce

On Wed, Mar 06, 2024 at 04:50:10PM +0530, Nilay Shroff wrote:
> Hi Keith and Christoph,

Sorry for the delay, been very busy recently. I'll revisit this this
week.


^ permalink raw reply	[relevance 5%]

* [PATCH] nvme: clear caller pointer on identify failure
@ 2024-03-06 14:25  5% Keith Busch
  2024-03-06 17:45  0% ` Chaitanya Kulkarni
  0 siblings, 1 reply; 200+ results
From: Keith Busch @ 2024-03-06 14:25 UTC (permalink / raw)
  To: linux-nvme; +Cc: hch, shinichiro.kawasaki, Keith Busch

From: Keith Busch <kbusch@kernel.org>

The memory allocated for the identification is freed on failure. Set
it to NULL so the caller doesn't have a pointer to that freed address.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/nvme/host/core.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index c4d928585ce35..2baf5786a92fe 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1403,8 +1403,10 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
 
 	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
 			sizeof(struct nvme_id_ctrl));
-	if (error)
+	if (error) {
 		kfree(*id);
+		*id = NULL;
+	}
 	return error;
 }
 
@@ -1533,6 +1535,7 @@ int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
 	if (error) {
 		dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error);
 		kfree(*id);
+		*id = NULL;
 	}
 	return error;
 }
-- 
2.34.1



^ permalink raw reply related	[relevance 5%]

* Re: [PATCH v2] nvme: host: fix double-free of struct nvme_id_ns in ns_update_nuse()
  @ 2024-03-06 14:00  5% ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-03-06 14:00 UTC (permalink / raw)
  To: Shin'ichiro Kawasaki
  Cc: linux-nvme, Daniel Wagner, Jens Axboe, Christoph Hellwig,
	Sagi Grimberg, Chaitanya Kulkarni, Hannes Reinecke

On Wed, Mar 06, 2024 at 03:03:03PM +0900, Shin'ichiro Kawasaki wrote:
> When nvme_identify_ns() fails, it frees the pointer to the struct
> nvme_id_ns before it returns. However, ns_update_nuse() calls kfree()
> for the pointer even when nvme_identify_ns() fails. This results in
> KASAN double-free, which was observed with blktests nvme/045 with
> proposed patches [1] on the kernel v6.8-rc7. Fix the double-free by
> skipping kfree() when nvme_identify_ns() fails.

Your patch is good and applied for nvme-6.9. I just want to mention we
have a bit of an inconsistency in how the driver handles this pattern:
nvme_identify_ns_nvm() only sets the caller's pointer on success, but
nvme_identify_ns() and nvme_identify_ctrl() set it all the time. If we'd
only set it on success, then this problem wouldn't happen, so a possible
follow up suggestion to prevent the caller from having a pointer to
freed memory:

---
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index c4d928585ce35..2baf5786a92fe 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1403,8 +1403,10 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
 
 	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
 			sizeof(struct nvme_id_ctrl));
-	if (error)
+	if (error) {
 		kfree(*id);
+		*id = NULL;
+	}
 	return error;
 }
 
@@ -1533,6 +1535,7 @@ int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
 	if (error) {
 		dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error);
 		kfree(*id);
+		*id = NULL;
 	}
 	return error;
 }
--


^ permalink raw reply related	[relevance 5%]

* RE: [PATCH v2 0/1] nvme: Fix problem when booting from NVMe drive was leading to a hang.
  2024-03-05 15:17  5%   ` Keith Busch
@ 2024-03-05 17:49  4%     ` Michael Kropaczek
  0 siblings, 0 replies; 200+ results
From: Michael Kropaczek @ 2024-03-05 17:49 UTC (permalink / raw)
  To: Keith Busch, Christoph Hellwig; +Cc: linux-nvme, Jens Axboe, Sagi Grimberg

Thank you, Keith, Christoph,

The comment will be removed in the next version,

   Michael

-----Original Message-----
From: Keith Busch <kbusch@kernel.org> 
Sent: Tuesday, March 5, 2024 7:18 AM
To: Christoph Hellwig <hch@lst.de>
Cc: Michael Kropaczek <Michael.Kropaczek@solidigm.com>; linux-nvme@lists.infradead.org; Jens Axboe <axboe@fb.com>; Sagi Grimberg <sagi@grimberg.me>
Subject: Re: [PATCH v2 0/1] nvme: Fix problem when booting from NVMe drive was leading to a hang.

Caution: External Email


On Tue, Mar 05, 2024 at 02:51:00PM +0100, Christoph Hellwig wrote:
> > +   /*
> > +    * On certain host architectures/HW, DRAM was keeping memory contents over reboot-cycles.
> > +    * It was observed that certain controllers were accessing host memory after
> > +    * resetting which led to undefined state preventing proper initialization.
> > +    */
>
> Block comments should never span 80 characters.  But more importantly 
> I don't even think we need this comment at all, this is a clear bug 
> fix and the code is self-describing.

It sounds like a firmware bug. Spec says:

  "The host memory resources are not persistent in the controller across
   a reset event."

Exiting a shutdown state requires a CC.EN transition, which is a reset event.

It still may be good practice for the host driver to explicitly disable host memory, but as I said last time, doing this in the shutdown path deadlocks if the drive fails to produce a response, which is why we removed it in the first place.


^ permalink raw reply	[relevance 4%]

* Re: [PATCH 1/2] nvme-core: add ctrl state transition debug helper
  @ 2024-03-05 16:34  5% ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-03-05 16:34 UTC (permalink / raw)
  To: Chaitanya Kulkarni; +Cc: linux-nvme, hch, sagi

On Sun, Feb 11, 2024 at 08:26:40PM -0800, Chaitanya Kulkarni wrote:
> @@ -621,6 +636,9 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
>  	}
>  
>  	if (changed) {
> +		dev_dbg(ctrl->device, "%s %s -> %s\n", __func__,
> +			nvme_ctrl_state_str(old_state),
> +			nvme_ctrl_state_str(new_state));
>  		WRITE_ONCE(ctrl->state, new_state);
>  		wake_up_all(&ctrl->state_wq);
>  	}

Turning the system logs up to debug level might get you a whole lot of
unrelated stuff. Could a new trace_event get you what you need instead?
There's more fine grain control on those.


^ permalink raw reply	[relevance 5%]

* Re: [PATCH 0/3] nvme: constify struct class usage
  @ 2024-03-05 16:02  5%   ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-03-05 16:02 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Ricardo B. Marliere, Jens Axboe, Sagi Grimberg, James Smart,
	Chaitanya Kulkarni, linux-nvme, linux-kernel, Greg Kroah-Hartman

Thanks, applied to nvme-6.9.

For some reason this series didn't arrive to my inbox. I got the patches
from lore, but I probably wouldn't have noticed this series if Christoph
hadn't replied.


^ permalink raw reply	[relevance 5%]

* Re: [RFC RESEND 16/16] nvme-pci: use blk_rq_dma_map() for NVMe SGL
  @ 2024-03-05 15:51  4%   ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-03-05 15:51 UTC (permalink / raw)
  To: Leon Romanovsky
  Cc: Christoph Hellwig, Robin Murphy, Marek Szyprowski, Joerg Roedel,
	Will Deacon, Jason Gunthorpe, Chaitanya Kulkarni,
	Chaitanya Kulkarni, Jonathan Corbet, Jens Axboe, Sagi Grimberg,
	Yishai Hadas, Shameer Kolothum, Kevin Tian, Alex Williamson,
	Jérôme Glisse, Andrew Morton, linux-doc, linux-kernel,
	linux-block, linux-rdma, iommu, linux-nvme, kvm, linux-mm,
	Bart Van Assche, Damien Le Moal, Amir Goldstein, josef,
	Martin K. Petersen, daniel, Dan Williams, jack, Leon Romanovsky,
	Zhu Yanjun

On Tue, Mar 05, 2024 at 01:18:47PM +0200, Leon Romanovsky wrote:
> @@ -236,7 +236,9 @@ struct nvme_iod {
>  	unsigned int dma_len;	/* length of single DMA segment mapping */
>  	dma_addr_t first_dma;
>  	dma_addr_t meta_dma;
> -	struct sg_table sgt;
> +	struct dma_iova_attrs iova;
> +	dma_addr_t dma_link_address[128];
> +	u16 nr_dma_link_address;
>  	union nvme_descriptor list[NVME_MAX_NR_ALLOCATIONS];
>  };

That's quite a lot of space to add to the iod. We preallocate one for
every request, and there could be millions of them. 


^ permalink raw reply	[relevance 4%]

* Re: [PATCH v2 0/1] nvme: Fix problem when booting from NVMe drive was leading to a hang.
  @ 2024-03-05 15:17  5%   ` Keith Busch
  2024-03-05 17:49  4%     ` Michael Kropaczek
  0 siblings, 1 reply; 200+ results
From: Keith Busch @ 2024-03-05 15:17 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Michael Kropaczek, linux-nvme, Jens Axboe, Sagi Grimberg

On Tue, Mar 05, 2024 at 02:51:00PM +0100, Christoph Hellwig wrote:
> > +	/*
> > +	 * On certain host architectures/HW, DRAM was keeping memory contents over reboot-cycles.
> > +	 * It was observed that certain controllers were accessing host memory after
> > +	 * resetting which led to undefined state preventing proper initialization.
> > +	 */
> 
> Block comments should never span 80 characters.  But more importantly
> I don't even think we need this comment at all, this is a clear bug
> fix and the code is self-describing.

It sounds like a firmware bug. Spec says:

  "The host memory resources are not persistent in the controller across
   a reset event."

Exiting a shutdown state requires a CC.EN transition, which is a reset
event.

It still may be good practice for the host driver to explicitly disable
host memory, but as I said last time, doing this in the shutdown path
deadlocks if the drive fails to produce a response, which is why we
removed it in the first place.


^ permalink raw reply	[relevance 5%]

* [PATCH v2 0/1] nvme: Fix problem when booting from NVMe drive was leading to a hang.
@ 2024-03-04 18:25  3% Michael Kropaczek
    0 siblings, 1 reply; 200+ results
From: Michael Kropaczek @ 2024-03-04 18:25 UTC (permalink / raw)
  To: linux-nvme
  Cc: Michael Kropaczek, Keith Busch, Jens Axboe, Christoph Hellwig,
	Sagi Grimberg

Description:

During endurance test, when a system was rebooted from NVMe drive, boot
process hung occasionally. The number of reboot cycles was set to 1000,
with interval of 120s. Hang occurred after ~300 reboot cycles.
After investigating the cause, it was established that NVMe driver
did not disable host memory during shutdown leaving NVMe controller
in a state preventing proper initialization in BIOS pre-boot stage.
Adding of the call to nvme_set_host_mem(dev, 0) when in shutdown
fixed the issue.

Michael Kropaczek (1):
  nvme: Fix problem when booting from NVMe drive was leading to a hang.

 drivers/nvme/host/pci.c | 8 ++++++++
 1 file changed, 8 insertions(+)


base-commit: 8d30528a170905ede9ab6ab81f229e441808590b
-- 
2.34.1

From 9eec234181015af624d8e5cd8670ba5d82d0ce7e Mon Sep 17 00:00:00 2001
From: Michael Kropaczek <michael.kropaczek@solidigm.com>
Date: Thu, 29 Feb 2024 15:33:27 -0800
Subject: [PATCH v2 1/1] nvme: Fix problem when booting from NVMe drive was
 leading to a hang.
To: linux-nvme@lists.infradead.org
Cc: Keith Busch <kbusch@kernel.org>,
    Jens Axboe <axboe@fb.com>,
    Christoph Hellwig <hch@lst.de>,
    Sagi Grimberg <sagi@grimberg.me>,
    Michael Kropaczek <michael.kropaczek@solidigm.com>

On certain host architectures/HW, DRAM was keeping memory contents over reboot
cycles. Certain NVMe controllers were accessing host memory after startup which
led to undefined state, preventing proper initialization in BIOS boot stage.
Freeing host memory during host's shutdown prevents the problem from occurring.

Signed-off-by: Michael Kropaczek <michael.kropaczek@solidigm.com>
---
 drivers/nvme/host/pci.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index e6267a6aa380..e5292c7b301f 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2593,6 +2593,14 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 			nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
 	}
 
+	/*
+	 * On certain host architectures/HW, DRAM was keeping memory contents over reboot-cycles.
+	 * It was observed that certain controllers were accessing host memory after
+	 * resetting which led to undefined state preventing proper initialization.
+	 */
+	if (shutdown && dev->hmb)
+		nvme_set_host_mem(dev, 0);
+
 	nvme_quiesce_io_queues(&dev->ctrl);
 
 	if (!dead && dev->ctrl.queue_count > 0) {
-- 
2.34.1



^ permalink raw reply related	[relevance 3%]

* Re: [PATCH v2 0/2] nvme-fabrics: short-circuit connect retries
  @ 2024-03-04 16:32  5% ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-03-04 16:32 UTC (permalink / raw)
  To: Daniel Wagner
  Cc: Christoph Hellwig, Sagi Grimberg, Hannes Reinecke, linux-nvme,
	James Smart, Chao Leng

On Mon, Mar 04, 2024 at 05:10:04PM +0100, Daniel Wagner wrote:
> I've picked up Hannes' DNR patches. In short, they make the transports behave
> the same way when the DNR bit is set on a re-connect attempt. We had a
> discussion on this topic in the past and, if I got this right, we all agreed
> that the host should honor the DNR bit on a connect attempt [1]

These look good to me. I think, though, if you're submitting a patch on
behalf of another, you should append your 'Signed-off-by' tag after the
author's.


^ permalink raw reply	[relevance 5%]

* Re: convert nvme to atomic queue limits updates v2
  @ 2024-03-04 16:27  5% ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-03-04 16:27 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Hector Martin, Sven Peter, Sagi Grimberg, James Smart,
	Chaitanya Kulkarni, Alyssa Rosenzweig, asahi, linux-nvme

Thanks, applied to nvme-6.9.


^ permalink raw reply	[relevance 5%]

* Re: convert nvme to atomic queue limits updates
  @ 2024-03-02 23:21  5%     ` Keith Busch
  0 siblings, 0 replies; 200+ results
From: Keith Busch @ 2024-03-02 23:21 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Hector Martin, Sven Peter, Sagi Grimberg, James Smart,
	Chaitanya Kulkarni, Alyssa Rosenzweig, asahi, linux-nvme

On Sat, Mar 02, 2024 at 02:59:54PM +0100, Christoph Hellwig wrote:
> On Fri, Mar 01, 2024 at 09:20:31AM -0700, Keith Busch wrote:
> > 
> > The whole series looks good. I see Jens has got the first two patches,
> > so I'll rebase nvme-6.9 to that and apply the rest of the nvme patches.
> 
> I had to fix up the little thing Max noticed an also added another
> patch for multuipath.  I'll resend with those updates once you
> rebased.

Done, nvme-6.9 is rebased to the current for-6.9/block.


^ permalink raw reply	[relevance 5%]

Results 1-200 of ~5000   | reverse | options above
-- pct% links below jump to the message on this page, permalinks otherwise --
2024-01-31  9:12     [PATCH v2 0/3] *** nvme: add some commands tracing *** Guixin Liu
2024-03-13  3:51     ` Guixin Liu
2024-03-14 18:41  5%   ` Keith Busch
2024-02-09  5:02     [PATCH RESEND] nvme-pci: Fix EEH failure on ppc after subsystem reset Nilay Shroff
2024-02-27 18:29     ` Keith Busch
2024-03-06 11:20       ` Nilay Shroff
2024-03-06 15:19  5%     ` Keith Busch
2024-03-08 15:41  4% ` Keith Busch
2024-03-09 14:29       ` Nilay Shroff
2024-03-09 15:44  5%     ` Keith Busch
2024-03-09 19:05           ` Nilay Shroff
2024-03-11  4:41  5%         ` Keith Busch
2024-03-11 12:58               ` Nilay Shroff
2024-03-12 14:30  5%             ` Keith Busch
2024-02-12  4:26     [PATCH 1/2] nvme-core: add ctrl state transition debug helper Chaitanya Kulkarni
2024-03-05 16:34  5% ` Keith Busch
2024-02-28  8:37     [PATCH v3 0/2] nvmet: add some commands tracing Guixin Liu
2024-03-08 16:21  5% ` Keith Busch
2024-02-28 18:11     convert nvme to atomic queue limits updates Christoph Hellwig
2024-03-01 16:20     ` Keith Busch
2024-03-02 13:59       ` Christoph Hellwig
2024-03-02 23:21  5%     ` Keith Busch
2024-03-04 14:04     convert nvme to atomic queue limits updates v2 Christoph Hellwig
2024-03-04 16:27  5% ` Keith Busch
2024-03-04 16:10     [PATCH v2 0/2] nvme-fabrics: short-circuit connect retries Daniel Wagner
2024-03-04 16:32  5% ` Keith Busch
2024-03-04 18:25  3% [PATCH v2 0/1] nvme: Fix problem when booting from NVMe drive was leading to a hang Michael Kropaczek
2024-03-05 13:51     ` Christoph Hellwig
2024-03-05 15:17  5%   ` Keith Busch
2024-03-05 17:49  4%     ` Michael Kropaczek
2024-03-05 11:18     [RFC RESEND 00/16] Split IOMMU DMA mapping operation to two steps Leon Romanovsky
2024-03-05 11:18     ` [RFC RESEND 16/16] nvme-pci: use blk_rq_dma_map() for NVMe SGL Leon Romanovsky
2024-03-05 15:51  4%   ` Keith Busch
2024-05-02 23:32  0% ` [RFC RESEND 00/16] Split IOMMU DMA mapping operation to two steps Zeng, Oak
2024-05-03 11:57  0%   ` Zhu Yanjun
2024-05-03 16:42       ` Jason Gunthorpe
2024-05-03 20:59  0%     ` Zeng, Oak
2024-03-05 13:15     [PATCH 0/3] nvme: constify struct class usage Ricardo B. Marliere
2024-03-05 13:51     ` Christoph Hellwig
2024-03-05 16:02  5%   ` Keith Busch
2024-03-06  6:03     [PATCH v2] nvme: host: fix double-free of struct nvme_id_ns in ns_update_nuse() Shin'ichiro Kawasaki
2024-03-06 14:00  5% ` Keith Busch
2024-03-06 14:25  5% [PATCH] nvme: clear caller pointer on identify failure Keith Busch
2024-03-06 17:45  0% ` Chaitanya Kulkarni
2024-03-07  1:05     [PATCH 1/1] nvme: Use pr_dbg, not pr_info, when setting shutdown timeout Len Brown
2024-03-07  9:28     ` Max Gurtovoy
2024-03-07 14:27       ` Len Brown
2024-03-07 15:17  5%     ` Keith Busch
2024-03-07 15:23  5%       ` Keith Busch
2024-03-07 16:25  4%         ` Len Brown
2024-03-07 15:11     RFC: untangle and fix __blkdev_issue_discard Christoph Hellwig
2024-03-07 15:11     ` [PATCH 06/10] ext4: switch to using blk_next_discard_bio directly Christoph Hellwig
2024-03-07 16:13  5%   ` Keith Busch
2024-03-07 21:05  5% ` RFC: untangle and fix __blkdev_issue_discard Keith Busch
2024-03-07 15:51  4% [GIT PULL] nvme updates for Linux 6.9 Keith Busch
2024-03-07 16:59  5% [PATCH] nvme: change shutdown timeout setting message Keith Busch
2024-03-08  0:11  0% ` Chaitanya Kulkarni
2024-03-07 19:17  4% [PATCH] nvme: Re-word D3 Entry Latency message Len Brown
2024-03-07 19:51  5% ` Keith Busch
2024-03-08  0:11  0% ` Chaitanya Kulkarni
2024-03-08  1:54     [PATCH v3] nvme: fix reconnection fail due to reserved tag allocation brookxu.cn
2024-03-08 14:42     ` Christoph Hellwig
2024-03-08 16:29       ` 许春光
2024-03-08 16:31         ` Christoph Hellwig
2024-03-08 16:43           ` 许春光
2024-03-08 16:47  5%         ` Keith Busch
2024-03-08 16:54  4%           ` 许春光
2024-03-08 16:20  5% ` Keith Busch
2024-03-08  7:11     [PATCH] nvmet-tcp: do not continue for invalid icreq Hannes Reinecke
2024-03-08 16:20  5% ` Keith Busch
2024-03-08  8:51     [PATCH] nvme-apple: Convert to platform remove callback returning void Uwe Kleine-König
2024-03-08 16:21  5% ` Keith Busch
2024-03-11  2:09     [PATCH v4] nvme: fix reconnection fail due to reserved tag allocation brookxu.cn
2024-03-14 18:42  5% ` Keith Busch
2024-03-11 22:08  3% [PATCH v3 0/1] nvme: Fix problem when booting from NVMe drive was leading to a hang Michael Kropaczek
2024-03-12 14:38  5% ` Keith Busch
2024-03-12 15:17  4%   ` Michael Kropaczek
2024-03-12  7:52     [PATCH] nvme: parse zns command's zsa and zrasf to string Guixin Liu
2024-03-14 18:43  5% ` Keith Busch
2024-03-12 14:45     RFCv2: fix fatal signal handling in __blkdev_issue_discard Christoph Hellwig
2024-03-12 14:45     ` [PATCH 1/5] block: move discard checks into the ioctl handler Christoph Hellwig
2024-03-12 22:12  5%   ` Keith Busch
2024-03-12 22:31         ` Christoph Hellwig
2024-03-13  1:22  5%       ` Keith Busch
2024-03-13 15:40  5%   ` Keith Busch
2024-03-12 14:45     ` [PATCH 2/5] block: add a bio_chain_and_submit helper Christoph Hellwig
2024-03-12 14:51  5%   ` Keith Busch
2024-03-13  2:29     [PATCH v2] nvme: use nvme_disk_is_ns_head helper Guixin Liu
2024-03-14 18:43  5% ` Keith Busch
2024-03-13 12:38     [PATCH v2 1/2] nvme-tcp: Export the nvme_tcp_wq to sysfs Li Feng
2024-03-13 12:38     ` [PATCH v2 2/2] nvme/tcp: Add wq_unbound modparam for nvme_tcp_wq Li Feng
2024-03-13 21:07  4%   ` Anton Gavriliuk
2024-03-18 20:41  5% ` [PATCH v2 1/2] nvme-tcp: Export the nvme_tcp_wq to sysfs Keith Busch
2024-03-15 14:31  3% [Bug Report] nvme-cli fails re-formatting NVMe namespace Nilay Shroff
2024-03-15 19:27     [PATCH] drivers/nvme: Add quirks for device 126f:2262 iBug
2024-03-18 20:38  5% ` Keith Busch
2024-03-18 22:33     WQ_UNBOUND workqueue warnings from multiple drivers Kamaljit Singh
2024-03-20  9:11     ` Sagi Grimberg
2024-03-21 17:36       ` Chaitanya Kulkarni
2024-04-02 23:50  5%     ` Kamaljit Singh
2024-04-07 20:08           ` Sagi Grimberg
2024-05-08 23:16  5%         ` Kamaljit Singh
2024-03-21 18:15  4% [GIT PULL] nvme updates for Linux 6.9 Keith Busch
2024-03-21 21:08     [PATCH] nvme-multipath: don't inherit LBA-related fields for the multipath node Christoph Hellwig
2024-03-22 16:22  5% ` Keith Busch
2024-04-02 13:21       ` Nilay Shroff
2024-04-02 14:15  5%     ` Keith Busch
     [not found]     <20240324170552.545730-1-sashal@kernel.org>
2024-03-24 17:05  5% ` [PATCH AUTOSEL 6.8 11/11] nvme: clear caller pointer on identify failure Sasha Levin
     [not found]     <20240324170619.545975-1-sashal@kernel.org>
2024-03-24 17:06  5% ` [PATCH AUTOSEL 6.7 " Sasha Levin
     [not found]     <20240324170645.546220-1-sashal@kernel.org>
2024-03-24 17:06  5% ` [PATCH AUTOSEL 6.6 " Sasha Levin
     [not found]     <20240324170709.546465-1-sashal@kernel.org>
2024-03-24 17:07  5% ` [PATCH AUTOSEL 6.1 7/7] " Sasha Levin
     [not found]     <20240324170735.546735-1-sashal@kernel.org>
2024-03-24 17:07  5% ` [PATCH AUTOSEL 5.4 3/3] " Sasha Levin
2024-03-25 15:45     [PATCH] nvme: initialize identify ns data to NULL Tokunori Ikegami
2024-03-26 15:37  5% ` Keith Busch
2024-03-26 13:38     [PATCH v6 00/10] block atomic writes John Garry
2024-03-26 13:38  3% ` [PATCH v6 01/10] block: Pass blk_queue_get_max_sectors() a request pointer John Garry
2024-04-10 22:58  0%   ` Luis Chamberlain
2024-03-26 13:38  3% ` [PATCH v6 02/10] block: Call blkdev_dio_unaligned() from blkdev_direct_IO() John Garry
2024-04-10 22:53  0%   ` Luis Chamberlain
2024-04-11  8:06  0%     ` John Garry
2024-03-26 13:38  2% ` [PATCH v6 10/10] nvme: Atomic write support John Garry
2024-03-27 17:21     [PATCH 1/2] block: add a helper to cancel atomic queue limit updates Christoph Hellwig
2024-03-27 17:21     ` [PATCH 2/2] nvme: cancel the queue limit update when nvme_update_zone_info fails Christoph Hellwig
2024-03-27 18:01  5%   ` Keith Busch
2024-03-28  6:30  3% [Bug Report] nvme-cli commands fails to open head disk node and print error Nilay Shroff
2024-03-28  8:45     ` Daniel Wagner
2024-04-02 22:07  4%   ` Kamaljit Singh
2024-04-03  3:07  5%     ` Keith Busch
2024-03-28 13:09     [PATCH] nvme-pci: Add sleep quirk for Samsung 990 Evo Werner Sembach
2024-04-02 13:16     ` Christoph Hellwig
2024-04-02 15:13       ` Georg Gottleuber
2024-04-02 16:08  5%     ` Keith Busch
2024-04-02 14:47     [PATCH v2] nvme: split nvme_update_zone_info Christoph Hellwig
2024-04-02 15:12  5% ` Keith Busch
2024-04-02 19:58  5% ` Keith Busch
2024-04-03 11:31     [PATCHv5] nvmet: implement unique discovery NQN Hannes Reinecke
2024-04-04 15:58  5% ` Keith Busch
2024-04-03 12:47     [PATCH] nvme: don't create a multipath node for zero capacity devices Christoph Hellwig
2024-04-04 15:58  5% ` Keith Busch
2024-04-03 13:34     [PATCH] fabrics : allow host to create duplicate connections to target Nilay Shroff
2024-04-03 13:51     ` Daniel Wagner
2024-04-03 16:43  4%   ` Engel, Amit
2024-04-03 14:17     [PATCHv2 0/2] block,nvme: latency-based I/O scheduler Hannes Reinecke
2024-04-04 21:14  5% ` Keith Busch
2024-04-05  6:21       ` Hannes Reinecke
2024-04-05 15:03  5%     ` Keith Busch
     [not found]     <20240403171656.335224-1-sashal@kernel.org>
2024-04-03 17:16  4% ` [PATCH AUTOSEL 6.8 24/28] drivers/nvme: Add quirks for device 126f:2262 Sasha Levin
     [not found]     <20240403171815.342668-1-sashal@kernel.org>
2024-04-03 17:17  4% ` [PATCH AUTOSEL 6.6 17/20] " Sasha Levin
     [not found]     <20240403171909.345570-1-sashal@kernel.org>
2024-04-03 17:18  4% ` [PATCH AUTOSEL 6.1 14/15] " Sasha Levin
     [not found]     <20240403171945.350716-1-sashal@kernel.org>
2024-04-03 17:19  4% ` [PATCH AUTOSEL 5.15 7/8] " Sasha Levin
     [not found]     <20240403172006.353022-1-sashal@kernel.org>
2024-04-03 17:20  4% ` [PATCH AUTOSEL 5.10 " Sasha Levin
2024-04-04 14:41     [PATCH v2 0/2] nvme(t)-fc: couple of fixes/cleanups Daniel Wagner
2024-04-04 15:57  5% ` Keith Busch
2024-04-04 16:59  5% [GIT PULL] nvme fixes for Linux 6.9 Keith Busch
2024-04-07  2:28  3% [PATCH for 6.1.y] nvme: fix miss command type check Tokunori Ikegami
2024-04-07  9:15  4% [PATCH for 5.15.y] " Tokunori Ikegami
2024-04-08 11:32  0% ` Greg Kroah-Hartman
2024-04-09  9:35     [PATCH v5 0/6] nvme-fabrics: short-circuit connect retries Daniel Wagner
2024-04-09  9:35     ` [PATCH v5 4/6] nvme-rdma: short-circuit reconnect retries Daniel Wagner
2024-04-09 14:00       ` Christoph Hellwig
2024-04-09 14:19  5%     ` Keith Busch
2024-04-09 20:28       ` Sagi Grimberg
2024-04-12  2:50  5%     ` Keith Busch
2024-04-12  0:35  5% ` [PATCH v5 0/6] nvme-fabrics: short-circuit connect retries Keith Busch
2024-04-12  7:24       ` Daniel Wagner
2024-04-12 15:24  5%     ` Keith Busch
2024-04-09 19:28     [PATCH] drivers/nvme: Add quirks for device 1cc4:6a14 Holger Huo
2024-04-09 20:14  5% ` Keith Busch
2024-04-10  2:05       ` Holger Huo
2024-04-10  3:08  5%     ` Keith Busch
2024-04-10  0:57     [PATCH v2] nvme: fix warn output about shared namespaces without CONFIG_NVME_MULTIPATH Yi Zhang
2024-04-12  0:35  5% ` Keith Busch
2024-04-10  9:48     [PATCH 0/2] nvmet-auth: fix some minor bugs Maurizio Lombardi
2024-04-16 16:44  5% ` Keith Busch
2024-04-12 13:41     [PATCH] nvmet-auth: return the error code to the nvmet_auth_ctrl_hash() callers Maurizio Lombardi
2024-05-07 15:02  5% ` Keith Busch
2024-04-13  9:04     [PATCH] nvme: find numa distance only if controller has valid numa id Nilay Shroff
2024-04-14  8:30     ` Sagi Grimberg
2024-04-14 11:02       ` Nilay Shroff
2024-04-15  8:55         ` Sagi Grimberg
2024-04-15  9:30           ` Nilay Shroff
2024-04-15 14:39             ` Hannes Reinecke
2024-04-15 16:56  5%           ` Keith Busch
2024-04-16  8:19     [PATCHv3] " Nilay Shroff
2024-04-16 16:44  5% ` Keith Busch
2024-04-17  8:33     nvme-cli spdk plugin Daniel Wagner
2024-04-17 14:26     ` Christoph Hellwig
2024-04-17 14:31       ` Jens Axboe
2024-04-18  6:25         ` Hannes Reinecke
2024-04-18 14:44  5%       ` Keith Busch
2024-04-18 10:39     [PATCHv3] nvme-tcp: strict pdu pacing to avoid send stalls on TLS Hannes Reinecke
2024-04-29 14:09  5% ` Keith Busch
2024-04-18 12:52     [Bug Report] PCIe errinject and hot-unplug causes nvme driver hang Nilay Shroff
2024-04-21 10:28     ` Sagi Grimberg
2024-04-21 16:56       ` Nilay Shroff
2024-04-22 13:00         ` Sagi Grimberg
2024-04-22 13:52  5%       ` Keith Busch
2024-04-22 14:35  5%         ` Keith Busch
2024-04-23  9:52               ` Nilay Shroff
2024-04-24 17:36  5%             ` Keith Busch
2024-04-22  2:08     help re using nvme-cli to sanitize SSD Deane Coleman
2024-04-22  2:49  4% ` Keith Busch
2024-04-22 16:28     [PATCH] nvme-pci: Add quirk for broken MSIs Sean Anderson
2024-04-22 16:49  5% ` Keith Busch
2024-05-07 15:02  5% ` Keith Busch
2024-04-25 14:03     [PATCH] nvme: cancel pending I/O if nvme controller is in terminal state Nilay Shroff
2024-04-29  8:51  5% ` Keith Busch
2024-04-25 18:39     [PATCH 00/10] Read/Write with meta/integrity Kanchan Joshi
     [not found]     ` <CGME20240425184710epcas5p2968bbc40ed10d1f0184bb511af054fcb@epcas5p2.samsung.com>
2024-04-25 18:39       ` [PATCH 10/10] nvme: add separate handling for user integrity buffer Kanchan Joshi
2024-04-25 19:56  5%     ` Keith Busch
     [not found]     ` <CGME20240425184708epcas5p4f1d95cd8d285614f712868d205a23115@epcas5p4.samsung.com>
2024-04-25 18:39       ` [PATCH 09/10] block: add support to send meta buffer Kanchan Joshi
2024-04-26 15:21  5%     ` Keith Busch
     [not found]     ` <CGME20240425184651epcas5p3404f2390d6cf05148eb96e1af093e7bc@epcas5p3.samsung.com>
2024-04-25 18:39       ` [PATCH 01/10] block: set bip_vcnt correctly Kanchan Joshi
2024-04-27  7:02         ` Christoph Hellwig
2024-04-27 14:16  5%       ` Keith Busch
2024-05-01  7:45             ` Christoph Hellwig
2024-05-01  8:03  5%           ` Keith Busch
     [not found]     ` <CGME20240425184653epcas5p28de1473090e0141ae74f8b0a6eb921a7@epcas5p2.samsung.com>
2024-04-25 18:39       ` [PATCH 02/10] block: copy bip_max_vcnt vecs instead of bip_vcnt during clone Kanchan Joshi
2024-04-27  7:03         ` Christoph Hellwig
2024-04-29 11:28           ` Kanchan Joshi
2024-04-29 12:04  5%         ` Keith Busch
2024-04-29 17:07               ` Christoph Hellwig
2024-04-30  8:25  5%             ` Keith Busch
2024-04-28  8:49     [PATCH] nvmet-tcp: fix possible memory leak when tearing down a controller Sagi Grimberg
2024-04-29  8:51  5% ` Keith Busch
2024-04-28  9:25     [PATCH] nvmet: fix nvme status code when namespace is disabled Sagi Grimberg
2024-04-29 11:52  5% ` Keith Busch
2024-04-29 18:37     [PATCH] block: change rq_integrity_vec to respect the iterator Mikulas Patocka
2024-04-30  8:16  5% ` Keith Busch
2024-04-30 13:19     [PATCH v7 0/5] nvme-fabrics: short-circuit connect retries Daniel Wagner
2024-05-01 10:13  5% ` Keith Busch
     [not found]     <19a6071a-7fb8-4e0f-8050-6f3ba5ee4774@molgen.mpg.de>
2024-05-01  4:51     ` `nvme_disable_ctrl()` takes 411 ms on a Dell XPS 13 with SK hynix PC300 NVMEe Christoph Hellwig
2024-05-01  7:58  5%   ` Keith Busch
2024-05-01 20:58         ` Paul Menzel
2024-05-01 22:03  5%       ` Keith Busch
2024-05-02  6:04             ` Paul Menzel
2024-05-02  6:12               ` Paul Menzel
2024-05-02  8:43  4%             ` Keith Busch
2024-05-02 13:00  4% [GIT PULL] nvme fixes for Linux 6.9 Keith Busch
2024-05-05 15:24     [PATCH v2] nvme-rdma, nvme-tcp: include max reconnects for reconnect logging Tokunori Ikegami
2024-05-07 15:54  5% ` Keith Busch
2024-05-07  6:54     [PATCH v2] nvmet: make nvmet_wq unbound Sagi Grimberg
2024-05-07 15:07  5% ` Keith Busch
2024-05-07  6:54     [PATCH v2] nvmet-rdma: Avoid o(n^2) loop in delete_ctrl Sagi Grimberg
2024-05-07  7:26     ` Christoph Hellwig
2024-05-07 15:08  5%   ` Keith Busch
     [not found]     <20240507230800.392128-1-sashal@kernel.org>
2024-05-07 23:07  4% ` [PATCH AUTOSEL 6.8 45/52] nvme: find numa distance only if controller has valid numa id Sasha Levin
2024-05-07 23:07  4% ` [PATCH AUTOSEL 6.8 46/52] nvmet-auth: return the error code to the nvmet_auth_host_hash() callers Sasha Levin
2024-05-07 23:07  4% ` [PATCH AUTOSEL 6.8 47/52] nvmet-auth: replace pr_debug() with pr_err() to report an error Sasha Levin
2024-05-07 23:07  3% ` [PATCH AUTOSEL 6.8 48/52] nvme: cancel pending I/O if nvme controller is in terminal state Sasha Levin
2024-05-07 23:07  4% ` [PATCH AUTOSEL 6.8 49/52] nvmet-tcp: fix possible memory leak when tearing down a controller Sasha Levin
2024-05-07 23:07  3% ` [PATCH AUTOSEL 6.8 50/52] nvmet: fix nvme status code when namespace is disabled Sasha Levin
2024-05-07 23:07  4% ` [PATCH AUTOSEL 6.8 51/52] nvme-tcp: strict pdu pacing to avoid send stalls on TLS Sasha Levin
     [not found]     <20240507231033.393285-1-sashal@kernel.org>
2024-05-07 23:09  4% ` [PATCH AUTOSEL 6.6 37/43] nvme: find numa distance only if controller has valid numa id Sasha Levin
2024-05-07 23:09  4% ` [PATCH AUTOSEL 6.6 38/43] nvmet-auth: return the error code to the nvmet_auth_host_hash() callers Sasha Levin
2024-05-07 23:10  4% ` [PATCH AUTOSEL 6.6 39/43] nvmet-auth: replace pr_debug() with pr_err() to report an error Sasha Levin
2024-05-07 23:10  3% ` [PATCH AUTOSEL 6.6 40/43] nvme: cancel pending I/O if nvme controller is in terminal state Sasha Levin
2024-05-07 23:10  4% ` [PATCH AUTOSEL 6.6 41/43] nvmet-tcp: fix possible memory leak when tearing down a controller Sasha Levin
2024-05-07 23:10  3% ` [PATCH AUTOSEL 6.6 42/43] nvmet: fix nvme status code when namespace is disabled Sasha Levin
     [not found]     <20240507231231.394219-1-sashal@kernel.org>
2024-05-07 23:12  4% ` [PATCH AUTOSEL 6.1 20/25] nvme: find numa distance only if controller has valid numa id Sasha Levin
2024-05-07 23:12  4% ` [PATCH AUTOSEL 6.1 21/25] nvmet-auth: return the error code to the nvmet_auth_host_hash() callers Sasha Levin
2024-05-07 23:12  4% ` [PATCH AUTOSEL 6.1 22/25] nvmet-auth: replace pr_debug() with pr_err() to report an error Sasha Levin
2024-05-07 23:12  4% ` [PATCH AUTOSEL 6.1 23/25] nvmet-tcp: fix possible memory leak when tearing down a controller Sasha Levin
2024-05-07 23:12  3% ` [PATCH AUTOSEL 6.1 24/25] nvmet: fix nvme status code when namespace is disabled Sasha Levin
     [not found]     <20240507231333.394765-1-sashal@kernel.org>
2024-05-07 23:13  4% ` [PATCH AUTOSEL 5.15 14/15] nvme: find numa distance only if controller has valid numa id Sasha Levin
     [not found]     <20240507231406.395123-1-sashal@kernel.org>
2024-05-07 23:14  4% ` [PATCH AUTOSEL 5.10 9/9] " Sasha Levin
     [not found]     <20240507231424.395315-1-sashal@kernel.org>
2024-05-07 23:14  4% ` [PATCH AUTOSEL 5.4 6/6] " Sasha Levin
2024-05-08  7:43     [PATCH] nvmet: prevent sprintf() overflow in nvmet_subsys_nsid_exists() Dan Carpenter
2024-05-08 13:13  5% ` Keith Busch
2024-05-08  7:53     [PATCH v2] nvmet-rdma: fix possible bad dereference when freeing rsps Sagi Grimberg
2024-05-08 13:20  5% ` Keith Busch
2024-05-09 17:48  5% [GIT PULL] nvme fixes for Linux 6.9 Keith Busch
2024-05-10 14:14  5% [PATCH 1/2] genirq/affinity: remove rsvd check against minvec Keith Busch
2024-05-10 14:14  5% ` [PATCH 2/2] nvme-pci: allow unmanaged interrupts Keith Busch
2024-05-10 15:10  0%   ` Christoph Hellwig
2024-05-10 16:20  5%     ` Keith Busch
2024-05-10 23:50  0%       ` Ming Lei
2024-05-11  0:41  5%         ` Keith Busch
2024-05-11  0:59  0%           ` Ming Lei
2024-05-13 13:12  0%     ` Bart Van Assche
2024-05-10 15:15  0% ` [PATCH 1/2] genirq/affinity: remove rsvd check against minvec Ming Lei
2024-05-10 16:47  5%   ` Keith Busch
2024-05-10 17:46  5% [PATCHv2] nvme-pci: allow unmanaged interrupts Keith Busch
2024-05-10 23:47  0% ` Ming Lei
2024-05-11  0:29  5%   ` Keith Busch
2024-05-11  0:44         ` Ming Lei
2024-05-12 14:16           ` Sagi Grimberg
2024-05-12 22:05  5%         ` Keith Busch
2024-05-13  1:12             ` Ming Lei
2024-05-13  4:09  5%           ` Keith Busch
     [not found]     <CGME20240510095142epcas5p4fde65328020139931417f83ccedbce90@epcas5p4.samsung.com>
2024-05-10  9:44     ` [PATCH] block: unmap and free user mapped integrity via submitter Anuj Gupta
2024-05-10 18:56  5%   ` Keith Busch
2024-05-11 11:22  4%     ` Anuj gupta
     [not found]     <CGME20240510134740epcas5p24ef1c2d6e8934c1c79b01c849e7ccb41@epcas5p2.samsung.com>
2024-05-10 13:40     ` [PATCH] nvme: enable FDP support Kanchan Joshi
2024-05-10 19:30  4%   ` Keith Busch
2024-05-14 15:07  4% [GIT PULL] nvme updates for Linux 6.10 Keith Busch
2024-05-14 17:53     [PATCH v4 0/6] block,nvme: queue-depth and latency I/O schedulers John Meneghini
2024-05-14 17:53     ` [PATCH v4 1/6] nvme: multipath: Implemented new iopolicy "queue-depth" John Meneghini
2024-05-20 14:46  5%   ` Keith Busch
2024-05-16 12:13     [PATCH] nvme-multipath: find NUMA path only for online numa-node Nilay Shroff
2024-05-21 13:47  5% ` Keith Busch
2024-05-20 20:20     [PATCH v3 0/1] nvme: queue-depth multipath iopolicy John Meneghini
2024-05-20 20:20     ` [PATCH v3 1/1] nvme: multipath: Implemented new iopolicy "queue-depth" John Meneghini
2024-05-20 20:50  5%   ` Keith Busch
2024-05-21  6:46       ` Hannes Reinecke
2024-05-21 13:58         ` John Meneghini
2024-05-21 14:10  5%       ` Keith Busch
2024-05-21  8:48       ` Nilay Shroff
2024-05-21 13:05  5%     ` Keith Busch
2024-05-21 17:05  5% [PATCH] nvme: fix multipath batched completion accounting Keith Busch
2024-05-22  0:55  0% ` Chaitanya Kulkarni
2024-05-23  9:59  0% ` Hannes Reinecke
2024-05-21 18:07  5% [PATCH] nvme-multipath: fix io accounting on failover Keith Busch
2024-05-21 18:35  4% ` John Meneghini
2024-05-21 18:55  5%   ` Keith Busch
2024-05-22 13:02  0% ` Nilay Shroff
2024-05-22 14:18  5%   ` Keith Busch
2024-05-23  7:00  0%     ` Nilay Shroff
2024-05-21 20:20     [PATCH v2] nvmet: fix ns enable/disable possible hang Sagi Grimberg
2024-05-22 16:24  5% ` Keith Busch
2024-05-22  9:15  3% [PATCH] nvme-pci: silence a lockdep complaint Sagi Grimberg
2024-05-22 12:18  0% ` Shinichiro Kawasaki
2024-05-22 16:12  5%   ` Keith Busch
2024-05-22 16:28         ` Christoph Hellwig
2024-05-22 18:00           ` Sagi Grimberg
2024-05-22 21:36  6%         ` Keith Busch
2024-05-23  6:54               ` Christoph Hellwig
2024-05-23 10:04                 ` Sagi Grimberg
2024-05-23 12:39                   ` Christoph Hellwig
2024-05-23 13:02                     ` Sagi Grimberg
2024-05-23 13:19                       ` Christoph Hellwig
2024-05-23 13:45                         ` Sagi Grimberg
2024-05-23 15:02  4%                       ` Keith Busch
2024-05-22 15:42     [PATCH v4 0/1] nvme: queue-depth multipath iopolicy John Meneghini
2024-05-22 15:42     ` [PATCH v4 1/1] nvme: multipath: Implemented new iopolicy "queue-depth" John Meneghini
2024-05-22 15:56  5%   ` Keith Busch
2024-05-22 16:23         ` John Meneghini
2024-05-22 16:29  5%       ` Keith Busch
2024-05-22 15:56     [PATCH 0/1] nvme-pci: add quirks for Lexar NM790 Jason Nader
2024-05-22 17:42  5% ` Keith Busch
2024-05-23  9:52  4%   ` Jason Nader
2024-05-22 16:54     [PATCH v5] nvme: multipath: Implemented new iopolicy "queue-depth" John Meneghini
2024-05-22 17:32  5% ` Keith Busch
2024-05-23  6:45       ` Christoph Hellwig
2024-05-23 13:12         ` John Meneghini
2024-05-23 15:56  5%       ` Keith Busch
2024-05-23 15:51  3% [PATCH] nvme: use srcu for namespace list reading Keith Busch
2024-05-23 16:18  0% ` Jens Axboe
2024-05-23 17:20  3% [PATCHv2] nvme: use srcu for iterating namespace list Keith Busch

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).