* [PATCH 1/3] nvme: multipath: Implemented new iopolicy "queue-depth"
@ 2023-11-07 21:23 Ewan D. Milne
  2023-11-07 21:23 ` [PATCH 2/3] nvme: multipath: only update ctrl->nr_active when using queue-depth iopolicy Ewan D. Milne
                   ` (8 more replies)
  0 siblings, 9 replies; 24+ messages in thread
From: Ewan D. Milne @ 2023-11-07 21:23 UTC (permalink / raw)
  To: linux-nvme; +Cc: tsong

The existing iopolicies are inefficient in some cases, such as
the presence of a path with high latency. The round-robin
policy would use that path as often as the faster paths, which
results in sub-optimal performance.

The queue-depth policy instead sends I/O requests down the path
with the fewest requests in its request queue. Paths
with lower latency will clear requests more quickly and have fewer
requests in their queues compared to "bad" paths. The aim is to
use those paths the most to bring down overall latency.

This implementation adds an atomic variable to the nvme_ctrl
struct to represent the queue depth. It is updated each time a
request specific to that controller starts or ends.

[edm: patch developed by Thomas Song @ Pure Storage, fixed whitespace
      and compilation warnings, updated MODULE_PARM description, and
      fixed potential issue with ->current_path[] being used]

Co-developed-by: Thomas Song <tsong@purestorage.com>
Signed-off-by: Ewan D. Milne <emilne@redhat.com>
---
 drivers/nvme/host/multipath.c | 59 +++++++++++++++++++++++++++++++++--
 drivers/nvme/host/nvme.h      |  2 ++
 2 files changed, 58 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 0a88d7bdc5e3..4c2690cddef3 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -17,6 +17,7 @@ MODULE_PARM_DESC(multipath,
 static const char *nvme_iopolicy_names[] = {
 	[NVME_IOPOLICY_NUMA]	= "numa",
 	[NVME_IOPOLICY_RR]	= "round-robin",
+	[NVME_IOPOLICY_QD]      = "queue-depth",
 };
 
 static int iopolicy = NVME_IOPOLICY_NUMA;
@@ -29,6 +30,8 @@ static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp)
 		iopolicy = NVME_IOPOLICY_NUMA;
 	else if (!strncmp(val, "round-robin", 11))
 		iopolicy = NVME_IOPOLICY_RR;
+	else if (!strncmp(val, "queue-depth", 11))
+		iopolicy = NVME_IOPOLICY_QD;
 	else
 		return -EINVAL;
 
@@ -43,7 +46,7 @@ static int nvme_get_iopolicy(char *buf, const struct kernel_param *kp)
 module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy,
 	&iopolicy, 0644);
 MODULE_PARM_DESC(iopolicy,
-	"Default multipath I/O policy; 'numa' (default) or 'round-robin'");
+	"Default multipath I/O policy; 'numa' (default), 'round-robin' or 'queue-depth'");
 
 void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
 {
@@ -130,6 +133,7 @@ void nvme_mpath_start_request(struct request *rq)
 	if (!blk_queue_io_stat(disk->queue) || blk_rq_is_passthrough(rq))
 		return;
 
+	atomic_inc(&ns->ctrl->nr_active);
 	nvme_req(rq)->flags |= NVME_MPATH_IO_STATS;
 	nvme_req(rq)->start_time = bdev_start_io_acct(disk->part0, req_op(rq),
 						      jiffies);
@@ -142,6 +146,8 @@ void nvme_mpath_end_request(struct request *rq)
 
 	if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
 		return;
+
+	atomic_dec(&ns->ctrl->nr_active);
 	bdev_end_io_acct(ns->head->disk->part0, req_op(rq),
 			 blk_rq_bytes(rq) >> SECTOR_SHIFT,
 			 nvme_req(rq)->start_time);
@@ -329,6 +335,40 @@ static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
 	return found;
 }
 
+static struct nvme_ns *nvme_queue_depth_path(struct nvme_ns_head *head)
+{
+	struct nvme_ns *best_opt = NULL, *best_nonopt = NULL, *ns;
+	unsigned int min_depth_opt = UINT_MAX, min_depth_nonopt = UINT_MAX;
+	unsigned int depth;
+
+	list_for_each_entry_rcu(ns, &head->list, siblings) {
+		if (nvme_path_is_disabled(ns))
+			continue;
+
+		depth = atomic_read(&ns->ctrl->nr_active);
+
+		switch (ns->ana_state) {
+		case NVME_ANA_OPTIMIZED:
+			if (depth < min_depth_opt) {
+				min_depth_opt = depth;
+				best_opt = ns;
+			}
+			break;
+
+		case NVME_ANA_NONOPTIMIZED:
+			if (depth < min_depth_nonopt) {
+				min_depth_nonopt = depth;
+				best_nonopt = ns;
+			}
+			break;
+		default:
+			break;
+		}
+	}
+
+	return best_opt ? best_opt : best_nonopt;
+}
+
 static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
 {
 	return ns->ctrl->state == NVME_CTRL_LIVE &&
@@ -337,15 +377,27 @@ static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
 
 inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
 {
-	int node = numa_node_id();
+	int iopolicy = READ_ONCE(head->subsys->iopolicy);
+	int node;
 	struct nvme_ns *ns;
 
+	/*
+	 * queue-depth iopolicy does not need to reference ->current_path
+	 * but round-robin needs the last path used to advance to the
+	 * next one, and numa will continue to use the last path unless
+	 * it is or has become not optimized
+	 */
+	if (iopolicy == NVME_IOPOLICY_QD)
+		return nvme_queue_depth_path(head);
+
+	node = numa_node_id();
 	ns = srcu_dereference(head->current_path[node], &head->srcu);
 	if (unlikely(!ns))
 		return __nvme_find_path(head, node);
 
-	if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
+	if (iopolicy == NVME_IOPOLICY_RR)
 		return nvme_round_robin_path(head, node, ns);
+
 	if (unlikely(!nvme_path_is_optimized(ns)))
 		return __nvme_find_path(head, node);
 	return ns;
@@ -903,6 +955,7 @@ void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl)
 	mutex_init(&ctrl->ana_lock);
 	timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
 	INIT_WORK(&ctrl->ana_work, nvme_ana_work);
+	atomic_set(&ctrl->nr_active, 0);
 }
 
 int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 39a90b7cb125..f0f3fd8b4197 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -347,6 +347,7 @@ struct nvme_ctrl {
 	size_t ana_log_size;
 	struct timer_list anatt_timer;
 	struct work_struct ana_work;
+	atomic_t nr_active;
 #endif
 
 #ifdef CONFIG_NVME_HOST_AUTH
@@ -390,6 +391,7 @@ struct nvme_ctrl {
 enum nvme_iopolicy {
 	NVME_IOPOLICY_NUMA,
 	NVME_IOPOLICY_RR,
+	NVME_IOPOLICY_QD,
 };
 
 struct nvme_subsystem {
-- 
2.20.1

* [PATCH 2/3] nvme: multipath: only update ctrl->nr_active when using queue-depth iopolicy
  2023-11-07 21:23 [PATCH 1/3] nvme: multipath: Implemented new iopolicy "queue-depth" Ewan D. Milne
@ 2023-11-07 21:23 ` Ewan D. Milne
  2023-11-07 21:42   ` Chaitanya Kulkarni
                     ` (2 more replies)
  2023-11-07 21:23 ` [PATCH 3/3] nvme: multipath: Invalidate current_path when changing iopolicy Ewan D. Milne
                   ` (7 subsequent siblings)
  8 siblings, 3 replies; 24+ messages in thread
From: Ewan D. Milne @ 2023-11-07 21:23 UTC (permalink / raw)
  To: linux-nvme; +Cc: tsong

The atomic updates of ctrl->nr_active are unnecessary when using
numa or round-robin iopolicy, so avoid that cost on a per-request basis.
Clear nr_active when changing iopolicy and do not decrement below zero.
(This handles changing the iopolicy while requests are in flight.)

Signed-off-by: Ewan D. Milne <emilne@redhat.com>
---
 drivers/nvme/host/core.c      |  2 +-
 drivers/nvme/host/multipath.c | 21 ++++++++++++++++++---
 drivers/nvme/host/nvme.h      |  2 ++
 3 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 75a1b58a7a43..9bc19755be77 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -110,7 +110,7 @@ struct workqueue_struct *nvme_delete_wq;
 EXPORT_SYMBOL_GPL(nvme_delete_wq);
 
 static LIST_HEAD(nvme_subsystems);
-static DEFINE_MUTEX(nvme_subsystems_lock);
+DEFINE_MUTEX(nvme_subsystems_lock);
 
 static DEFINE_IDA(nvme_instance_ida);
 static dev_t nvme_ctrl_base_chr_devt;
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 4c2690cddef3..e184e7c377bc 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -133,7 +133,8 @@ void nvme_mpath_start_request(struct request *rq)
 	if (!blk_queue_io_stat(disk->queue) || blk_rq_is_passthrough(rq))
 		return;
 
-	atomic_inc(&ns->ctrl->nr_active);
+	if (READ_ONCE(ns->head->subsys->iopolicy) == NVME_IOPOLICY_QD)
+		atomic_inc(&ns->ctrl->nr_active);
 	nvme_req(rq)->flags |= NVME_MPATH_IO_STATS;
 	nvme_req(rq)->start_time = bdev_start_io_acct(disk->part0, req_op(rq),
 						      jiffies);
@@ -147,7 +148,8 @@ void nvme_mpath_end_request(struct request *rq)
 	if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
 		return;
 
-	atomic_dec(&ns->ctrl->nr_active);
+	if (READ_ONCE(ns->head->subsys->iopolicy) == NVME_IOPOLICY_QD)
+		atomic_dec_if_positive(&ns->ctrl->nr_active);
 	bdev_end_io_acct(ns->head->disk->part0, req_op(rq),
 			 blk_rq_bytes(rq) >> SECTOR_SHIFT,
 			 nvme_req(rq)->start_time);
@@ -848,6 +850,19 @@ static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
 			  nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
 }
 
+void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys, int iopolicy)
+{
+	struct nvme_ctrl *ctrl;
+
+	WRITE_ONCE(subsys->iopolicy, iopolicy);
+
+	mutex_lock(&nvme_subsystems_lock);
+	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
+		atomic_set(&ctrl->nr_active, 0);
+	}
+	mutex_unlock(&nvme_subsystems_lock);
+}
+
 static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t count)
 {
@@ -857,7 +872,7 @@ static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
 
 	for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
 		if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
-			WRITE_ONCE(subsys->iopolicy, i);
+			nvme_subsys_iopolicy_update(subsys, i);
 			return count;
 		}
 	}
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index f0f3fd8b4197..c4469bc38d89 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -49,6 +49,8 @@ extern struct workqueue_struct *nvme_wq;
 extern struct workqueue_struct *nvme_reset_wq;
 extern struct workqueue_struct *nvme_delete_wq;
 
+extern struct mutex nvme_subsystems_lock;
+
 /*
  * List of workarounds for devices that required behavior not specified in
  * the standard.
-- 
2.20.1

* [PATCH 3/3] nvme: multipath: Invalidate current_path when changing iopolicy
  2023-11-07 21:23 [PATCH 1/3] nvme: multipath: Implemented new iopolicy "queue-depth" Ewan D. Milne
  2023-11-07 21:23 ` [PATCH 2/3] nvme: multipath: only update ctrl->nr_active when using queue-depth iopolicy Ewan D. Milne
@ 2023-11-07 21:23 ` Ewan D. Milne
  2023-11-08  8:10   ` Christoph Hellwig
  2023-11-07 21:40 ` [PATCH 1/3] nvme: multipath: Implemented new iopolicy "queue-depth" Chaitanya Kulkarni
                   ` (6 subsequent siblings)
  8 siblings, 1 reply; 24+ messages in thread
From: Ewan D. Milne @ 2023-11-07 21:23 UTC (permalink / raw)
  To: linux-nvme; +Cc: tsong

When switching back to numa from round-robin, current_path may refer to
a different path than the one numa would have selected, and it is desirable
to have consistent behavior.

Signed-off-by: Ewan D. Milne <emilne@redhat.com>
---
 drivers/nvme/host/multipath.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index e184e7c377bc..273940b6ca86 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -859,6 +859,7 @@ void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys, int iopolicy)
 	mutex_lock(&nvme_subsystems_lock);
 	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
 		atomic_set(&ctrl->nr_active, 0);
+		nvme_mpath_clear_ctrl_paths(ctrl);
 	}
 	mutex_unlock(&nvme_subsystems_lock);
 }
-- 
2.20.1

* Re: [PATCH 1/3] nvme: multipath: Implemented new iopolicy "queue-depth"
  2023-11-07 21:23 [PATCH 1/3] nvme: multipath: Implemented new iopolicy "queue-depth" Ewan D. Milne
  2023-11-07 21:23 ` [PATCH 2/3] nvme: multipath: only update ctrl->nr_active when using queue-depth iopolicy Ewan D. Milne
  2023-11-07 21:23 ` [PATCH 3/3] nvme: multipath: Invalidate current_path when changing iopolicy Ewan D. Milne
@ 2023-11-07 21:40 ` Chaitanya Kulkarni
  2023-11-07 21:46 ` Chaitanya Kulkarni
                   ` (5 subsequent siblings)
  8 siblings, 0 replies; 24+ messages in thread
From: Chaitanya Kulkarni @ 2023-11-07 21:40 UTC (permalink / raw)
  To: Ewan D. Milne, linux-nvme; +Cc: tsong

> +static struct nvme_ns *nvme_queue_depth_path(struct nvme_ns_head *head)
> +{
> +	struct nvme_ns *best_opt = NULL, *best_nonopt = NULL, *ns;
> +	unsigned int min_depth_opt = UINT_MAX, min_depth_nonopt = UINT_MAX;
> +	unsigned int depth;
> +
> +	list_for_each_entry_rcu(ns, &head->list, siblings) {
> +		if (nvme_path_is_disabled(ns))
> +			continue;
> +
> +		depth = atomic_read(&ns->ctrl->nr_active);
> +
> +		switch (ns->ana_state) {
> +		case NVME_ANA_OPTIMIZED:
> +			if (depth < min_depth_opt) {
> +				min_depth_opt = depth;
> +				best_opt = ns;
> +			}
> +			break;
> +

nit:- extra blank line after the break above?

-ck

* Re: [PATCH 2/3] nvme: multipath: only update ctrl->nr_active when using queue-depth iopolicy
  2023-11-07 21:23 ` [PATCH 2/3] nvme: multipath: only update ctrl->nr_active when using queue-depth iopolicy Ewan D. Milne
@ 2023-11-07 21:42   ` Chaitanya Kulkarni
  2023-11-07 21:53   ` Keith Busch
  2023-11-10  1:18   ` Uday Shankar
  2 siblings, 0 replies; 24+ messages in thread
From: Chaitanya Kulkarni @ 2023-11-07 21:42 UTC (permalink / raw)
  To: Ewan D. Milne, linux-nvme; +Cc: tsong

> +void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys, int iopolicy)
> +{
> +	struct nvme_ctrl *ctrl;
> +
> +	WRITE_ONCE(subsys->iopolicy, iopolicy);
> +
> +	mutex_lock(&nvme_subsystems_lock);
> +	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
> +		atomic_set(&ctrl->nr_active, 0);
> +	}

nit:- do we need braces for the above loop?

-ck

* Re: [PATCH 1/3] nvme: multipath: Implemented new iopolicy "queue-depth"
  2023-11-07 21:23 [PATCH 1/3] nvme: multipath: Implemented new iopolicy "queue-depth" Ewan D. Milne
                   ` (2 preceding siblings ...)
  2023-11-07 21:40 ` [PATCH 1/3] nvme: multipath: Implemented new iopolicy "queue-depth" Chaitanya Kulkarni
@ 2023-11-07 21:46 ` Chaitanya Kulkarni
  2023-11-07 21:56   ` Ewan Milne
  2023-11-07 21:49 ` Keith Busch
                   ` (4 subsequent siblings)
  8 siblings, 1 reply; 24+ messages in thread
From: Chaitanya Kulkarni @ 2023-11-07 21:46 UTC (permalink / raw)
  To: Ewan D. Milne, tsong; +Cc: linux-nvme

On 11/7/23 13:23, Ewan D. Milne wrote:
> The existing iopolicies are inefficient in some cases, such as
> the presence of a path with high latency. The round-robin
> policy would use that path as often as the faster paths, which
> results in sub-optimal performance.

do you have performance numbers for such a case?

> The queue-depth policy instead sends I/O requests down the path
> with the fewest requests in its request queue. Paths
> with lower latency will clear requests more quickly and have fewer
> requests in their queues compared to "bad" paths. The aim is to
> use those paths the most to bring down overall latency.
>
> This implementation adds an atomic variable to the nvme_ctrl
> struct to represent the queue depth. It is updated each time a
> request specific to that controller starts or ends.
>
> [edm: patch developed by Thomas Song @ Pure Storage, fixed whitespace
>        and compilation warnings, updated MODULE_PARM description, and
>        fixed potential issue with ->current_path[] being used]
>
> Co-developed-by: Thomas Song <tsong@purestorage.com>
> Signed-off-by: Ewan D. Milne <emilne@redhat.com>
> ---
>   

any performance comparison that shows the difference?

-ck

* Re: [PATCH 1/3] nvme: multipath: Implemented new iopolicy "queue-depth"
  2023-11-07 21:23 [PATCH 1/3] nvme: multipath: Implemented new iopolicy "queue-depth" Ewan D. Milne
                   ` (3 preceding siblings ...)
  2023-11-07 21:46 ` Chaitanya Kulkarni
@ 2023-11-07 21:49 ` Keith Busch
  2023-11-07 22:01   ` Ewan Milne
  2024-05-09 20:29 ` [PATCH v2 0/3] nvme: queue-depth multipath iopolicy John Meneghini
                   ` (3 subsequent siblings)
  8 siblings, 1 reply; 24+ messages in thread
From: Keith Busch @ 2023-11-07 21:49 UTC (permalink / raw)
  To: Ewan D. Milne; +Cc: linux-nvme, tsong

On Tue, Nov 07, 2023 at 04:23:29PM -0500, Ewan D. Milne wrote:
>  void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
>  {
> @@ -130,6 +133,7 @@ void nvme_mpath_start_request(struct request *rq)
>  	if (!blk_queue_io_stat(disk->queue) || blk_rq_is_passthrough(rq))
>  		return;
>  
> +	atomic_inc(&ns->ctrl->nr_active);

Did you intend to skip counting nr_active if disk stats are not enabled,
or if it's a passthrough command?

Also, we should restrict the atomics to the new IO depth policy.

>  	nvme_req(rq)->flags |= NVME_MPATH_IO_STATS;
>  	nvme_req(rq)->start_time = bdev_start_io_acct(disk->part0, req_op(rq),
>  						      jiffies);
> @@ -142,6 +146,8 @@ void nvme_mpath_end_request(struct request *rq)
>  
>  	if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
>  		return;
> +
> +	atomic_dec(&ns->ctrl->nr_active);

And if disk stats were disabled, this dec isn't paired with an inc.

* Re: [PATCH 2/3] nvme: multipath: only update ctrl->nr_active when using queue-depth iopolicy
  2023-11-07 21:23 ` [PATCH 2/3] nvme: multipath: only update ctrl->nr_active when using queue-depth iopolicy Ewan D. Milne
  2023-11-07 21:42   ` Chaitanya Kulkarni
@ 2023-11-07 21:53   ` Keith Busch
  2023-11-07 22:03     ` Ewan Milne
  2023-11-08  8:09     ` Christoph Hellwig
  2023-11-10  1:18   ` Uday Shankar
  2 siblings, 2 replies; 24+ messages in thread
From: Keith Busch @ 2023-11-07 21:53 UTC (permalink / raw)
  To: Ewan D. Milne; +Cc: linux-nvme, tsong

On Tue, Nov 07, 2023 at 04:23:30PM -0500, Ewan D. Milne wrote:
> The atomic updates of ctrl->nr_active are unnecessary when using
> numa or round-robin iopolicy, so avoid that cost on a per-request basis.
> Clear nr_active when changing iopolicy and do not decrement below zero.
> (This handles changing the iopolicy while requests are in flight.)

Oh, here's restricting it to that policy. Any reason not to fold it in
the first one?

> -	atomic_inc(&ns->ctrl->nr_active);
> +	if (READ_ONCE(ns->head->subsys->iopolicy) == NVME_IOPOLICY_QD)
> +		atomic_inc(&ns->ctrl->nr_active);

This all looks racy with the ability to change policies in flight. I
think we need to introduce a new flag and do something like this:

	if (READ_ONCE(ns->head->subsys->iopolicy) == NVME_IOPOLICY_QD) {
		atomic_inc(&ns->ctrl->nr_active);
	  	nvme_req(rq)->flags |= NVME_MPATH_COUNT_ACTIVE;
	}

...

> @@ -147,7 +148,8 @@ void nvme_mpath_end_request(struct request *rq)
>  	if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
>  		return;
>  
> -	atomic_dec(&ns->ctrl->nr_active);
> +	if (READ_ONCE(ns->head->subsys->iopolicy) == NVME_IOPOLICY_QD)
> +		atomic_dec_if_positive(&ns->ctrl->nr_active);

And then the above becomes:

	if (nvme_req(rq)->flags & NVME_MPATH_COUNT_ACTIVE)
		atomic_dec_if_positive(&ns->ctrl->nr_active);

...

> +void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys, int iopolicy)
> +{
> +	struct nvme_ctrl *ctrl;
> +
> +	WRITE_ONCE(subsys->iopolicy, iopolicy);
> +
> +	mutex_lock(&nvme_subsystems_lock);
> +	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
> +		atomic_set(&ctrl->nr_active, 0);
> +	}
> +	mutex_unlock(&nvme_subsystems_lock);

And then you don't need to do the above since atomic_dec's are always
paired with atomic_inc's. What do you think?

* Re: [PATCH 1/3] nvme: multipath: Implemented new iopolicy "queue-depth"
  2023-11-07 21:46 ` Chaitanya Kulkarni
@ 2023-11-07 21:56   ` Ewan Milne
  2023-11-07 23:32     ` John Meneghini
  2023-11-08  4:14     ` Chaitanya Kulkarni
  0 siblings, 2 replies; 24+ messages in thread
From: Ewan Milne @ 2023-11-07 21:56 UTC (permalink / raw)
  To: Chaitanya Kulkarni; +Cc: tsong, linux-nvme

Yes, we have some graphs.  John M. presented them at ALPSS and there were
some earlier ones at LSF/MM.  I'll see if I can put up the latest set
for download.

The basic issue is that with round-robin, requests for most/all of the
tagset space can end up on a path that is responding slowly, so we see
a significant imbalance in path utilization.


-Ewan

On Tue, Nov 7, 2023 at 4:46 PM Chaitanya Kulkarni <chaitanyak@nvidia.com> wrote:
>
> On 11/7/23 13:23, Ewan D. Milne wrote:
> > The existing iopolicies are inefficient in some cases, such as
> > the presence of a path with high latency. The round-robin
> > policy would use that path as often as the faster paths, which
> > results in sub-optimal performance.
>
> do you have performance numbers for such a case?
>
> > The queue-depth policy instead sends I/O requests down the path
> > with the fewest requests in its request queue. Paths
> > with lower latency will clear requests more quickly and have fewer
> > requests in their queues compared to "bad" paths. The aim is to
> > use those paths the most to bring down overall latency.
> >
> > This implementation adds an atomic variable to the nvme_ctrl
> > struct to represent the queue depth. It is updated each time a
> > request specific to that controller starts or ends.
> >
> > [edm: patch developed by Thomas Song @ Pure Storage, fixed whitespace
> >        and compilation warnings, updated MODULE_PARM description, and
> >        fixed potential issue with ->current_path[] being used]
> >
> > Co-developed-by: Thomas Song <tsong@purestorage.com>
> > Signed-off-by: Ewan D. Milne <emilne@redhat.com>
> > ---
> >
>
> any performance comparison that shows the difference?
>
> -ck

* Re: [PATCH 1/3] nvme: multipath: Implemented new iopolicy "queue-depth"
  2023-11-07 21:49 ` Keith Busch
@ 2023-11-07 22:01   ` Ewan Milne
  2023-11-07 22:14     ` Keith Busch
  0 siblings, 1 reply; 24+ messages in thread
From: Ewan Milne @ 2023-11-07 22:01 UTC (permalink / raw)
  To: Keith Busch; +Cc: linux-nvme, tsong

The patch uses the existing flag NVME_MPATH_IO_STATS in the request
for simplicity to see if an atomic_inc() was performed before doing the _dec().
If we really need to keep the count for passthrough requests (maybe we do)
then we would need another flag.

The reason to use atomic_dec_if_positive() in the 2nd patch and to set
nr_active to zero when changing the iopolicy is so that the counter will
work properly if e.g. someone changes the iopolicy repeatedly while
requests are in flight.

(I have a test case and another patch to make the nr_active counter
visible in sysfs, btw, if that would be useful.)
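
For illustration, a minimal sketch of what such a read-only sysfs
attribute could look like, in the style of the existing controller
show functions (the attribute name and placement are assumptions,
not the actual patch):

	static ssize_t nr_active_show(struct device *dev,
				      struct device_attribute *attr, char *buf)
	{
		struct nvme_ctrl *ctrl = dev_get_drvdata(dev);

		/* report the number of requests currently in flight */
		return sysfs_emit(buf, "%d\n", atomic_read(&ctrl->nr_active));
	}
	static DEVICE_ATTR_RO(nr_active);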

-Ewan

On Tue, Nov 7, 2023 at 4:49 PM Keith Busch <kbusch@kernel.org> wrote:
>
> On Tue, Nov 07, 2023 at 04:23:29PM -0500, Ewan D. Milne wrote:
> >  void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
> >  {
> > @@ -130,6 +133,7 @@ void nvme_mpath_start_request(struct request *rq)
> >       if (!blk_queue_io_stat(disk->queue) || blk_rq_is_passthrough(rq))
> >               return;
> >
> > +     atomic_inc(&ns->ctrl->nr_active);
>
> Did you intend to skip counting nr_active if disk stats are not enabled,
> or if it's a passthrough command?
>
> Also, we should restrict the atomics to the new IO depth policy.
>
> >       nvme_req(rq)->flags |= NVME_MPATH_IO_STATS;
> >       nvme_req(rq)->start_time = bdev_start_io_acct(disk->part0, req_op(rq),
> >                                                     jiffies);
> > @@ -142,6 +146,8 @@ void nvme_mpath_end_request(struct request *rq)
> >
> >       if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
> >               return;
> > +
> > +     atomic_dec(&ns->ctrl->nr_active);
>
> And if disk stats were disabled, this dec isn't paired with an inc.

* Re: [PATCH 2/3] nvme: multipath: only update ctrl->nr_active when using queue-depth iopolicy
  2023-11-07 21:53   ` Keith Busch
@ 2023-11-07 22:03     ` Ewan Milne
  2023-11-08  8:09     ` Christoph Hellwig
  1 sibling, 0 replies; 24+ messages in thread
From: Ewan Milne @ 2023-11-07 22:03 UTC (permalink / raw)
  To: Keith Busch; +Cc: linux-nvme, tsong

I did 2/3 as a separate patch because Thomas wrote the first one
and I thought it was a little clearer to have the optimization
separated out, but I can combine them if we like.

-Ewan

On Tue, Nov 7, 2023 at 4:53 PM Keith Busch <kbusch@kernel.org> wrote:
>
> On Tue, Nov 07, 2023 at 04:23:30PM -0500, Ewan D. Milne wrote:
> > The atomic updates of ctrl->nr_active are unnecessary when using
> > numa or round-robin iopolicy, so avoid that cost on a per-request basis.
> > Clear nr_active when changing iopolicy and do not decrement below zero.
> > (This handles changing the iopolicy while requests are in flight.)
>
> Oh, here's restricting it to that policy. Any reason not to fold it in
> the first one?
>
> > -     atomic_inc(&ns->ctrl->nr_active);
> > +     if (READ_ONCE(ns->head->subsys->iopolicy) == NVME_IOPOLICY_QD)
> > +             atomic_inc(&ns->ctrl->nr_active);
>
> This all looks racy with the ability to change policies in flight. I
> think we need to introduce a new flag and do something like this:
>
>         if (READ_ONCE(ns->head->subsys->iopolicy) == NVME_IOPOLICY_QD) {
>                 atomic_inc(&ns->ctrl->nr_active);
>                 nvme_req(rq)->flags |= NVME_MPATH_COUNT_ACTIVE;
>         }
>
> ...
>
> > @@ -147,7 +148,8 @@ void nvme_mpath_end_request(struct request *rq)
> >       if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
> >               return;
> >
> > -     atomic_dec(&ns->ctrl->nr_active);
> > +     if (READ_ONCE(ns->head->subsys->iopolicy) == NVME_IOPOLICY_QD)
> > +             atomic_dec_if_positive(&ns->ctrl->nr_active);
>
> And then the above becomes:
>
>         if (nvme_req(rq)->flags & NVME_MPATH_COUNT_ACTIVE)
>                 atomic_dec_if_positive(&ns->ctrl->nr_active);
>
> ...
>
> > +void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys, int iopolicy)
> > +{
> > +     struct nvme_ctrl *ctrl;
> > +
> > +     WRITE_ONCE(subsys->iopolicy, iopolicy);
> > +
> > +     mutex_lock(&nvme_subsystems_lock);
> > +     list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
> > +             atomic_set(&ctrl->nr_active, 0);
> > +     }
> > +     mutex_unlock(&nvme_subsystems_lock);
>
> And then you don't need to do the above since atomic_dec's are always
> paired with atomic_inc's. What do you think?

* Re: [PATCH 1/3] nvme: multipath: Implemented new iopolicy "queue-depth"
  2023-11-07 22:01   ` Ewan Milne
@ 2023-11-07 22:14     ` Keith Busch
  0 siblings, 0 replies; 24+ messages in thread
From: Keith Busch @ 2023-11-07 22:14 UTC (permalink / raw)
  To: Ewan Milne; +Cc: linux-nvme, tsong

On Tue, Nov 07, 2023 at 05:01:22PM -0500, Ewan Milne wrote:
> The patch uses the existing flag NVME_MPATH_IO_STATS in the request
> for simplicity to see if an atomic_inc() was performed before doing the _dec().
> If we really need to keep the count for passthrough requests (maybe we do)
> then we would need another flag.

Oh, I see.

I didn't think you wanted to tie path selection to whether or not the
disk stats are enabled (and maybe you don't?), so that's why I thought
we needed another flag for this.

* Re: [PATCH 1/3] nvme: multipath: Implemented new iopolicy "queue-depth"
  2023-11-07 21:56   ` Ewan Milne
@ 2023-11-07 23:32     ` John Meneghini
  2023-11-08  4:14     ` Chaitanya Kulkarni
  1 sibling, 0 replies; 24+ messages in thread
From: John Meneghini @ 2023-11-07 23:32 UTC (permalink / raw)
  To: Ewan Milne, Chaitanya Kulkarni; +Cc: tsong, linux-nvme

Here are the slides with the performance numbers I presented at ALPSS.

https://people.redhat.com/jmeneghi/ALPSS_2023/NVMe_QD_Multipathing.pdf

This is the same information I posted in:

https://lists.infradead.org/pipermail/linux-nvme/2023-September/042371.html

/John

On 11/7/23 16:56, Ewan Milne wrote:
> Yes, we have some graphs.  John M. presented them at ALPSS and there were
> some earlier ones at LSF/MM.  I'll see if I can put up the latest set
> for download.
> 
> The basic issue is that with round-robin, requests for most/all of the
> tagset space can end up on a path that is responding slowly, so we see
> a significant imbalance in path utilization.
> 
> 
> -Ewan
> 
> On Tue, Nov 7, 2023 at 4:46 PM Chaitanya Kulkarni <chaitanyak@nvidia.com> wrote:
>>
>> On 11/7/23 13:23, Ewan D. Milne wrote:
>>> The existing iopolicies are inefficient in some cases, such as
>>> the presence of a path with high latency. The round-robin
>>> policy would use that path as often as the faster paths, which
>>> results in sub-optimal performance.
>>
>> do you have performance numbers for such a case?
>>
>>> The queue-depth policy instead sends I/O requests down the path
>>> with the fewest requests in its request queue. Paths
>>> with lower latency will clear requests more quickly and have fewer
>>> requests in their queues compared to "bad" paths. The aim is to
>>> use those paths the most to bring down overall latency.
>>>
>>> This implementation adds an atomic variable to the nvme_ctrl
>>> struct to represent the queue depth. It is updated each time a
>>> request specific to that controller starts or ends.
>>>
>>> [edm: patch developed by Thomas Song @ Pure Storage, fixed whitespace
>>>         and compilation warnings, updated MODULE_PARM description, and
>>>         fixed potential issue with ->current_path[] being used]
>>>
>>> Co-developed-by: Thomas Song <tsong@purestorage.com>
>>> Signed-off-by: Ewan D. Milne <emilne@redhat.com>
>>> ---
>>>
>>
>> any performance comparison that shows the difference?
>>
>> -ck

* Re: [PATCH 1/3] nvme: multipath: Implemented new iopolicy "queue-depth"
  2023-11-07 21:56   ` Ewan Milne
  2023-11-07 23:32     ` John Meneghini
@ 2023-11-08  4:14     ` Chaitanya Kulkarni
  1 sibling, 0 replies; 24+ messages in thread
From: Chaitanya Kulkarni @ 2023-11-08  4:14 UTC (permalink / raw)
  To: Ewan Milne; +Cc: tsong, linux-nvme

On 11/7/23 13:56, Ewan Milne wrote:
> Yes, we have some graphs.  John M. presented them at ALPSS and there were
> some earlier ones at LSF/MM.  I'll see if I can put up the latest set
> for download.
>
> The basic issue is that with round-robin, requests for most/all of the
> tagset space can end up on a path that is responding slowly, so we see
> a significant imbalance in path utilization.
>
>
> -Ewan

Sounds good, please add a cover letter on your next version and document
those numbers so everyone can see them; I'm pretty sure I'm not the only
one who's going to ask this question :) ...

Can you also please write a block test (cc me) for this under the nvme
category? This needs to be tested every time we send a pull request.

-ck

* Re: [PATCH 2/3] nvme: multipath: only update ctrl->nr_active when using queue-depth iopolicy
  2023-11-07 21:53   ` Keith Busch
  2023-11-07 22:03     ` Ewan Milne
@ 2023-11-08  8:09     ` Christoph Hellwig
  2023-11-08 16:58       ` John Meneghini
  2023-11-08 18:38       ` Ewan Milne
  1 sibling, 2 replies; 24+ messages in thread
From: Christoph Hellwig @ 2023-11-08  8:09 UTC (permalink / raw)
  To: Keith Busch; +Cc: Ewan D. Milne, linux-nvme, tsong

On Tue, Nov 07, 2023 at 02:53:48PM -0700, Keith Busch wrote:
> On Tue, Nov 07, 2023 at 04:23:30PM -0500, Ewan D. Milne wrote:
> > The atomic updates of ctrl->nr_active are unnecessary when using
> > numa or round-robin iopolicy, so avoid that cost on a per-request basis.
> > Clear nr_active when changing iopolicy and do not decrement below zero.
> > (This handles changing the iopolicy while requests are in flight.)
> 
> Oh, here's restricting it to that policy. Any reason not to fold it in
> the first one?

It should, and I agree with all the other comments.

But I'm also pretty deeply unhappy with the whole thing.  This is a
controller-wide atomic taken for every I/O.  How slow are the subsystems
people want to use it for?  And is a global max active really the
right measure, or would a per-cpu, or at least a batched per-cpu count
as used by the percpu counters, be a better option?
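
To make the suggestion concrete, a rough sketch of what a batched
per-cpu variant could look like (illustrative only; the helper names
are invented and this is not what the series implements):

	#include <linux/percpu_counter.h>

	/* nr_active would become a struct percpu_counter in nvme_ctrl */
	static int nvme_mpath_nr_active_init(struct nvme_ctrl *ctrl)
	{
		return percpu_counter_init(&ctrl->nr_active, 0, GFP_KERNEL);
	}

	static void nvme_mpath_nr_active_inc(struct nvme_ctrl *ctrl)
	{
		percpu_counter_inc(&ctrl->nr_active);	/* cheap, batched per-cpu */
	}

	static void nvme_mpath_nr_active_dec(struct nvme_ctrl *ctrl)
	{
		percpu_counter_dec(&ctrl->nr_active);
	}

	static s64 nvme_mpath_nr_active_read(struct nvme_ctrl *ctrl)
	{
		/* approximate; percpu_counter_sum() is exact but slower */
		return percpu_counter_read_positive(&ctrl->nr_active);
	}

The path selector would then compare approximate per-controller values,
trading some accuracy for cheaper per-I/O accounting.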



* Re: [PATCH 3/3] nvme: multipath: Invalidate current_path when changing iopolicy
  2023-11-07 21:23 ` [PATCH 3/3] nvme: multipath: Invalidate current_path when changing iopolicy Ewan D. Milne
@ 2023-11-08  8:10   ` Christoph Hellwig
  0 siblings, 0 replies; 24+ messages in thread
From: Christoph Hellwig @ 2023-11-08  8:10 UTC (permalink / raw)
  To: Ewan D. Milne; +Cc: linux-nvme, tsong

On Tue, Nov 07, 2023 at 04:23:31PM -0500, Ewan D. Milne wrote:
> When switching back to numa from round-robin, current_path may refer to
> a different path than the one numa would have selected, and it is desirable
> to have consistent behavior.

In general invalidating the current path when switching the I/O policy
seems like the right thing to do.  So independent of the rest of
the work we really should add this:

Reviewed-by: Christoph Hellwig <hch@lst.de>

* Re: [PATCH 2/3] nvme: multipath: only update ctrl->nr_active when using queue-depth iopolicy
  2023-11-08  8:09     ` Christoph Hellwig
@ 2023-11-08 16:58       ` John Meneghini
  2023-11-08 18:38       ` Ewan Milne
  1 sibling, 0 replies; 24+ messages in thread
From: John Meneghini @ 2023-11-08 16:58 UTC (permalink / raw)
  To: linux-nvme

On 11/8/23 03:09, Christoph Hellwig wrote:
> But I'm also pretty deeply unhappy with the whole thing. This is a controller-wide atomic taken for every I/O. How slow are the
> subsystems people want to use it for?

This would never be useful with PCIe subsystems, which all have latency/response times in the tens of microseconds range.

This is really only useful with Fibre Channel and TCP attached subsystems. We have several enterprise-class storage arrays here
in our lab at Red Hat - all with multiple 32Gb FC and 100Gbps TCP connections. Based on our own private performance analysis of
these devices, they have latency/response times in the tens of milliseconds range... at best... and the performance bottlenecks
are in the storage arrays, not in the fabrics. Even some of the RDMA attached storage arrays are slow. These devices have
trouble scaling up performance... so they scale out instead, adding more and more paths and controllers to different domains in
their subsystems.

I am not concerned about the addition of one atomic counter with these use cases. In the grand scheme of things, when using an
NVMe-oF attached storage array, the trade-offs are worth it.  Besides, if you've dived into the TCP socket layer you'll see
there are plenty of atomic counters in the data path already.  This is the main use case that needs QD scheduling - nvme/tcp -
where latency can build up in the connection as well as in the storage array.

/John

* Re: [PATCH 2/3] nvme: multipath: only update ctrl->nr_active when using queue-depth iopolicy
  2023-11-08  8:09     ` Christoph Hellwig
  2023-11-08 16:58       ` John Meneghini
@ 2023-11-08 18:38       ` Ewan Milne
  1 sibling, 0 replies; 24+ messages in thread
From: Ewan Milne @ 2023-11-08 18:38 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Keith Busch, linux-nvme, tsong

I did attempt an implementation with percpu counters but I did not
see enough of a benefit.  I also tried updating the counters for only
a fraction of the requests; what I observed is that it does not work
nearly as well as counting them all, unfortunately.

-Ewan

On Wed, Nov 8, 2023 at 3:10 AM Christoph Hellwig <hch@infradead.org> wrote:
>
> On Tue, Nov 07, 2023 at 02:53:48PM -0700, Keith Busch wrote:
> > On Tue, Nov 07, 2023 at 04:23:30PM -0500, Ewan D. Milne wrote:
> > > The atomic updates of ctrl->nr_active are unnecessary when using
> > > numa or round-robin iopolicy, so avoid that cost on a per-request basis.
> > > Clear nr_active when changing iopolicy and do not decrement below zero.
> > > (This handles changing the iopolicy while requests are in flight.)
> >
> > Oh, here's restricting it to that policy. Any reason not to fold it in
> > the first one?
>
> It should, and I agree with all the other comments.
>
> But I'm also pretty deeply unhappy with the whole thing.  This is a
> controller-wide atomic taken for every I/O.  How slow are the subsystems
> people want to use it for?  And is a global max active really the
> right measure, or would a per-cpu, or at least a batched per-cpu count
> as used by the percpu counters, be a better option?

* Re: [PATCH 2/3] nvme: multipath: only update ctrl->nr_active when using queue-depth iopolicy
  2023-11-07 21:23 ` [PATCH 2/3] nvme: multipath: only update ctrl->nr_active when using queue-depth iopolicy Ewan D. Milne
  2023-11-07 21:42   ` Chaitanya Kulkarni
  2023-11-07 21:53   ` Keith Busch
@ 2023-11-10  1:18   ` Uday Shankar
  2023-11-13 21:16     ` Ewan Milne
  2 siblings, 1 reply; 24+ messages in thread
From: Uday Shankar @ 2023-11-10  1:18 UTC (permalink / raw)
  To: Ewan D. Milne; +Cc: linux-nvme, tsong

On Tue, Nov 07, 2023 at 04:23:30PM -0500, Ewan D. Milne wrote:
> The atomic updates of ctrl->nr_active are unnecessary when using
> numa or round-robin iopolicy, so avoid that cost on a per-request basis.
> Clear nr_active when changing iopolicy and do not decrement below zero.
> (This handles changing the iopolicy while requests are in flight.)

Instead of trying to handle a changing iopolicy while requests are in
flight, can we quiesce I/O when we change the iopolicy? That should let
us simplify/speed up the logic in the I/O path a bit
(atomic_dec_if_positive seems to hide a cmpxchg loop on most
architectures, which can be slower than an atomic_dec).
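
For reference, the generic fallback in recent kernels is roughly the
following try_cmpxchg loop; the exact code varies by kernel version
and architecture:

	static inline int atomic_dec_if_positive(atomic_t *v)
	{
		int dec, c = atomic_read(v);

		do {
			dec = c - 1;
			if (unlikely(dec < 0))
				break;
			/* retry if another CPU changed v in the meantime */
		} while (!atomic_try_cmpxchg(v, &c, dec));

		return dec;
	}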

> 
> Signed-off-by: Ewan D. Milne <emilne@redhat.com>
> ---
>  drivers/nvme/host/core.c      |  2 +-
>  drivers/nvme/host/multipath.c | 21 ++++++++++++++++++---
>  drivers/nvme/host/nvme.h      |  2 ++
>  3 files changed, 21 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 75a1b58a7a43..9bc19755be77 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -110,7 +110,7 @@ struct workqueue_struct *nvme_delete_wq;
>  EXPORT_SYMBOL_GPL(nvme_delete_wq);
>  
>  static LIST_HEAD(nvme_subsystems);
> -static DEFINE_MUTEX(nvme_subsystems_lock);
> +DEFINE_MUTEX(nvme_subsystems_lock);
>  
>  static DEFINE_IDA(nvme_instance_ida);
>  static dev_t nvme_ctrl_base_chr_devt;
> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> index 4c2690cddef3..e184e7c377bc 100644
> --- a/drivers/nvme/host/multipath.c
> +++ b/drivers/nvme/host/multipath.c
> @@ -133,7 +133,8 @@ void nvme_mpath_start_request(struct request *rq)
>  	if (!blk_queue_io_stat(disk->queue) || blk_rq_is_passthrough(rq))
>  		return;
>  
> -	atomic_inc(&ns->ctrl->nr_active);
> +	if (READ_ONCE(ns->head->subsys->iopolicy) == NVME_IOPOLICY_QD)
> +		atomic_inc(&ns->ctrl->nr_active);
>  	nvme_req(rq)->flags |= NVME_MPATH_IO_STATS;
>  	nvme_req(rq)->start_time = bdev_start_io_acct(disk->part0, req_op(rq),
>  						      jiffies);
> @@ -147,7 +148,8 @@ void nvme_mpath_end_request(struct request *rq)
>  	if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
>  		return;
>  
> -	atomic_dec(&ns->ctrl->nr_active);
> +	if (READ_ONCE(ns->head->subsys->iopolicy) == NVME_IOPOLICY_QD)
> +		atomic_dec_if_positive(&ns->ctrl->nr_active);
>  	bdev_end_io_acct(ns->head->disk->part0, req_op(rq),
>  			 blk_rq_bytes(rq) >> SECTOR_SHIFT,
>  			 nvme_req(rq)->start_time);
> @@ -848,6 +850,19 @@ static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
>  			  nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
>  }
>  
> +void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys, int iopolicy)
> +{
> +	struct nvme_ctrl *ctrl;
> +
> +	WRITE_ONCE(subsys->iopolicy, iopolicy);
> +
> +	mutex_lock(&nvme_subsystems_lock);
> +	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
> +		atomic_set(&ctrl->nr_active, 0);
> +	}
> +	mutex_unlock(&nvme_subsystems_lock);
> +}
> +
>  static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
>  		struct device_attribute *attr, const char *buf, size_t count)
>  {
> @@ -857,7 +872,7 @@ static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
>  
>  	for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
>  		if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
> -			WRITE_ONCE(subsys->iopolicy, i);
> +			nvme_subsys_iopolicy_update(subsys, i);
>  			return count;
>  		}
>  	}
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index f0f3fd8b4197..c4469bc38d89 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -49,6 +49,8 @@ extern struct workqueue_struct *nvme_wq;
>  extern struct workqueue_struct *nvme_reset_wq;
>  extern struct workqueue_struct *nvme_delete_wq;
>  
> +extern struct mutex nvme_subsystems_lock;
> +
>  /*
>   * List of workarounds for devices that required behavior not specified in
>   * the standard.
> -- 
> 2.20.1

* Re: [PATCH 2/3] nvme: multipath: only update ctrl->nr_active when using queue-depth iopolicy
  2023-11-10  1:18   ` Uday Shankar
@ 2023-11-13 21:16     ` Ewan Milne
  0 siblings, 0 replies; 24+ messages in thread
From: Ewan Milne @ 2023-11-13 21:16 UTC (permalink / raw)
  To: Uday Shankar; +Cc: linux-nvme, tsong

I think quiescing all the queues associated with the nvme_ctrl would be
undesirable just to handle changing the iopolicy, which only needs to
affect I/O that has not yet gone through the path selection.  It would
be better not to make the sysfs write have to wait, as well (although
this could be done with a work item, it seems like overkill).

Probably there is not enough contention for the atomic_dec_if_positive()
to loop, but I take your point, I'll look at it.  The equipment I have
here is likely not fast enough to be sure.

-Ewan



On Thu, Nov 9, 2023 at 8:18 PM Uday Shankar <ushankar@purestorage.com> wrote:
>
> On Tue, Nov 07, 2023 at 04:23:30PM -0500, Ewan D. Milne wrote:
> > The atomic updates of ctrl->nr_active are unnecessary when using
> > numa or round-robin iopolicy, so avoid that cost on a per-request basis.
> > Clear nr_active when changing iopolicy and do not decrement below zero.
> > (This handles changing the iopolicy while requests are in flight.)
>
> Instead of trying to handle a changing iopolicy while requests are in
> flight, can we quiesce I/O when we change the iopolicy? That should let
> us simplify/speed up the logic in the I/O path a bit
> (atomic_dec_if_positive seems to hide a cmpxchg loop on most
> architectures, which can be slower than an atomic_dec).
>
> >
> > Signed-off-by: Ewan D. Milne <emilne@redhat.com>
> > ---
> >  drivers/nvme/host/core.c      |  2 +-
> >  drivers/nvme/host/multipath.c | 21 ++++++++++++++++++---
> >  drivers/nvme/host/nvme.h      |  2 ++
> >  3 files changed, 21 insertions(+), 4 deletions(-)
> >
> > diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> > index 75a1b58a7a43..9bc19755be77 100644
> > --- a/drivers/nvme/host/core.c
> > +++ b/drivers/nvme/host/core.c
> > @@ -110,7 +110,7 @@ struct workqueue_struct *nvme_delete_wq;
> >  EXPORT_SYMBOL_GPL(nvme_delete_wq);
> >
> >  static LIST_HEAD(nvme_subsystems);
> > -static DEFINE_MUTEX(nvme_subsystems_lock);
> > +DEFINE_MUTEX(nvme_subsystems_lock);
> >
> >  static DEFINE_IDA(nvme_instance_ida);
> >  static dev_t nvme_ctrl_base_chr_devt;
> > diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> > index 4c2690cddef3..e184e7c377bc 100644
> > --- a/drivers/nvme/host/multipath.c
> > +++ b/drivers/nvme/host/multipath.c
> > @@ -133,7 +133,8 @@ void nvme_mpath_start_request(struct request *rq)
> >       if (!blk_queue_io_stat(disk->queue) || blk_rq_is_passthrough(rq))
> >               return;
> >
> > -     atomic_inc(&ns->ctrl->nr_active);
> > +     if (READ_ONCE(ns->head->subsys->iopolicy) == NVME_IOPOLICY_QD)
> > +             atomic_inc(&ns->ctrl->nr_active);
> >       nvme_req(rq)->flags |= NVME_MPATH_IO_STATS;
> >       nvme_req(rq)->start_time = bdev_start_io_acct(disk->part0, req_op(rq),
> >                                                     jiffies);
> > @@ -147,7 +148,8 @@ void nvme_mpath_end_request(struct request *rq)
> >       if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
> >               return;
> >
> > -     atomic_dec(&ns->ctrl->nr_active);
> > +     if (READ_ONCE(ns->head->subsys->iopolicy) == NVME_IOPOLICY_QD)
> > +             atomic_dec_if_positive(&ns->ctrl->nr_active);
> >       bdev_end_io_acct(ns->head->disk->part0, req_op(rq),
> >                        blk_rq_bytes(rq) >> SECTOR_SHIFT,
> >                        nvme_req(rq)->start_time);
> > @@ -848,6 +850,19 @@ static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
> >                         nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
> >  }
> >
> > +void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys, int iopolicy)
> > +{
> > +     struct nvme_ctrl *ctrl;
> > +
> > +     WRITE_ONCE(subsys->iopolicy, iopolicy);
> > +
> > +     mutex_lock(&nvme_subsystems_lock);
> > +     list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
> > +             atomic_set(&ctrl->nr_active, 0);
> > +     }
> > +     mutex_unlock(&nvme_subsystems_lock);
> > +}
> > +
> >  static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
> >               struct device_attribute *attr, const char *buf, size_t count)
> >  {
> > @@ -857,7 +872,7 @@ static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
> >
> >       for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
> >               if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
> > -                     WRITE_ONCE(subsys->iopolicy, i);
> > +                     nvme_subsys_iopolicy_update(subsys, i);
> >                       return count;
> >               }
> >       }
> > diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> > index f0f3fd8b4197..c4469bc38d89 100644
> > --- a/drivers/nvme/host/nvme.h
> > +++ b/drivers/nvme/host/nvme.h
> > @@ -49,6 +49,8 @@ extern struct workqueue_struct *nvme_wq;
> >  extern struct workqueue_struct *nvme_reset_wq;
> >  extern struct workqueue_struct *nvme_delete_wq;
> >
> > +extern struct mutex nvme_subsystems_lock;
> > +
> >  /*
> >   * List of workarounds for devices that required behavior not specified in
> >   * the standard.
> > --
> > 2.20.1

* [PATCH v2 0/3] nvme: queue-depth multipath iopolicy
  2023-11-07 21:23 [PATCH 1/3] nvme: multipath: Implemented new iopolicy "queue-depth" Ewan D. Milne
                   ` (4 preceding siblings ...)
  2023-11-07 21:49 ` Keith Busch
@ 2024-05-09 20:29 ` John Meneghini
  2024-05-09 20:29 ` [PATCH v2 1/3] nvme: multipath: Implemented new iopolicy "queue-depth" John Meneghini
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 24+ messages in thread
From: John Meneghini @ 2024-05-09 20:29 UTC (permalink / raw)
  To: kbusch, hch, sagi, emilne
  Cc: linux-nvme, linux-kernel, jmeneghi, jrani, randyj, hare, constg,
	aviv.coro

I'm re-issuing Ewan's queue-depth patches in preparation for LSFMM.

These patches were first shown at ALPSS 2023, where I shared the following
graphs which measure the IO distribution across 4 active-optimized
controllers using the round-robin versus queue-depth iopolicy.

 https://people.redhat.com/jmeneghi/ALPSS_2023/NVMe_QD_Multipathing.pdf

Since that time we have continued testing these patches with a number of
different nvme-of storage arrays and test bed configurations, and I've codified
the tests and methods we use to measure IO distribution.

All of my test results, together with the scripts I used to generate these
graphs, are available at:

 https://github.com/johnmeneghini/iopolicy

Please use the scripts in this repository to do your own testing.

These patches are based on nvme-v6.9

Ewan D. Milne (3):
  nvme: multipath: Implemented new iopolicy "queue-depth"
  nvme: multipath: only update ctrl->nr_active when using queue-depth
    iopolicy
  nvme: multipath: Invalidate current_path when changing iopolicy

 drivers/nvme/host/core.c      |  2 +-
 drivers/nvme/host/multipath.c | 77 +++++++++++++++++++++++++++++++++--
 drivers/nvme/host/nvme.h      |  8 ++++
 3 files changed, 82 insertions(+), 5 deletions(-)

-- 
2.39.3

* [PATCH v2 1/3] nvme: multipath: Implemented new iopolicy "queue-depth"
  2023-11-07 21:23 [PATCH 1/3] nvme: multipath: Implemented new iopolicy "queue-depth" Ewan D. Milne
                   ` (5 preceding siblings ...)
  2024-05-09 20:29 ` [PATCH v2 0/3] nvme: queue-depth multipath iopolicy John Meneghini
@ 2024-05-09 20:29 ` John Meneghini
  2024-05-09 20:29 ` [PATCH v2 2/3] nvme: multipath: only update ctrl->nr_active when using queue-depth iopolicy John Meneghini
  2024-05-09 20:29 ` [PATCH v2 3/3] nvme: multipath: Invalidate current_path when changing iopolicy John Meneghini
  8 siblings, 0 replies; 24+ messages in thread
From: John Meneghini @ 2024-05-09 20:29 UTC (permalink / raw)
  To: kbusch, hch, sagi, emilne
  Cc: linux-nvme, linux-kernel, jmeneghi, jrani, randyj, hare, constg,
	aviv.coro

From: "Ewan D. Milne" <emilne@redhat.com>

The existing iopolicies are inefficient in some cases, such as
the presence of a path with high latency. The round-robin
policy would use that path as often as the faster paths, which
results in sub-optimal performance.

The queue-depth policy instead sends I/O requests down the path
with the fewest requests in its request queue. Paths
with lower latency will clear requests more quickly and have fewer
requests in their queues compared to "bad" paths. The aim is to
use those paths the most to bring down overall latency.

This implementation adds an atomic variable to the nvme_ctrl
struct to represent the queue depth. It is updated each time a
request specific to that controller starts or ends.

[edm: patch developed by Thomas Song @ Pure Storage, fixed whitespace
      and compilation warnings, updated MODULE_PARM description, and
      fixed potential issue with ->current_path[] being used]

Tested-by: John Meneghini <jmeneghi@redhat.com>
Co-developed-by: Thomas Song <tsong@purestorage.com>
Signed-off-by: Thomas Song <tsong@purestorage.com>
Signed-off-by: Ewan D. Milne <emilne@redhat.com>
---
 drivers/nvme/host/multipath.c | 59 +++++++++++++++++++++++++++++++++--
 drivers/nvme/host/nvme.h      |  2 ++
 2 files changed, 58 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index d16e976ae1a4..3603dd2c785f 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -17,6 +17,7 @@ MODULE_PARM_DESC(multipath,
 static const char *nvme_iopolicy_names[] = {
 	[NVME_IOPOLICY_NUMA]	= "numa",
 	[NVME_IOPOLICY_RR]	= "round-robin",
+	[NVME_IOPOLICY_QD]      = "queue-depth",
 };
 
 static int iopolicy = NVME_IOPOLICY_NUMA;
@@ -29,6 +30,8 @@ static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp)
 		iopolicy = NVME_IOPOLICY_NUMA;
 	else if (!strncmp(val, "round-robin", 11))
 		iopolicy = NVME_IOPOLICY_RR;
+	else if (!strncmp(val, "queue-depth", 11))
+		iopolicy = NVME_IOPOLICY_QD;
 	else
 		return -EINVAL;
 
@@ -43,7 +46,7 @@ static int nvme_get_iopolicy(char *buf, const struct kernel_param *kp)
 module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy,
 	&iopolicy, 0644);
 MODULE_PARM_DESC(iopolicy,
-	"Default multipath I/O policy; 'numa' (default) or 'round-robin'");
+	"Default multipath I/O policy; 'numa' (default), 'round-robin' or 'queue-depth'");
 
 void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
 {
@@ -130,6 +133,7 @@ void nvme_mpath_start_request(struct request *rq)
 	if (!blk_queue_io_stat(disk->queue) || blk_rq_is_passthrough(rq))
 		return;
 
+	atomic_inc(&ns->ctrl->nr_active);
 	nvme_req(rq)->flags |= NVME_MPATH_IO_STATS;
 	nvme_req(rq)->start_time = bdev_start_io_acct(disk->part0, req_op(rq),
 						      jiffies);
@@ -142,6 +146,8 @@ void nvme_mpath_end_request(struct request *rq)
 
 	if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
 		return;
+
+	atomic_dec(&ns->ctrl->nr_active);
 	bdev_end_io_acct(ns->head->disk->part0, req_op(rq),
 			 blk_rq_bytes(rq) >> SECTOR_SHIFT,
 			 nvme_req(rq)->start_time);
@@ -331,6 +337,40 @@ static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
 	return found;
 }
 
+static struct nvme_ns *nvme_queue_depth_path(struct nvme_ns_head *head)
+{
+	struct nvme_ns *best_opt = NULL, *best_nonopt = NULL, *ns;
+	unsigned int min_depth_opt = UINT_MAX, min_depth_nonopt = UINT_MAX;
+	unsigned int depth;
+
+	list_for_each_entry_rcu(ns, &head->list, siblings) {
+		if (nvme_path_is_disabled(ns))
+			continue;
+
+		depth = atomic_read(&ns->ctrl->nr_active);
+
+		switch (ns->ana_state) {
+		case NVME_ANA_OPTIMIZED:
+			if (depth < min_depth_opt) {
+				min_depth_opt = depth;
+				best_opt = ns;
+			}
+			break;
+
+		case NVME_ANA_NONOPTIMIZED:
+			if (depth < min_depth_nonopt) {
+				min_depth_nonopt = depth;
+				best_nonopt = ns;
+			}
+			break;
+		default:
+			break;
+		}
+	}
+
+	return best_opt ? best_opt : best_nonopt;
+}
+
 static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
 {
 	return nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE &&
@@ -339,15 +379,27 @@ static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
 
 inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
 {
-	int node = numa_node_id();
+	int iopolicy = READ_ONCE(head->subsys->iopolicy);
+	int node;
 	struct nvme_ns *ns;
 
+	/*
+	 * queue-depth iopolicy does not need to reference ->current_path
+	 * but round-robin needs the last path used to advance to the
+	 * next one, and numa will continue to use the last path unless
+	 * it is or has become not optimized
+	 */
+	if (iopolicy == NVME_IOPOLICY_QD)
+		return nvme_queue_depth_path(head);
+
+	node = numa_node_id();
 	ns = srcu_dereference(head->current_path[node], &head->srcu);
 	if (unlikely(!ns))
 		return __nvme_find_path(head, node);
 
-	if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
+	if (iopolicy == NVME_IOPOLICY_RR)
 		return nvme_round_robin_path(head, node, ns);
+
 	if (unlikely(!nvme_path_is_optimized(ns)))
 		return __nvme_find_path(head, node);
 	return ns;
@@ -906,6 +958,7 @@ void nvme_mpath_init_ctrl(struct nvme_ctrl *ctrl)
 	mutex_init(&ctrl->ana_lock);
 	timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
 	INIT_WORK(&ctrl->ana_work, nvme_ana_work);
+	atomic_set(&ctrl->nr_active, 0);
 }
 
 int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 05532c281177..e23e17ab94b9 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -359,6 +359,7 @@ struct nvme_ctrl {
 	size_t ana_log_size;
 	struct timer_list anatt_timer;
 	struct work_struct ana_work;
+	atomic_t nr_active;
 #endif
 
 #ifdef CONFIG_NVME_HOST_AUTH
@@ -407,6 +408,7 @@ static inline enum nvme_ctrl_state nvme_ctrl_state(struct nvme_ctrl *ctrl)
 enum nvme_iopolicy {
 	NVME_IOPOLICY_NUMA,
 	NVME_IOPOLICY_RR,
+	NVME_IOPOLICY_QD,
 };
 
 struct nvme_subsystem {
-- 
2.39.3

* [PATCH v2 2/3] nvme: multipath: only update ctrl->nr_active when using queue-depth iopolicy
  2023-11-07 21:23 [PATCH 1/3] nvme: multipath: Implemented new iopolicy "queue-depth" Ewan D. Milne
                   ` (6 preceding siblings ...)
  2024-05-09 20:29 ` [PATCH v2 1/3] nvme: multipath: Implemented new iopolicy "queue-depth" John Meneghini
@ 2024-05-09 20:29 ` John Meneghini
  2024-05-09 20:29 ` [PATCH v2 3/3] nvme: multipath: Invalidate current_path when changing iopolicy John Meneghini
  8 siblings, 0 replies; 24+ messages in thread
From: John Meneghini @ 2024-05-09 20:29 UTC (permalink / raw)
  To: kbusch, hch, sagi, emilne
  Cc: linux-nvme, linux-kernel, jmeneghi, jrani, randyj, hare, constg,
	aviv.coro

From: "Ewan D. Milne" <emilne@redhat.com>

The atomic updates of ctrl->nr_active are unnecessary when using
numa or round-robin iopolicy, so avoid that cost on a per-request basis.
Clear nr_active when changing iopolicy and do not decrement below zero.
(This handles changing the iopolicy while requests are in flight.)

Tested-by: John Meneghini <jmeneghi@redhat.com>
Signed-off-by: Ewan D. Milne <emilne@redhat.com>
---
 drivers/nvme/host/core.c      |  2 +-
 drivers/nvme/host/multipath.c | 21 ++++++++++++++++++---
 drivers/nvme/host/nvme.h      |  6 ++++++
 3 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 095f59e7aa93..ffc04d65c45e 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -110,7 +110,7 @@ struct workqueue_struct *nvme_delete_wq;
 EXPORT_SYMBOL_GPL(nvme_delete_wq);
 
 static LIST_HEAD(nvme_subsystems);
-static DEFINE_MUTEX(nvme_subsystems_lock);
+DEFINE_MUTEX(nvme_subsystems_lock);
 
 static DEFINE_IDA(nvme_instance_ida);
 static dev_t nvme_ctrl_base_chr_devt;
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 3603dd2c785f..02baadb45c82 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -133,7 +133,8 @@ void nvme_mpath_start_request(struct request *rq)
 	if (!blk_queue_io_stat(disk->queue) || blk_rq_is_passthrough(rq))
 		return;
 
-	atomic_inc(&ns->ctrl->nr_active);
+	if (READ_ONCE(ns->head->subsys->iopolicy) == NVME_IOPOLICY_QD)
+		atomic_inc(&ns->ctrl->nr_active);
 	nvme_req(rq)->flags |= NVME_MPATH_IO_STATS;
 	nvme_req(rq)->start_time = bdev_start_io_acct(disk->part0, req_op(rq),
 						      jiffies);
@@ -147,7 +148,8 @@ void nvme_mpath_end_request(struct request *rq)
 	if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
 		return;
 
-	atomic_dec(&ns->ctrl->nr_active);
+	if (READ_ONCE(ns->head->subsys->iopolicy) == NVME_IOPOLICY_QD)
+		atomic_dec_if_positive(&ns->ctrl->nr_active);
 	bdev_end_io_acct(ns->head->disk->part0, req_op(rq),
 			 blk_rq_bytes(rq) >> SECTOR_SHIFT,
 			 nvme_req(rq)->start_time);
@@ -851,6 +853,19 @@ static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
 			  nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
 }
 
+void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys, int iopolicy)
+{
+	struct nvme_ctrl *ctrl;
+
+	WRITE_ONCE(subsys->iopolicy, iopolicy);
+
+	mutex_lock(&nvme_subsystems_lock);
+	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
+		atomic_set(&ctrl->nr_active, 0);
+	}
+	mutex_unlock(&nvme_subsystems_lock);
+}
+
 static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t count)
 {
@@ -860,7 +875,7 @@ static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
 
 	for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
 		if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
-			WRITE_ONCE(subsys->iopolicy, i);
+			nvme_subsys_iopolicy_update(subsys, i);
 			return count;
 		}
 	}
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index e23e17ab94b9..a557b4577c01 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -50,6 +50,8 @@ extern struct workqueue_struct *nvme_wq;
 extern struct workqueue_struct *nvme_reset_wq;
 extern struct workqueue_struct *nvme_delete_wq;
 
+extern struct mutex nvme_subsystems_lock;
+
 /*
  * List of workarounds for devices that required behavior not specified in
  * the standard.
@@ -963,6 +965,7 @@ void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl);
 void nvme_mpath_shutdown_disk(struct nvme_ns_head *head);
 void nvme_mpath_start_request(struct request *rq);
 void nvme_mpath_end_request(struct request *rq);
+void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys, int iopolicy);
 
 static inline void nvme_trace_bio_complete(struct request *req)
 {
@@ -1062,6 +1065,9 @@ static inline bool nvme_disk_is_ns_head(struct gendisk *disk)
 {
 	return false;
 }
+static inline void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys, int iopolicy)
+{
+}
 #endif /* CONFIG_NVME_MULTIPATH */
 
 struct nvme_zone_info {
-- 
2.39.3


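A userspace sketch of the accounting change in this patch: the atomic
counter is only touched under the queue-depth policy, and the decrement
never takes the counter below zero. C11 has no direct counterpart to
the kernel's atomic_dec_if_positive(), so it is emulated here with a
compare-and-swap loop; the policy variable and request hooks are
hypothetical.

#include <stdatomic.h>
#include <stdbool.h>

enum iopolicy { IOPOLICY_NUMA, IOPOLICY_RR, IOPOLICY_QD };

static _Atomic enum iopolicy iopolicy = IOPOLICY_NUMA;
static atomic_int nr_active;

/* Decrement *v only if it is positive; returns false if it was zero. */
static bool dec_if_positive(atomic_int *v)
{
	int old = atomic_load(v);

	while (old > 0) {
		/* On failure, old is reloaded with the current value. */
		if (atomic_compare_exchange_weak(v, &old, old - 1))
			return true;
	}
	return false;
}

static void start_request(void)
{
	/* Pay the atomic cost only when queue-depth is in effect. */
	if (atomic_load(&iopolicy) == IOPOLICY_QD)
		atomic_fetch_add(&nr_active, 1);
}

static void end_request(void)
{
	/*
	 * The request may have started under a different policy (nr_active
	 * was not incremented, or was reset to 0 on a policy switch), so
	 * the counter can already be zero here; never go negative.
	 */
	if (atomic_load(&iopolicy) == IOPOLICY_QD)
		dec_if_positive(&nr_active);
}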

* [PATCH v2 3/3] nvme: multipath: Invalidate current_path when changing iopolicy
  2023-11-07 21:23 [PATCH 1/3] nvme: multipath: Implemented new iopolicy "queue-depth" Ewan D. Milne
                   ` (7 preceding siblings ...)
  2024-05-09 20:29 ` [PATCH v2 2/3] nvme: multipath: only update ctrl->nr_active when using queue-depth iopolicy John Meneghini
@ 2024-05-09 20:29 ` John Meneghini
  8 siblings, 0 replies; 24+ messages in thread
From: John Meneghini @ 2024-05-09 20:29 UTC (permalink / raw)
  To: kbusch, hch, sagi, emilne
  Cc: linux-nvme, linux-kernel, jmeneghi, jrani, randyj, hare, constg,
	aviv.coro

From: "Ewan D. Milne" <emilne@redhat.com>

When switching back to numa from round-robin, current_path may refer to
a different path than the one numa would have selected. Invalidate the
cached paths when the iopolicy changes so the behavior is consistent.

Tested-by: John Meneghini <jmeneghi@redhat.com>
Signed-off-by: Ewan D. Milne <emilne@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/multipath.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 02baadb45c82..d916a5ddf5d4 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -862,6 +862,7 @@ void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys, int iopolicy)
 	mutex_lock(&nvme_subsystems_lock);
 	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
 		atomic_set(&ctrl->nr_active, 0);
+		nvme_mpath_clear_ctrl_paths(ctrl);
 	}
 	mutex_unlock(&nvme_subsystems_lock);
 }
-- 
2.39.3


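The invalidation pattern in this one-line change, sketched in userspace
with hypothetical names: clearing the cached path forces the next
lookup to re-run path selection under the newly chosen policy.

#include <stdatomic.h>
#include <stddef.h>

struct path;
extern struct path *select_path_for_policy(void);	/* hypothetical */

static _Atomic(struct path *) current_path;	/* cached last selection */

static struct path *find_path(void)
{
	struct path *p = atomic_load(&current_path);

	if (!p) {	/* cache cleared or never set: reselect */
		p = select_path_for_policy();
		atomic_store(&current_path, p);
	}
	return p;
}

static void change_iopolicy(void)
{
	/* ... write the new policy, reset nr_active counters ... */
	atomic_store(&current_path, (struct path *)NULL);
}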

end of thread

Thread overview: 24+ messages
2023-11-07 21:23 [PATCH 1/3] nvme: multipath: Implemented new iopolicy "queue-depth" Ewan D. Milne
2023-11-07 21:23 ` [PATCH 2/3] nvme: multipath: only update ctrl->nr_active when using queue-depth iopolicy Ewan D. Milne
2023-11-07 21:42   ` Chaitanya Kulkarni
2023-11-07 21:53   ` Keith Busch
2023-11-07 22:03     ` Ewan Milne
2023-11-08  8:09     ` Christoph Hellwig
2023-11-08 16:58       ` John Meneghini
2023-11-08 18:38       ` Ewan Milne
2023-11-10  1:18   ` Uday Shankar
2023-11-13 21:16     ` Ewan Milne
2023-11-07 21:23 ` [PATCH 3/3] nvme: multipath: Invalidate current_path when changing iopolicy Ewan D. Milne
2023-11-08  8:10   ` Christoph Hellwig
2023-11-07 21:40 ` [PATCH 1/3] nvme: multipath: Implemented new iopolicy "queue-depth" Chaitanya Kulkarni
2023-11-07 21:46 ` Chaitanya Kulkarni
2023-11-07 21:56   ` Ewan Milne
2023-11-07 23:32     ` John Meneghini
2023-11-08  4:14     ` Chaitanya Kulkarni
2023-11-07 21:49 ` Keith Busch
2023-11-07 22:01   ` Ewan Milne
2023-11-07 22:14     ` Keith Busch
2024-05-09 20:29 ` [PATCH v2 0/3] nvme: queue-depth multipath iopolicy John Meneghini
2024-05-09 20:29 ` [PATCH v2 1/3] nvme: multipath: Implemented new iopolicy "queue-depth" John Meneghini
2024-05-09 20:29 ` [PATCH v2 2/3] nvme: multipath: only update ctrl->nr_active when using queue-depth iopolicy John Meneghini
2024-05-09 20:29 ` [PATCH v2 3/3] nvme: multipath: Invalidate current_path when changing iopolicy John Meneghini
