* [PATCHv2] nvme-multipath: round-robin I/O policy
@ 2018-12-21 14:13 Hannes Reinecke
  2019-01-03 20:24 ` Ewan D. Milne
  2019-01-04 14:06 ` Martin Wilck
  0 siblings, 2 replies; 7+ messages in thread
From: Hannes Reinecke @ 2018-12-21 14:13 UTC


Implement a simple round-robin I/O policy for multipathing.
Path selection is done in two rounds: first iterate across all
optimized paths and, if that doesn't return a valid path, then
iterate across all optimized and non-optimized paths.
If no path is found, fall back to the existing algorithm.
This patch also implements a sysfs attribute 'iopolicy' to switch
between the current, NUMA-aware I/O policy and the 'round-robin'
I/O policy.
The original NUMA-aware I/O policy remains the default.

Signed-off-by: Hannes Reinecke <hare@suse.com>
---
 drivers/nvme/host/core.c      |   6 +++
 drivers/nvme/host/multipath.c | 100 +++++++++++++++++++++++++++++++++++++++++-
 drivers/nvme/host/nvme.h      |  12 +++++
 3 files changed, 117 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 08f2c92602f4..7603aaa8217e 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2275,6 +2275,9 @@ static struct attribute *nvme_subsys_attrs[] = {
 	&subsys_attr_serial.attr,
 	&subsys_attr_firmware_rev.attr,
 	&subsys_attr_subsysnqn.attr,
+#ifdef CONFIG_NVME_MULTIPATH
+	&subsys_attr_iopolicy.attr,
+#endif
 	NULL,
 };
 
@@ -2327,6 +2330,9 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
 	memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev));
 	subsys->vendor_id = le16_to_cpu(id->vid);
 	subsys->cmic = id->cmic;
+#ifdef CONFIG_NVME_MULTIPATH
+	subsys->iopolicy = NVME_IOPOLICY_NUMA;
+#endif
 
 	subsys->dev.class = nvme_subsys_class;
 	subsys->dev.release = nvme_release_subsystem;
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 183ec17ba067..69cccdaea62e 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -141,7 +141,10 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
 		    test_bit(NVME_NS_ANA_PENDING, &ns->flags))
 			continue;
 
-		distance = node_distance(node, ns->ctrl->numa_node);
+		if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
+			distance = node_distance(node, ns->ctrl->numa_node);
+		else
+			distance = LOCAL_DISTANCE;
 
 		switch (ns->ana_state) {
 		case NVME_ANA_OPTIMIZED:
@@ -168,6 +171,54 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
 	return found;
 }
 
+static struct nvme_ns *__nvme_rr_next_path(struct nvme_ns_head *head, int node,
+					   struct nvme_ns *old)
+{
+	struct nvme_ns *ns, *found = NULL;
+	bool try_nonoptimized = false;
+
+	if (!old)
+		return NULL;
+retry:
+	ns = old;
+	do {
+		ns = list_next_or_null_rcu(&head->list, &ns->siblings,
+					   struct nvme_ns, siblings);
+		if (!ns) {
+			ns = list_first_or_null_rcu(&head->list, struct nvme_ns,
+						    siblings);
+			if (!ns)
+				return NULL;
+
+			if (ns == old)
+				/*
+				 * The list consists of just one entry.
+				 * Sorry for the noise :-)
+				 */
+				return old;
+		}
+		if (ns->disk && ns->ctrl->state == NVME_CTRL_LIVE) {
+			if (ns->ana_state == NVME_ANA_OPTIMIZED) {
+				found = ns;
+				break;
+			}
+			if (try_nonoptimized &&
+			    ns->ana_state == NVME_ANA_NONOPTIMIZED) {
+				found = ns;
+				break;
+			}
+		}
+	} while (ns != old);
+
+	if (found)
+		rcu_assign_pointer(head->current_path[node], found);
+	else if (!try_nonoptimized) {
+		try_nonoptimized = true;
+		goto retry;
+	}
+	return found;
+}
+
 static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
 {
 	return ns->ctrl->state == NVME_CTRL_LIVE &&
@@ -180,6 +231,8 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
 	struct nvme_ns *ns;
 
 	ns = srcu_dereference(head->current_path[node], &head->srcu);
+	if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
+		ns = __nvme_rr_next_path(head, node, ns);
 	if (unlikely(!ns || !nvme_path_is_optimized(ns)))
 		ns = __nvme_find_path(head, node);
 	return ns;
@@ -471,6 +524,51 @@ void nvme_mpath_stop(struct nvme_ctrl *ctrl)
 	cancel_work_sync(&ctrl->ana_work);
 }
 
+#define SUBSYS_ATTR_RW(_name, _mode, _show, _store)  \
+	struct device_attribute subsys_attr_##_name =	\
+		__ATTR(_name, _mode, _show, _store)
+
+static const char *nvme_iopolicy_names[] = {
+	[NVME_IOPOLICY_UNKNOWN] = "unknown",
+	[NVME_IOPOLICY_NUMA] = "numa",
+	[NVME_IOPOLICY_RR] = "round-robin",
+};
+
+static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct nvme_subsystem *subsys =
+		container_of(dev, struct nvme_subsystem, dev);
+	int iopolicy = NVME_IOPOLICY_UNKNOWN;
+
+	if (iopolicy < ARRAY_SIZE(nvme_iopolicy_names))
+		iopolicy = READ_ONCE(subsys->iopolicy);
+	return sprintf(buf, "%s\n", nvme_iopolicy_names[iopolicy]);
+}
+
+static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	enum nvme_iopolicy iopolicy = NVME_IOPOLICY_UNKNOWN;
+	struct nvme_subsystem *subsys =
+		container_of(dev, struct nvme_subsystem, dev);
+
+	if (!strncmp(buf, nvme_iopolicy_names[NVME_IOPOLICY_NUMA],
+		     strlen(nvme_iopolicy_names[NVME_IOPOLICY_NUMA])))
+		iopolicy = NVME_IOPOLICY_NUMA;
+	else if (!strncmp(buf, nvme_iopolicy_names[NVME_IOPOLICY_RR],
+		     strlen(nvme_iopolicy_names[NVME_IOPOLICY_RR])))
+		iopolicy = NVME_IOPOLICY_RR;
+
+	if (iopolicy == NVME_IOPOLICY_UNKNOWN)
+		return -EINVAL;
+
+	WRITE_ONCE(subsys->iopolicy, iopolicy);
+	return count;
+}
+SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
+		      nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);
+
 static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
 		char *buf)
 {
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 2b36ac922596..e24b51a608de 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -246,6 +246,14 @@ struct nvme_ctrl {
 	unsigned long discard_page_busy;
 };
 
+#ifdef CONFIG_NVME_MULTIPATH
+enum nvme_iopolicy {
+	NVME_IOPOLICY_UNKNOWN,
+	NVME_IOPOLICY_NUMA,
+	NVME_IOPOLICY_RR,
+};
+#endif
+
 struct nvme_subsystem {
 	int			instance;
 	struct device		dev;
@@ -265,6 +273,9 @@ struct nvme_subsystem {
 	u8			cmic;
 	u16			vendor_id;
 	struct ida		ns_ida;
+#ifdef CONFIG_NVME_MULTIPATH
+	enum nvme_iopolicy	iopolicy;
+#endif
 };
 
 /*
@@ -486,6 +497,7 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
 
 extern struct device_attribute dev_attr_ana_grpid;
 extern struct device_attribute dev_attr_ana_state;
+extern struct device_attribute subsys_attr_iopolicy;
 
 #else
 static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
-- 
2.16.4

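As a side note, the two-round selection described in the commit message
can be modelled in plain userspace C.  The sketch below is illustrative
only: an array stands in for the RCU-protected sibling list, a 'live'
flag for the NVME_CTRL_LIVE check, and all names are invented rather
than taken from the kernel.

#include <stdio.h>

enum ana_state { OPTIMIZED, NONOPTIMIZED, INACCESSIBLE };

struct path {
	enum ana_state	ana_state;
	int		live;	/* stands in for the controller state check */
};

/*
 * Round-robin over 'paths', starting at the entry after 'old' and
 * wrapping around so that 'old' itself is considered last.  The first
 * round accepts only optimized paths; the second round also accepts
 * non-optimized ones.  Returns -1 if no path is usable.
 */
static int rr_next_path(const struct path *paths, int npaths, int old)
{
	int round, i;

	for (round = 0; round < 2; round++) {
		for (i = 1; i <= npaths; i++) {
			int idx = (old + i) % npaths;

			if (!paths[idx].live)
				continue;
			if (paths[idx].ana_state == OPTIMIZED)
				return idx;
			if (round == 1 &&
			    paths[idx].ana_state == NONOPTIMIZED)
				return idx;
		}
	}
	return -1;
}

int main(void)
{
	const struct path paths[] = {
		{ OPTIMIZED,	1 },	/* path 0: optimized, controller live */
		{ NONOPTIMIZED,	1 },	/* path 1: non-optimized, live */
		{ OPTIMIZED,	0 },	/* path 2: optimized, controller down */
	};

	/* Starting from path 0: path 1 is skipped in round one, path 2
	 * is not live, so the search wraps back to path 0. */
	printf("selected path %d\n", rr_next_path(paths, 3, 0));
	return 0;
}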

* [PATCHv2] nvme-multipath: round-robin I/O policy
  2018-12-21 14:13 [PATCHv2] nvme-multipath: round-robin I/O policy Hannes Reinecke
@ 2019-01-03 20:24 ` Ewan D. Milne
  2019-01-08 12:01   ` Hannes Reinecke
  2019-01-04 14:06 ` Martin Wilck
  1 sibling, 1 reply; 7+ messages in thread
From: Ewan D. Milne @ 2019-01-03 20:24 UTC


On Fri, 2018-12-21 at 15:13 +0100, Hannes Reinecke wrote:
> Implement a simple round-robin I/O policy for multipathing.
> Path selection is done in two rounds: first iterate across all
> optimized paths and, if that doesn't return a valid path, then
> iterate across all optimized and non-optimized paths.
> If no path is found, fall back to the existing algorithm.
> This patch also implements a sysfs attribute 'iopolicy' to switch
> between the current, NUMA-aware I/O policy and the 'round-robin'
> I/O policy.
> The original NUMA-aware I/O policy remains the default.
> 
> Signed-off-by: Hannes Reinecke <hare@suse.com>

This works fine for me, and resolves the hang I saw in the earlier
version when all the paths were taken down.  I have one comment about
the state checking in this version; see below...

> ---
>  drivers/nvme/host/core.c      |   6 +++
>  drivers/nvme/host/multipath.c | 100 +++++++++++++++++++++++++++++++++++++++++-
>  drivers/nvme/host/nvme.h      |  12 +++++
>  3 files changed, 117 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 08f2c92602f4..7603aaa8217e 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -2275,6 +2275,9 @@ static struct attribute *nvme_subsys_attrs[] = {
>  	&subsys_attr_serial.attr,
>  	&subsys_attr_firmware_rev.attr,
>  	&subsys_attr_subsysnqn.attr,
> +#ifdef CONFIG_NVME_MULTIPATH
> +	&subsys_attr_iopolicy.attr,
> +#endif
>  	NULL,
>  };
>  
> @@ -2327,6 +2330,9 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
>  	memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev));
>  	subsys->vendor_id = le16_to_cpu(id->vid);
>  	subsys->cmic = id->cmic;
> +#ifdef CONFIG_NVME_MULTIPATH
> +	subsys->iopolicy = NVME_IOPOLICY_NUMA;
> +#endif
>  
>  	subsys->dev.class = nvme_subsys_class;
>  	subsys->dev.release = nvme_release_subsystem;
> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> index 183ec17ba067..69cccdaea62e 100644
> --- a/drivers/nvme/host/multipath.c
> +++ b/drivers/nvme/host/multipath.c
> @@ -141,7 +141,10 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
>  		    test_bit(NVME_NS_ANA_PENDING, &ns->flags))
>  			continue;
>  
> -		distance = node_distance(node, ns->ctrl->numa_node);
> +		if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
> +			distance = node_distance(node, ns->ctrl->numa_node);
> +		else
> +			distance = LOCAL_DISTANCE;
>  
>  		switch (ns->ana_state) {
>  		case NVME_ANA_OPTIMIZED:
> @@ -168,6 +171,54 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
>  	return found;
>  }
>  
> +static struct nvme_ns *__nvme_rr_next_path(struct nvme_ns_head *head, int node,
> +					   struct nvme_ns *old)
> +{
> +	struct nvme_ns *ns, *found = NULL;
> +	bool try_nonoptimized = false;
> +
> +	if (!old)
> +		return NULL;
> +retry:
> +	ns = old;
> +	do {
> +		ns = list_next_or_null_rcu(&head->list, &ns->siblings,
> +					   struct nvme_ns, siblings);
> +		if (!ns) {
> +			ns = list_first_or_null_rcu(&head->list, struct nvme_ns,
> +						    siblings);
> +			if (!ns)
> +				return NULL;
> +
> +			if (ns == old)
> +				/*
> +				 * The list consists of just one entry.
> +				 * Sorry for the noise :-)
> +				 */
> +				return old;
> +		}
> +		if (ns->disk && ns->ctrl->state == NVME_CTRL_LIVE) {

__nvme_find_path() also checks test_bit(NVME_NS_ANA_PENDING, &ns->flags)
in addition to the NVME_CTRL_LIVE state check, maybe this would be needed
here as well?  (i.e. don't select a path that has just received an ANA error
and has the ana_work queued to get the updated ANA state, and queue if
necessary until a usable up-to-date path is found?)

-Ewan

> +			if (ns->ana_state == NVME_ANA_OPTIMIZED) {
> +				found = ns;
> +				break;
> +			}
> +			if (try_nonoptimized &&
> +			    ns->ana_state == NVME_ANA_NONOPTIMIZED) {
> +				found = ns;
> +				break;
> +			}
> +		}
> +	} while (ns != old);
> +
> +	if (found)
> +		rcu_assign_pointer(head->current_path[node], found);
> +	else if (!try_nonoptimized) {
> +		try_nonoptimized = true;
> +		goto retry;
> +	}
> +	return found;
> +}
> +
>  static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
>  {
>  	return ns->ctrl->state == NVME_CTRL_LIVE &&
> @@ -180,6 +231,8 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
>  	struct nvme_ns *ns;
>  
>  	ns = srcu_dereference(head->current_path[node], &head->srcu);
> +	if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
> +		ns = __nvme_rr_next_path(head, node, ns);
>  	if (unlikely(!ns || !nvme_path_is_optimized(ns)))
>  		ns = __nvme_find_path(head, node);
>  	return ns;
> @@ -471,6 +524,51 @@ void nvme_mpath_stop(struct nvme_ctrl *ctrl)
>  	cancel_work_sync(&ctrl->ana_work);
>  }
>  
> +#define SUBSYS_ATTR_RW(_name, _mode, _show, _store)  \
> +	struct device_attribute subsys_attr_##_name =	\
> +		__ATTR(_name, _mode, _show, _store)
> +
> +static const char *nvme_iopolicy_names[] = {
> +	[NVME_IOPOLICY_UNKNOWN] = "unknown",
> +	[NVME_IOPOLICY_NUMA] = "numa",
> +	[NVME_IOPOLICY_RR] = "round-robin",
> +};
> +
> +static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
> +		struct device_attribute *attr, char *buf)
> +{
> +	struct nvme_subsystem *subsys =
> +		container_of(dev, struct nvme_subsystem, dev);
> +	int iopolicy = NVME_IOPOLICY_UNKNOWN;
> +
> +	if (iopolicy < ARRAY_SIZE(nvme_iopolicy_names))
> +		iopolicy = READ_ONCE(subsys->iopolicy);
> +	return sprintf(buf, "%s\n", nvme_iopolicy_names[iopolicy]);
> +}
> +
> +static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
> +		struct device_attribute *attr, const char *buf, size_t count)
> +{
> +	enum nvme_iopolicy iopolicy = NVME_IOPOLICY_UNKNOWN;
> +	struct nvme_subsystem *subsys =
> +		container_of(dev, struct nvme_subsystem, dev);
> +
> +	if (!strncmp(buf, nvme_iopolicy_names[NVME_IOPOLICY_NUMA],
> +		     strlen(nvme_iopolicy_names[NVME_IOPOLICY_NUMA])))
> +		iopolicy = NVME_IOPOLICY_NUMA;
> +	else if (!strncmp(buf, nvme_iopolicy_names[NVME_IOPOLICY_RR],
> +		     strlen(nvme_iopolicy_names[NVME_IOPOLICY_RR])))
> +		iopolicy = NVME_IOPOLICY_RR;
> +
> +	if (iopolicy == NVME_IOPOLICY_UNKNOWN)
> +		return -EINVAL;
> +
> +	WRITE_ONCE(subsys->iopolicy, iopolicy);
> +	return count;
> +}
> +SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
> +		      nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);
> +
>  static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
>  		char *buf)
>  {
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index 2b36ac922596..e24b51a608de 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -246,6 +246,14 @@ struct nvme_ctrl {
>  	unsigned long discard_page_busy;
>  };
>  
> +#ifdef CONFIG_NVME_MULTIPATH
> +enum nvme_iopolicy {
> +	NVME_IOPOLICY_UNKNOWN,
> +	NVME_IOPOLICY_NUMA,
> +	NVME_IOPOLICY_RR,
> +};
> +#endif
> +
>  struct nvme_subsystem {
>  	int			instance;
>  	struct device		dev;
> @@ -265,6 +273,9 @@ struct nvme_subsystem {
>  	u8			cmic;
>  	u16			vendor_id;
>  	struct ida		ns_ida;
> +#ifdef CONFIG_NVME_MULTIPATH
> +	enum nvme_iopolicy	iopolicy;
> +#endif
>  };
>  
>  /*
> @@ -486,6 +497,7 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
>  
>  extern struct device_attribute dev_attr_ana_grpid;
>  extern struct device_attribute dev_attr_ana_state;
> +extern struct device_attribute subsys_attr_iopolicy;
>  
>  #else
>  static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)

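Folding Ewan's suggestion into __nvme_rr_next_path() would presumably
amount to extending the existing condition; an untested sketch of just
that check, with the surrounding loop unchanged:

		/*
		 * Per Ewan's review: also skip paths whose ANA state is
		 * being refreshed, mirroring the check in __nvme_find_path().
		 */
		if (ns->disk && ns->ctrl->state == NVME_CTRL_LIVE &&
		    !test_bit(NVME_NS_ANA_PENDING, &ns->flags)) {
			...
		}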

* [PATCHv2] nvme-multipath: round-robin I/O policy
  2018-12-21 14:13 [PATCHv2] nvme-multipath: round-robin I/O policy Hannes Reinecke
  2019-01-03 20:24 ` Ewan D. Milne
@ 2019-01-04 14:06 ` Martin Wilck
  2019-01-04 14:24   ` Hannes Reinecke
  1 sibling, 1 reply; 7+ messages in thread
From: Martin Wilck @ 2019-01-04 14:06 UTC


On Fri, 2018-12-21 at 15:13 +0100, Hannes Reinecke wrote:
> Implement a simple round-robin I/O policy for multipathing.
> Path selection is done in two rounds: first iterate across all
> optimized paths and, if that doesn't return a valid path, then
> iterate across all optimized and non-optimized paths.
> If no path is found, fall back to the existing algorithm.
> This patch also implements a sysfs attribute 'iopolicy' to switch
> between the current, NUMA-aware I/O policy and the 'round-robin'
> I/O policy.
> The original NUMA-aware I/O policy remains the default.
> 
> Signed-off-by: Hannes Reinecke <hare@suse.com>
> ---
>  drivers/nvme/host/core.c      |   6 +++
>  drivers/nvme/host/multipath.c | 100 +++++++++++++++++++++++++++++++++++++++++-
>  drivers/nvme/host/nvme.h      |  12 +++++
>  3 files changed, 117 insertions(+), 1 deletion(-)
>  
> +static struct nvme_ns *__nvme_rr_next_path(struct nvme_ns_head *head, int node,
> +					   struct nvme_ns *old)
> +{
> +	struct nvme_ns *ns, *found = NULL;
> +	bool try_nonoptimized = false;
> +
> +	if (!old)
> +		return NULL;
> +retry:
> +	ns = old;
> +	do {
> +		ns = list_next_or_null_rcu(&head->list, &ns->siblings,
> +					   struct nvme_ns, siblings);
> +		if (!ns) {
> +			ns = list_first_or_null_rcu(&head->list, struct nvme_ns,
> +						    siblings);
> +			if (!ns)
> +				return NULL;
> +
> +			if (ns == old)
> +				/*
> +				 * The list consists of just one entry.
> +				 * Sorry for the noise :-)
> +				 */
> +				return old;
> +		}
> +		if (ns->disk && ns->ctrl->state == NVME_CTRL_LIVE) {
> +			if (ns->ana_state == NVME_ANA_OPTIMIZED) {
> +				found = ns;
> +				break;
> +			}
> +			if (try_nonoptimized &&
> +			    ns->ana_state == NVME_ANA_NONOPTIMIZED) {
> +				found = ns;
> +				break;
> +			}
> +		}
> +	} while (ns != old);
> +
> +	if (found)
> +		rcu_assign_pointer(head->current_path[node], found);
> +	else if (!try_nonoptimized) {
> +		try_nonoptimized = true;
> +		goto retry;
> +	}
> +	return found;
> +}

Would it make sense to skip the rcu_assign_pointer call if ns == old?

Also, if no OPTIMIZED paths are found in the first iteration, I'd find
it cleaner to restrict the search to NONOPTIMIZED paths in the second
round (rather than looking at both OPTIMIZED and NONOPTIMIZED), similar
to a pathgroup switch in traditional multipathing. The behavior would
then come down to a PG switch with "immediate" fallback in multipath-
tools terms.

> +
>  static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
>  {
>  	return ns->ctrl->state == NVME_CTRL_LIVE &&
> @@ -180,6 +231,8 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
>  	struct nvme_ns *ns;
>  
>  	ns = srcu_dereference(head->current_path[node], &head->srcu);
> +	if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
> +		ns = __nvme_rr_next_path(head, node, ns);
>  	if (unlikely(!ns || !nvme_path_is_optimized(ns)))
>  		ns = __nvme_find_path(head, node);
>  	return ns;

I don't understand why you call __nvme_find_path() here. If the return
value of __nvme_rr_next_path() is NULL, you already know that there's
no usable path; and if it's NONOPTIMIZED, you know that there are no
OPTIMIZED paths that __nvme_find_path() could discover.

Regards,
Martin

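Martin's first point, skipping the pointer update when the path did not
change, would be a small tweak along these lines (hypothetical and
untested):

	/* Avoid re-assigning the same pointer when nothing changed. */
	if (found && found != old)
		rcu_assign_pointer(head->current_path[node], found);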

* [PATCHv2] nvme-multipath: round-robin I/O policy
  2019-01-04 14:06 ` Martin Wilck
@ 2019-01-04 14:24   ` Hannes Reinecke
  2019-01-04 15:17     ` Martin Wilck
  0 siblings, 1 reply; 7+ messages in thread
From: Hannes Reinecke @ 2019-01-04 14:24 UTC


On 1/4/19 3:06 PM, Martin Wilck wrote:
> On Fri, 2018-12-21 at 15:13 +0100, Hannes Reinecke wrote:
>> Implement a simple round-robin I/O policy for multipathing.
>> Path selection is done in two rounds: first iterate across all
>> optimized paths and, if that doesn't return a valid path, then
>> iterate across all optimized and non-optimized paths.
>> If no path is found, fall back to the existing algorithm.
>> This patch also implements a sysfs attribute 'iopolicy' to switch
>> between the current, NUMA-aware I/O policy and the 'round-robin'
>> I/O policy.
>> The original NUMA-aware I/O policy remains the default.
>>
>> Signed-off-by: Hannes Reinecke <hare@suse.com>
>> ---
>>   drivers/nvme/host/core.c      |   6 +++
>>   drivers/nvme/host/multipath.c | 100 +++++++++++++++++++++++++++++++++++++++++-
>>   drivers/nvme/host/nvme.h      |  12 +++++
>>   3 files changed, 117 insertions(+), 1 deletion(-)
>>   
>> +static struct nvme_ns *__nvme_rr_next_path(struct nvme_ns_head *head, int node,
>> +					   struct nvme_ns *old)
>> +{
>> +	struct nvme_ns *ns, *found = NULL;
>> +	bool try_nonoptimized = false;
>> +
>> +	if (!old)
>> +		return NULL;
>> +retry:
>> +	ns = old;
>> +	do {
>> +		ns = list_next_or_null_rcu(&head->list, &ns->siblings,
>> +					   struct nvme_ns, siblings);
>> +		if (!ns) {
>> +			ns = list_first_or_null_rcu(&head->list, struct nvme_ns,
>> +						    siblings);
>> +			if (!ns)
>> +				return NULL;
>> +
>> +			if (ns == old)
>> +				/*
>> +				 * The list consists of just one entry.
>> +				 * Sorry for the noise :-)
>> +				 */
>> +				return old;
>> +		}
>> +		if (ns->disk && ns->ctrl->state == NVME_CTRL_LIVE) {
>> +			if (ns->ana_state == NVME_ANA_OPTIMIZED) {
>> +				found = ns;
>> +				break;
>> +			}
>> +			if (try_nonoptimized &&
>> +			    ns->ana_state == NVME_ANA_NONOPTIMIZED) {
>> +				found = ns;
>> +				break;
>> +			}
>> +		}
>> +	} while (ns != old);
>> +
>> +	if (found)
>> +		rcu_assign_pointer(head->current_path[node], found);
>> +	else if (!try_nonoptimized) {
>> +		try_nonoptimized = true;
>> +		goto retry;
>> +	}
>> +	return found;
>> +}
> 
> Would it make sense to skip the rcu_assign_pointer call if ns == old?

Hmm. That would mean an additional conditional, whose cost would have to
be weighed against that of 'rcu_assign_pointer'; I doubt you could
measure a difference here.

> Also, if no OPTIMIZED paths are found in the first iteration, I'd find
> it cleaner to restrict the search to NONOPTIMIZED paths in the second
> round (rather than looking at both OPTIMIZED and NONOPTIMIZED), similar
> to a pathgroup switch in traditional multipathing. The behavior would
> then come down to a PG switch with "immediate" fallback in multipath-
> tools terms.
> 
Concurrency is the key. We're running lockless here, so the path state 
might change at any time.
Only checking for non-optimized would cause us to miss any paths which 
went to 'optimized' in between checks.

>> +
>>   static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
>>   {
>>   	return ns->ctrl->state == NVME_CTRL_LIVE &&
>> @@ -180,6 +231,8 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
>>   	struct nvme_ns *ns;
>>   
>>   	ns = srcu_dereference(head->current_path[node], &head->srcu);
>> +	if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
>> +		ns = __nvme_rr_next_path(head, node, ns);
>>   	if (unlikely(!ns || !nvme_path_is_optimized(ns)))
>>   		ns = __nvme_find_path(head, node);
>>   	return ns;
> 
> I don't understand why you call __nvme_find_path() here. If the return
> value of __nvme_rr_next_path() is NULL, you already know that there's
> no usable path; and if it's NONOPTIMIZED, you know that there are no
> OPTIMIZED paths that __nvme_find_path() could discover.
> 
Concurrency again.
Paths might have switched priorities after our check.
Plus we have an additional safeguard that we'll never degrade wrt the 
original algorithm in terms of valid paths.

Cheers,

Hannes

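Concretely, the safeguard Hannes refers to is the call sequence in
nvme_find_path() from the patch, shown here again with annotations
added for clarity:

	ns = srcu_dereference(head->current_path[node], &head->srcu);
	if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
		ns = __nvme_rr_next_path(head, node, ns);	/* may return NULL */
	if (unlikely(!ns || !nvme_path_is_optimized(ns)))
		ns = __nvme_find_path(head, node);	/* original NUMA-aware search */
	return ns;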

* [PATCHv2] nvme-multipath: round-robin I/O policy
  2019-01-04 14:24   ` Hannes Reinecke
@ 2019-01-04 15:17     ` Martin Wilck
  2019-01-29  8:21       ` Christoph Hellwig
  0 siblings, 1 reply; 7+ messages in thread
From: Martin Wilck @ 2019-01-04 15:17 UTC


On Fri, 2019-01-04 at 15:24 +0100, Hannes Reinecke wrote:
> On 1/4/19 3:06 PM, Martin Wilck wrote:
> > 
> > Also, if no OPTIMIZED paths are found in the first iteration, I'd find
> > it cleaner to restrict the search to NONOPTIMIZED paths in the second
> > round (rather than looking at both OPTIMIZED and NONOPTIMIZED), similar
> > to a pathgroup switch in traditional multipathing. The behavior would
> > then come down to a PG switch with "immediate" fallback in multipath-
> > tools terms.
> > 
> Concurrency is the key. We're running lockless here, so the path state
> might change at any time.
> Only checking for non-optimized would cause us to miss any paths which
> went to 'optimized' in between checks.

How likely is that, given the path-search loop is supposed to be very
fast? Your paths could just as easily change state again before you
actually come to submit IO to them. If you take this sort of argument
seriously, you can never be sure about the state of the path you're
trying to use, no matter how often you check; while in practice, such
concurrent state changes will occur no more than once in a million I/O
submissions. Or am I missing something?

Anyway it was just a nit; so fine with me.

Thanks,
Martin


* [PATCHv2] nvme-multipath: round-robin I/O policy
  2019-01-03 20:24 ` Ewan D. Milne
@ 2019-01-08 12:01   ` Hannes Reinecke
  0 siblings, 0 replies; 7+ messages in thread
From: Hannes Reinecke @ 2019-01-08 12:01 UTC


On 1/3/19 9:24 PM, Ewan D. Milne wrote:
> On Fri, 2018-12-21 at 15:13 +0100, Hannes Reinecke wrote:
>> Implement a simple round-robin I/O policy for multipathing.
>> Path selection is done in two rounds: first iterate across all
>> optimized paths and, if that doesn't return a valid path, then
>> iterate across all optimized and non-optimized paths.
>> If no path is found, fall back to the existing algorithm.
>> This patch also implements a sysfs attribute 'iopolicy' to switch
>> between the current, NUMA-aware I/O policy and the 'round-robin'
>> I/O policy.
>> The original NUMA-aware I/O policy remains the default.
>>
>> Signed-off-by: Hannes Reinecke <hare@suse.com>
> 
> This works fine for me, and resolves the hang I saw in the earlier
> version when all the paths were taken down.  I have one comment about
> the state checking in this version; see below...
> 
[ .. ]
>> @@ -168,6 +171,54 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
>>   	return found;
>>   }
>>   
>> +static struct nvme_ns *__nvme_rr_next_path(struct nvme_ns_head *head, int node,
>> +					   struct nvme_ns *old)
>> +{
>> +	struct nvme_ns *ns, *found = NULL;
>> +	bool try_nonoptimized = false;
>> +
>> +	if (!old)
>> +		return NULL;
>> +retry:
>> +	ns = old;
>> +	do {
>> +		ns = list_next_or_null_rcu(&head->list, &ns->siblings,
>> +					   struct nvme_ns, siblings);
>> +		if (!ns) {
>> +			ns = list_first_or_null_rcu(&head->list, struct nvme_ns,
>> +						    siblings);
>> +			if (!ns)
>> +				return NULL;
>> +
>> +			if (ns == old)
>> +				/*
>> +				 * The list consists of just one entry.
>> +				 * Sorry for the noise :-)
>> +				 */
>> +				return old;
>> +		}
>> +		if (ns->disk && ns->ctrl->state == NVME_CTRL_LIVE) {
> 
> __nvme_find_path() also checks test_bit(NVME_NS_ANA_PENDING, &ns->flags)
> in addition to the NVME_CTRL_LIVE state check, maybe this would be needed
> here as well?  (i.e. don't select a path that has just received an ANA error
> and has the ana_work queued to get the updated ANA state, and queue if
> necessary until a usable up-to-date path is found?)
> 
Hmm. Guess you are right.
Will be updating the patch.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		   Teamlead Storage & Networking
hare@suse.de			               +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)


* [PATCHv2] nvme-multipath: round-robin I/O policy
  2019-01-04 15:17     ` Martin Wilck
@ 2019-01-29  8:21       ` Christoph Hellwig
  0 siblings, 0 replies; 7+ messages in thread
From: Christoph Hellwig @ 2019-01-29  8:21 UTC


This looks mostly good to me.  I think we can simplify the sysfs
attribute parsing a bit, though, clean up the path selector, and
use the same patterns as in the normal NUMA one.  Untested
incremental patch below:

diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index bfd4f0aa9de1..ed02cc31eb88 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -171,51 +171,48 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
 	return found;
 }
 
-static struct nvme_ns *__nvme_rr_next_path(struct nvme_ns_head *head, int node,
-					   struct nvme_ns *old)
+static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
+		struct nvme_ns *ns)
 {
-	struct nvme_ns *ns, *found = NULL;
-	bool try_nonoptimized = false;
+	ns = list_next_or_null_rcu(&head->list, &ns->siblings, struct nvme_ns,
+			siblings);
+	if (ns)
+		return ns;
+	return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
+}
 
-	if (!old)
-		return NULL;
-retry:
-	ns = old;
-	do {
-		ns = list_next_or_null_rcu(&head->list, &ns->siblings,
-					   struct nvme_ns, siblings);
-		if (!ns) {
-			ns = list_first_or_null_rcu(&head->list, struct nvme_ns,
-						    siblings);
-			if (!ns)
-				return NULL;
-
-			if (ns == old)
-				/*
-				 * The list consists of just one entry.
-				 * Sorry for the noise :-)
-				 */
-				return old;
-		}
-		if (ns->disk && ns->ctrl->state == NVME_CTRL_LIVE) {
-			if (ns->ana_state == NVME_ANA_OPTIMIZED) {
-				found = ns;
-				break;
-			}
-			if (try_nonoptimized &&
-			    ns->ana_state == NVME_ANA_NONOPTIMIZED) {
-				found = ns;
-				break;
-			}
-		}
-	} while (ns != old);
+static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
+		int node, struct nvme_ns *old)
+{
+	struct nvme_ns *ns, *found, *fallback = NULL;
 
-	if (found)
-		rcu_assign_pointer(head->current_path[node], found);
-	else if (!try_nonoptimized) {
-		try_nonoptimized = true;
-		goto retry;
+	if (list_is_singular(&head->list))
+		return old;
+
+	for (ns = nvme_next_ns(head, old);
+	     ns != old;
+	     ns = nvme_next_ns(head, ns)) {
+		if (ns->ctrl->state != NVME_CTRL_LIVE ||
+		    test_bit(NVME_NS_ANA_PENDING, &ns->flags))
+			continue;
+
+		switch (ns->ana_state) {
+		case NVME_ANA_OPTIMIZED:
+			found = ns;
+			goto out;
+		case NVME_ANA_NONOPTIMIZED:
+			fallback = ns;
+			break;
+		default:
+			break;
+		}
 	}
+
+	if (!fallback)
+		return NULL;
+	found = fallback;
+out:
+	rcu_assign_pointer(head->current_path[node], found);
 	return found;
 }
 
@@ -231,8 +228,8 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
 	struct nvme_ns *ns;
 
 	ns = srcu_dereference(head->current_path[node], &head->srcu);
-	if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
-		ns = __nvme_rr_next_path(head, node, ns);
+	if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR && ns)
+		ns = nvme_round_robin_path(head, node, ns);
 	if (unlikely(!ns || !nvme_path_is_optimized(ns)))
 		ns = __nvme_find_path(head, node);
 	return ns;
@@ -529,9 +526,8 @@ void nvme_mpath_stop(struct nvme_ctrl *ctrl)
 		__ATTR(_name, _mode, _show, _store)
 
 static const char *nvme_iopolicy_names[] = {
-	[NVME_IOPOLICY_UNKNOWN] = "unknown",
-	[NVME_IOPOLICY_NUMA] = "numa",
-	[NVME_IOPOLICY_RR] = "round-robin",
+	[NVME_IOPOLICY_NUMA]	= "numa",
+	[NVME_IOPOLICY_RR]	= "round-robin",
 };
 
 static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
@@ -539,32 +535,26 @@ static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
 {
 	struct nvme_subsystem *subsys =
 		container_of(dev, struct nvme_subsystem, dev);
-	int iopolicy = NVME_IOPOLICY_UNKNOWN;
 
-	if (iopolicy < ARRAY_SIZE(nvme_iopolicy_names))
-		iopolicy = READ_ONCE(subsys->iopolicy);
-	return sprintf(buf, "%s\n", nvme_iopolicy_names[iopolicy]);
+	return sprintf(buf, "%s\n",
+			nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
 }
 
 static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t count)
 {
-	enum nvme_iopolicy iopolicy = NVME_IOPOLICY_UNKNOWN;
 	struct nvme_subsystem *subsys =
 		container_of(dev, struct nvme_subsystem, dev);
+	int i;
 
-	if (!strncmp(buf, nvme_iopolicy_names[NVME_IOPOLICY_NUMA],
-		     strlen(nvme_iopolicy_names[NVME_IOPOLICY_NUMA])))
-		iopolicy = NVME_IOPOLICY_NUMA;
-	else if (!strncmp(buf, nvme_iopolicy_names[NVME_IOPOLICY_RR],
-		     strlen(nvme_iopolicy_names[NVME_IOPOLICY_RR])))
-		iopolicy = NVME_IOPOLICY_RR;
-
-	if (iopolicy == NVME_IOPOLICY_UNKNOWN)
-		return -EINVAL;
+	for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
+		if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
+			WRITE_ONCE(subsys->iopolicy, i);
+			return count;
+		}
+	}
 
-	WRITE_ONCE(subsys->iopolicy, iopolicy);
-	return count;
+	return -EINVAL;
 }
 SUBSYS_ATTR_RW(iopolicy, S_IRUGO | S_IWUSR,
 		      nvme_subsys_iopolicy_show, nvme_subsys_iopolicy_store);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 0fb1f9dc6800..36d4b166a155 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -251,13 +251,10 @@ struct nvme_ctrl {
 	unsigned long discard_page_busy;
 };
 
-#ifdef CONFIG_NVME_MULTIPATH
 enum nvme_iopolicy {
-	NVME_IOPOLICY_UNKNOWN,
 	NVME_IOPOLICY_NUMA,
 	NVME_IOPOLICY_RR,
 };
-#endif
 
 struct nvme_subsystem {
 	int			instance;

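With a patch along these lines applied, switching the policy from
userspace comes down to writing one of the strings in
nvme_iopolicy_names to the subsystem's 'iopolicy' attribute.  A minimal
sketch, assuming the subsystem appears as nvme-subsys0 under
/sys/class/nvme-subsystem (the actual name varies per system):

#include <stdio.h>

int main(void)
{
	const char *attr = "/sys/class/nvme-subsystem/nvme-subsys0/iopolicy";
	FILE *f = fopen(attr, "w");

	if (!f) {
		perror(attr);
		return 1;
	}
	/* sysfs_streq() in the reworked store handler tolerates a trailing
	 * newline, so a plain echo from the shell works just as well. */
	fputs("round-robin", f);	/* or "numa" to switch back */
	return fclose(f) != 0;
}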
