* [PATCH] nvme-multipath: implement active-active round-robin path selector
@ 2018-03-27  4:38 ` Baegjae Sung
  0 siblings, 0 replies; 20+ messages in thread
From: Baegjae Sung @ 2018-03-27  4:38 UTC (permalink / raw)
  To: keith.busch, axboe, hch, sagi, baegjae; +Cc: linux-nvme, linux-kernel

Some storage environments (e.g., dual-port NVMe SSDs) deliver higher
performance when multiple paths are used simultaneously. Selecting the
path in a round-robin fashion across all available paths is a simple and
efficient way to take advantage of this.

We implement an active-active round-robin path selector that chooses
the next NVME_CTRL_LIVE path after the previously used one. Because it
keeps the structure of the existing active-standby path selector, it is
easy to switch between the active-standby and the active-active
round-robin policies.

Example usage:
  # cat /sys/block/nvme0n1/mpath_policy
  [active-standby] round-robin
  # echo round-robin > /sys/block/nvme0n1/mpath_policy
  # cat /sys/block/nvme0n1/mpath_policy
  active-standby [round-robin]

Below are the results from a physical dual-port NVMe SSD using fio.

(MB/s)                  active-standby     round-robin
Random Read (4k)            1,672             2,640
Sequential Read (128k)      1,707             3,414
Random Write (4k)           1,450             1,728
Sequential Write (128k)     1,481             2,708

A single thread was used for sequential workloads and 16 threads
were used for random workloads. The queue depth for each thread
was 64.
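
For reference, the 4k random read case corresponds to an fio invocation
roughly like the one below (the exact job options were not included here,
so the device name and flags are assumptions):

  # fio --name=randread --filename=/dev/nvme0n1 --direct=1 --ioengine=libaio \
        --rw=randread --bs=4k --numjobs=16 --iodepth=64 --runtime=60 \
        --time_based --group_reporting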

Signed-off-by: Baegjae Sung <baegjae@gmail.com>
---
 drivers/nvme/host/core.c      | 49 +++++++++++++++++++++++++++++++++++++++++++
 drivers/nvme/host/multipath.c | 45 ++++++++++++++++++++++++++++++++++++++-
 drivers/nvme/host/nvme.h      |  8 +++++++
 3 files changed, 101 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 7aeca5db7916..cc91e8b247d0 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -68,6 +68,13 @@ static bool streams;
 module_param(streams, bool, 0644);
 MODULE_PARM_DESC(streams, "turn on support for Streams write directives");
 
+#ifdef CONFIG_NVME_MULTIPATH
+static const char *const mpath_policy_name[] = {
+	[NVME_MPATH_ACTIVE_STANDBY] = "active-standby",
+	[NVME_MPATH_ROUND_ROBIN] = "round-robin",
+};
+#endif
+
 /*
  * nvme_wq - hosts nvme related works that are not reset or delete
  * nvme_reset_wq - hosts nvme reset works
@@ -2603,12 +2610,51 @@ static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,
 }
 static DEVICE_ATTR_RO(nsid);
 
+#ifdef CONFIG_NVME_MULTIPATH
+static ssize_t mpath_policy_show(struct device *dev,
+	struct device_attribute *attr,
+	char *buf)
+{
+	int i, len = 0;
+	struct nvme_ns_head *head = dev_to_ns_head(dev);
+
+	for (i = 0;i < ARRAY_SIZE(mpath_policy_name);i++) {
+		if (i == head->mpath_policy)
+			len += sprintf(buf + len, "[%s] ", mpath_policy_name[i]);
+		else
+			len += sprintf(buf + len, "%s ", mpath_policy_name[i]);
+	}
+	len += sprintf(buf + len, "\n");
+	return len;
+}
+static ssize_t mpath_policy_store(struct device *dev,
+	struct device_attribute *attr, const char *buf,
+	size_t count)
+{
+	int i;
+	struct nvme_ns_head *head = dev_to_ns_head(dev);
+
+	for (i = 0;i < ARRAY_SIZE(mpath_policy_name);i++) {
+		if (strncmp(buf, mpath_policy_name[i], count - 1) == 0) {
+			head->mpath_policy = i;
+			dev_info(dev, "change mpath policy to %s\n", mpath_policy_name[i]);
+		}
+	}
+	return count;
+}
+static DEVICE_ATTR(mpath_policy, S_IRUGO | S_IWUSR, mpath_policy_show, \
+	mpath_policy_store);
+#endif
+
 static struct attribute *nvme_ns_id_attrs[] = {
 	&dev_attr_wwid.attr,
 	&dev_attr_uuid.attr,
 	&dev_attr_nguid.attr,
 	&dev_attr_eui.attr,
 	&dev_attr_nsid.attr,
+#ifdef CONFIG_NVME_MULTIPATH
+	&dev_attr_mpath_policy.attr,
+#endif
 	NULL,
 };
 
@@ -2818,6 +2864,9 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
 	head->subsys = ctrl->subsys;
 	head->ns_id = nsid;
 	kref_init(&head->ref);
+#ifdef CONFIG_NVME_MULTIPATH
+	head->mpath_policy = NVME_MPATH_ACTIVE_STANDBY;
+#endif
 
 	nvme_report_ns_ids(ctrl, nsid, id, &head->ids);
 
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 060f69e03427..6b6a15ccb542 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -75,6 +75,42 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
 	return ns;
 }
 
+inline struct nvme_ns *nvme_find_path_rr(struct nvme_ns_head *head)
+{
+	struct nvme_ns *prev_ns = srcu_dereference(head->current_path, &head->srcu);
+	struct nvme_ns *ns, *cand_ns = NULL;
+	bool after_prev_ns = false;
+
+	/*
+	 * Active-active round-robin path selector
+	 * Choose the path that is NVME_CTRL_LIVE and next to the previous path
+	 */
+
+	/* Case 1. If there is no previous path, choose the first LIVE path */
+	if (!prev_ns) {
+		ns = __nvme_find_path(head);
+		return ns;
+	}
+
+	list_for_each_entry_rcu(ns, &head->list, siblings) {
+		/*
+		 * Case 2-1. Choose the first LIVE path from the next path of
+		 * previous path to end
+		 */
+		if (after_prev_ns && ns->ctrl->state == NVME_CTRL_LIVE) {
+			rcu_assign_pointer(head->current_path, ns);
+			return ns;
+		}
+		/* Case 2-2. Mark the first LIVE path from start to previous path */
+		if (!cand_ns && ns->ctrl->state == NVME_CTRL_LIVE)
+			cand_ns = ns;
+		if (ns == prev_ns)
+			after_prev_ns = true;
+	}
+	rcu_assign_pointer(head->current_path, cand_ns);
+	return cand_ns;
+}
+
 static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
 		struct bio *bio)
 {
@@ -85,7 +121,14 @@ static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
 	int srcu_idx;
 
 	srcu_idx = srcu_read_lock(&head->srcu);
-	ns = nvme_find_path(head);
+	switch (head->mpath_policy) {
+	case NVME_MPATH_ROUND_ROBIN:
+		ns = nvme_find_path_rr(head);
+		break;
+	case NVME_MPATH_ACTIVE_STANDBY:
+	default:
+		ns = nvme_find_path(head);
+	}
 	if (likely(ns)) {
 		bio->bi_disk = ns->disk;
 		bio->bi_opf |= REQ_NVME_MPATH;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index d733b14ede9d..15e1163bbf2b 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -128,6 +128,13 @@ enum nvme_ctrl_state {
 	NVME_CTRL_DEAD,
 };
 
+#ifdef CONFIG_NVME_MULTIPATH
+enum nvme_mpath_policy {
+	NVME_MPATH_ACTIVE_STANDBY,
+	NVME_MPATH_ROUND_ROBIN, /* active-active round-robin */
+};
+#endif
+
 struct nvme_ctrl {
 	enum nvme_ctrl_state state;
 	bool identified;
@@ -250,6 +257,7 @@ struct nvme_ns_head {
 	struct bio_list		requeue_list;
 	spinlock_t		requeue_lock;
 	struct work_struct	requeue_work;
+	enum nvme_mpath_policy	mpath_policy;
 #endif
 	struct list_head	list;
 	struct srcu_struct      srcu;
-- 
2.16.2


* Re: [PATCH] nvme-multipath: implement active-active round-robin path selector
  2018-03-27  4:38 ` Baegjae Sung
@ 2018-03-28  8:06   ` Christoph Hellwig
  -1 siblings, 0 replies; 20+ messages in thread
From: Christoph Hellwig @ 2018-03-28  8:06 UTC (permalink / raw)
  To: Baegjae Sung; +Cc: keith.busch, axboe, hch, sagi, linux-nvme, linux-kernel

For PCIe devices the right policy is not a round robin but to use
the pcie device closer to the node.  I did a prototype for that
long ago and the concept can work.  Can you look into that and
also make that policy used automatically for PCIe devices?
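
A rough way to wire up such a default is sketched below; it builds on the
policy field from this patch, NVME_F_FABRICS is the existing nvme_ctrl_ops
flag, and NVME_MPATH_NUMA is a hypothetical third policy (the node-distance
selector discussed later in the thread):

/*
 * Sketch only: pick a default policy per transport when the namespace
 * head is allocated.  NVME_MPATH_NUMA does not exist in this patch.
 */
enum { NVME_MPATH_NUMA = NVME_MPATH_ROUND_ROBIN + 1 };

static int nvme_default_mpath_policy(struct nvme_ctrl *ctrl)
{
        if (ctrl->ops->flags & NVME_F_FABRICS)
                return NVME_MPATH_ROUND_ROBIN;  /* aggregate the fabric links */
        return NVME_MPATH_NUMA;                 /* prefer the port closer to the node */
}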


* Re: [PATCH] nvme-multipath: implement active-active round-robin path selector
  2018-03-28  8:06   ` Christoph Hellwig
@ 2018-03-28 19:47     ` Keith Busch
  -1 siblings, 0 replies; 20+ messages in thread
From: Keith Busch @ 2018-03-28 19:47 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Baegjae Sung, axboe, sagi, linux-nvme, linux-kernel

On Wed, Mar 28, 2018 at 10:06:46AM +0200, Christoph Hellwig wrote:
> For PCIe devices the right policy is not a round robin but to use
> the pcie device closer to the node.  I did a prototype for that
> long ago and the concept can work.  Can you look into that and
> also make that policy used automatically for PCIe devices?

Yeah, that is especially true if you've multiple storage accessing
threads scheduled on different nodes. On the other hand, round-robin
may still benefit if both paths are connected to different root ports
on the same node (who would do that?!).

But I wasn't aware people use dual-ported PCIe NVMe connected to a
single host (single path from two hosts seems more common). If that's a
thing, we should get some numa awareness. I couldn't find your prototype,
though. I had one stashed locally from a while back and hope it resembles
what you had in mind:
---
struct nvme_ns *nvme_find_path_numa(struct nvme_ns_head *head)
{
        int distance, current = INT_MAX, node = cpu_to_node(smp_processor_id());
        struct nvme_ns *ns, *path = NULL;

        list_for_each_entry_rcu(ns, &head->list, siblings) {
                if (ns->ctrl->state != NVME_CTRL_LIVE)
                        continue;
                if (ns->disk->node_id == node)
                        return ns;

                distance = node_distance(node, ns->disk->node_id);
                if (distance < current) {
                        current = distance;
                        path = ns;
                }
        }
        return path;
}
--


* Re: [PATCH] nvme-multipath: implement active-active round-robin path selector
  2018-03-28 19:47     ` Keith Busch
@ 2018-03-29  8:56       ` Christoph Hellwig
  -1 siblings, 0 replies; 20+ messages in thread
From: Christoph Hellwig @ 2018-03-29  8:56 UTC (permalink / raw)
  To: Keith Busch
  Cc: Christoph Hellwig, Baegjae Sung, axboe, sagi, linux-nvme, linux-kernel

On Wed, Mar 28, 2018 at 01:47:41PM -0600, Keith Busch wrote:
> single host (single path from two hosts seems more common). If that's a
> thing, we should get some numa awareness. I couldn't find your prototype,
> though. I had one stashed locally from a while back and hope it resembles
> what you had in mind:

Mine was way older before the current data structures.  Back then I
used ->map_queues hacks that I'm glad I never posted :)

> ---
> struct nvme_ns *nvme_find_path_numa(struct nvme_ns_head *head)
> {
>         int distance, current = INT_MAX, node = cpu_to_node(smp_processor_id());
>         struct nvme_ns *ns, *path = NULL;
> 
>         list_for_each_entry_rcu(ns, &head->list, siblings) {
>                 if (ns->ctrl->state != NVME_CTRL_LIVE)
>                         continue;
>                 if (ns->disk->node_id == node)
>                         return ns;
> 
>                 distance = node_distance(node, ns->disk->node_id);
>                 if (distance < current) {
>                         current = distance;
>                         path = ns;
>                 }
>         }
>         return path;

This is roughly what I'd do now.  The other important change would
be to have a per-node cache of the current path so that we don't do
it in the hot path.
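
A minimal sketch of that per-node cache, assuming current_path grows into
an array indexed by NUMA node (the field layout here is an assumption, not
what the tree has today):

struct nvme_ns *nvme_find_path_cached(struct nvme_ns_head *head)
{
        int node = numa_node_id();
        struct nvme_ns *ns;

        /* Fast path: reuse the path cached for this node if it is still live. */
        ns = srcu_dereference(head->current_path[node], &head->srcu);
        if (ns && ns->ctrl->state == NVME_CTRL_LIVE)
                return ns;

        /* Slow path: run the distance scan quoted above and cache the result. */
        ns = nvme_find_path_numa(head);
        if (ns)
                rcu_assign_pointer(head->current_path[node], ns);
        return ns;
}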


* Re: [PATCH] nvme-multipath: implement active-active round-robin path selector
  2018-03-28 19:47     ` Keith Busch
@ 2018-03-30  4:57       ` Baegjae Sung
  -1 siblings, 0 replies; 20+ messages in thread
From: Baegjae Sung @ 2018-03-30  4:57 UTC (permalink / raw)
  To: Keith Busch
  Cc: Christoph Hellwig, axboe, sagi, linux-nvme, linux-kernel, Eric Chang

2018-03-29 4:47 GMT+09:00 Keith Busch <keith.busch@intel.com>:
> On Wed, Mar 28, 2018 at 10:06:46AM +0200, Christoph Hellwig wrote:
>> For PCIe devices the right policy is not a round robin but to use
>> the pcie device closer to the node.  I did a prototype for that
>> long ago and the concept can work.  Can you look into that and
>> also make that policy used automatically for PCIe devices?
>
> Yeah, that is especially true if you've multiple storage accessing
> threads scheduled on different nodes. On the other hand, round-robin
> may still benefit if both paths are connected to different root ports
> on the same node (who would do that?!).
>
> But I wasn't aware people use dual-ported PCIe NVMe connected to a
> single host (single path from two hosts seems more common). If that's a
> thing, we should get some numa awareness. I couldn't find your prototype,
> though. I had one stashed locally from a while back and hope it resembles
> what you had in mind:

Our prototype uses dual-ported PCIe NVMe connected to a single host. The
host's HBA is connected to two switches, and the two switches are connected
to a dual-port NVMe SSD. In this environment, active-active round-robin
path selection makes full use of the dual-port NVMe SSD's performance, and
the setup can also survive a single switch failure. You can see the
prototype at the link below.
https://youtu.be/u_ou-AQsvOs?t=307 (presentation in OCP Summit 2018)

I agree that active-standby closest-path selection is the right policy if
multiple nodes attempt to access the storage system through multiple paths.
However, I believe that NVMe multipath needs to provide multiple policies
for path selection. Some people may want to use multiple paths
simultaneously (active-active) if they use a small number of nodes and want
to utilize the full capability. If the paths have the same capability,
round-robin can be the right policy; if their capability differs, a more
adaptive method would be needed (e.g., checking path conditions to balance
IO), as sketched below.
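
To illustrate, such an adaptive selector could simply pick the live path
with the fewest outstanding requests (a sketch only; the per-path
nr_inflight counter is hypothetical and would have to be maintained in the
submission and completion paths):

struct nvme_ns *nvme_find_path_least_busy(struct nvme_ns_head *head)
{
        struct nvme_ns *ns, *best = NULL;
        unsigned int inflight, best_inflight = UINT_MAX;

        list_for_each_entry_rcu(ns, &head->list, siblings) {
                if (ns->ctrl->state != NVME_CTRL_LIVE)
                        continue;
                /* hypothetical per-path counter of in-flight requests */
                inflight = atomic_read(&ns->nr_inflight);
                if (inflight < best_inflight) {
                        best_inflight = inflight;
                        best = ns;
                }
        }
        return best;
}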

We are moving to NVMe over Fabrics for our next prototype, so I think we
will have a chance to discuss this policy issue in more detail. I will
continue to follow this issue.

> ---
> struct nvme_ns *nvme_find_path_numa(struct nvme_ns_head *head)
> {
>         int distance, current = INT_MAX, node = cpu_to_node(smp_processor_id());
>         struct nvme_ns *ns, *path = NULL;
>
>         list_for_each_entry_rcu(ns, &head->list, siblings) {
>                 if (ns->ctrl->state != NVME_CTRL_LIVE)
>                         continue;
>                 if (ns->disk->node_id == node)
>                         return ns;
>
>                 distance = node_distance(node, ns->disk->node_id);
>                 if (distance < current) {
>                         current = distance;
>                         path = ns;
>                 }
>         }
>         return path;
> }
> --


* Re: [PATCH] nvme-multipath: implement active-active round-robin path selector
  2018-03-30  4:57       ` Baegjae Sung
@ 2018-03-30  7:06         ` Christoph Hellwig
  -1 siblings, 0 replies; 20+ messages in thread
From: Christoph Hellwig @ 2018-03-30  7:06 UTC (permalink / raw)
  To: Baegjae Sung
  Cc: Keith Busch, Christoph Hellwig, axboe, sagi, linux-nvme,
	linux-kernel, Eric Chang

On Fri, Mar 30, 2018 at 01:57:25PM +0900, Baegjae Sung wrote:
> Our prototype uses dual-ported PCIe NVMe connected to a single host. The
> host's HBA is connected to two switches,

What "HBA"?  We are talking about NVMe here..


* [PATCH] nvme-multipath: implement active-active round-robin path selector
  2018-03-30  7:06         ` Christoph Hellwig
  (?)
@ 2018-03-30  9:04         ` Eric H. Chang
  2018-04-04 14:30             ` Keith Busch
  -1 siblings, 1 reply; 20+ messages in thread
From: Eric H. Chang @ 2018-03-30  9:04 UTC (permalink / raw)


We internally call the PCIe retimer an HBA. It's not a real Host Bus Adapter that translates the interface from PCIe to SATA or SAS. Sorry for the confusion.

Our setup is like below:
 CPU [server]
 <-> PCIe switch based retimer(aka HBA herein) [server]
 <-> dual PCIe switch nodes [NVMe JBOF storage]
 <-> dual port NVMe SSD [NVMe JBOF storage]

Every server is hooked up to the dual PCIe switch nodes for HA purposes.

-----Original Message-----
From: Christoph Hellwig <hch@lst.de> 
Sent: Friday, March 30, 2018 4:07 PM
To: Baegjae Sung <baegjae at gmail.com>
Cc: Keith Busch <keith.busch at intel.com>; Christoph Hellwig <hch at lst.de>; axboe at fb.com; sagi at grimberg.me; linux-nvme at lists.infradead.org; linux-kernel at vger.kernel.org; SW-Defined Storage Lab <echang at sk.com>
Subject: Re: [PATCH] nvme-multipath: implement active-active round-robin path selector

On Fri, Mar 30, 2018@01:57:25PM +0900, Baegjae Sung wrote:
> Our prototype uses dual-ported PCIe NVMe connected to a single host. 
> The host's HBA is connected to two switches,

What "HBA"?  We are talking about NVMe here..


* Re: [PATCH] nvme-multipath: implement active-active round-robin path selector
  2018-03-28  8:06   ` Christoph Hellwig
@ 2018-04-04 12:36     ` Sagi Grimberg
  -1 siblings, 0 replies; 20+ messages in thread
From: Sagi Grimberg @ 2018-04-04 12:36 UTC (permalink / raw)
  To: Christoph Hellwig, Baegjae Sung
  Cc: keith.busch, axboe, linux-nvme, linux-kernel


> For PCIe devices the right policy is not a round robin but to use
> the pcie device closer to the node.  I did a prototype for that
> long ago and the concept can work.  Can you look into that and
> also make that policy used automatically for PCIe devices?

I think that active/active makes sense for fabrics (link throughput
aggregation) but also for dual-ported PCIe devices (given that this is a
real use case).

I agree that the default can be home-node path selection.


* Re: [PATCH] nvme-multipath: implement active-active round-robin path selector
  2018-03-27  4:38 ` Baegjae Sung
@ 2018-04-04 12:39   ` Sagi Grimberg
  -1 siblings, 0 replies; 20+ messages in thread
From: Sagi Grimberg @ 2018-04-04 12:39 UTC (permalink / raw)
  To: Baegjae Sung, keith.busch, axboe, hch; +Cc: linux-nvme, linux-kernel


> @@ -85,7 +121,14 @@ static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
>   	int srcu_idx;
>   
>   	srcu_idx = srcu_read_lock(&head->srcu);
> -	ns = nvme_find_path(head);
> +	switch (head->mpath_policy) {
> +	case NVME_MPATH_ROUND_ROBIN:
> +		ns = nvme_find_path_rr(head);
> +		break;
> +	case NVME_MPATH_ACTIVE_STANDBY:
> +	default:
> +		ns = nvme_find_path(head);
> +	}

If we grow multiple path selectors, it would be more elegant to
use a callout mechanism.
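
A minimal sketch of such a callout table, built on the two selectors from
this patch (the table itself is illustrative, not existing code):

/*
 * Sketch: selector callouts indexed by policy, so the bio submission
 * path does a single indirect call instead of a switch statement.
 */
typedef struct nvme_ns *(*nvme_path_selector_fn)(struct nvme_ns_head *head);

static const nvme_path_selector_fn nvme_path_selectors[] = {
        [NVME_MPATH_ACTIVE_STANDBY]     = nvme_find_path,
        [NVME_MPATH_ROUND_ROBIN]        = nvme_find_path_rr,
};

/* ... and in nvme_ns_head_make_request(): */
        ns = nvme_path_selectors[head->mpath_policy](head);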


* Re: [PATCH] nvme-multipath: implement active-active round-robin path selector
  2018-03-30  9:04         ` Eric H. Chang
@ 2018-04-04 14:30             ` Keith Busch
  0 siblings, 0 replies; 20+ messages in thread
From: Keith Busch @ 2018-04-04 14:30 UTC (permalink / raw)
  To: Eric H. Chang
  Cc: Christoph Hellwig, Baegjae Sung, axboe, sagi, linux-nvme, linux-kernel

On Fri, Mar 30, 2018 at 09:04:46AM +0000, Eric H. Chang wrote:
> We internally call the PCIe retimer an HBA. It's not a real Host Bus Adapter that translates the interface from PCIe to SATA or SAS. Sorry for the confusion.

Please don't call a PCIe retimer an "HBA"! :)

While your experiment is set up to benefit from round-robin, my only
concern is that it has odd performance in a real-world scenario with
IO threads executing on different nodes. Christoph's proposal will
naturally utilize both paths optimally there, whereas round-robin will
saturate the node interlinks.

Not that I'm against having the choice; your setup probably does represent
real use. But if we're going to offer multiple policies, user documentation
on the nvme path selectors will be useful here.


* [PATCH] nvme-multipath: implement active-active round-robin path selector
  2018-04-04 14:30             ` Keith Busch
  (?)
@ 2018-04-05 10:11             ` Eric H. Chang
  -1 siblings, 0 replies; 20+ messages in thread
From: Eric H. Chang @ 2018-04-05 10:11 UTC (permalink / raw)


> While your experiment is set up to benefit from round-robin, my only
> concern is that it has odd performance in a real-world scenario with
> IO threads executing on different nodes. Christoph's proposal will
> naturally utilize both paths optimally there, whereas round-robin will
> saturate the node interlinks.
>
> Not that I'm against having the choice; your setup probably does represent
> real use. But if we're going to offer multiple policies, user documentation
> on the nvme path selectors will be useful here.


Apologies for the confusion.
My understanding is that your concern is about efficient CPU-node
utilization across NUMA nodes. Our concern sits one level below that, at
the multi-port PCIe retimer (or multi-port RDMA NIC) level.


<Host structure with dual retimers>
CPU node A ------------------- CPU node B 
  |                                    |
PCIe retimer A                     PCIe retimer B
  |                                    |
Port A | Port B                    Port A | Port B


<Host structure with a single retimer>
CPU node A ------------------- CPU node B 
  |                                
PCIe retimer A                  
  |                                
Port A | Port B                 


In either structure above, we would like to utilize all available ports
efficiently to maximize performance. The same applies to hosts using
NVMe-oF enabled storage; in an NVMe-oF setup the PCIe retimer is simply
replaced by RDMA NICs. So we would like to discuss an acceptable
implementation of multipath across the multiple ports of PCIe retimers or
RNICs as well. Since your concern about unbalanced IO-thread execution
across nodes is valid too, combining your NUMA proposal with our (or anyone
else's) multi-port proposal could be the most effective solution, I believe.

In our setup the system architecture under all ports is symmetrical, so the
round-robin approach still makes sense, but if there is a better approach
to consider, it would be good to discuss it.


end of thread, other threads:[~2018-04-05 10:11 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-03-27  4:38 [PATCH] nvme-multipath: implement active-active round-robin path selector Baegjae Sung
2018-03-28  8:06 ` Christoph Hellwig
2018-03-28 19:47   ` Keith Busch
2018-03-29  8:56     ` Christoph Hellwig
2018-03-30  4:57     ` Baegjae Sung
2018-03-30  7:06       ` Christoph Hellwig
2018-03-30  9:04         ` Eric H. Chang
2018-04-04 14:30           ` Keith Busch
2018-04-05 10:11             ` Eric H. Chang
2018-04-04 12:36   ` Sagi Grimberg
2018-04-04 12:39 ` Sagi Grimberg
