* [PATCH] nvme-multipath: implement active-active round-robin path selector
@ 2018-03-27  4:38 Baegjae Sung
  2018-03-28  8:06 ` Christoph Hellwig
  2018-04-04 12:39 ` Sagi Grimberg
  0 siblings, 2 replies; 9+ messages in thread
From: Baegjae Sung @ 2018-03-27  4:38 UTC (permalink / raw)
  To: keith.busch, axboe, hch, sagi, baegjae; +Cc: linux-nvme, linux-kernel

Some storage environments (e.g., dual-port NVMe SSDs) deliver higher
performance when multiple paths are used simultaneously. Choosing among
the available paths in a round-robin fashion is a simple and efficient
way to meet this requirement.

We implement an active-active round-robin path selector that chooses
the next NVME_CTRL_LIVE path after the previously used one. Because the
structure of the active-standby path selector is kept, it is easy to
switch between the active-standby and active-active round-robin
policies.

Example usage:
  # cat /sys/block/nvme0n1/mpath_policy
  [active-standby] round-robin
  # echo round-robin > /sys/block/nvme0n1/mpath_policy
  # cat /sys/block/nvme0n1/mpath_policy
  active-standby [round-robin]

Below are the results from a physical dual-port NVMe SSD using fio.

(MB/s)                  active-standby     round-robin
Random Read (4k)            1,672             2,640
Sequential Read (128k)      1,707             3,414
Random Write (4k)           1,450             1,728
Sequential Write (128k)     1,481             2,708

A single thread was used for sequential workloads and 16 threads
were used for random workloads. The queue depth for each thread
was 64.
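
For reference, the random read numbers were gathered with an fio
command along the lines below (illustrative only; the ioengine, direct
I/O, and reporting options are assumptions rather than the exact job
file, and the other cases vary --rw, --bs, and --numjobs accordingly):

  # fio --name=randread --filename=/dev/nvme0n1 --ioengine=libaio \
        --direct=1 --rw=randread --bs=4k --iodepth=64 --numjobs=16 \
        --group_reporting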

Signed-off-by: Baegjae Sung <baegjae@gmail.com>
---
 drivers/nvme/host/core.c      | 49 +++++++++++++++++++++++++++++++++++++++++++
 drivers/nvme/host/multipath.c | 45 ++++++++++++++++++++++++++++++++++++++-
 drivers/nvme/host/nvme.h      |  8 +++++++
 3 files changed, 101 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 7aeca5db7916..cc91e8b247d0 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -68,6 +68,13 @@ static bool streams;
 module_param(streams, bool, 0644);
 MODULE_PARM_DESC(streams, "turn on support for Streams write directives");
 
+#ifdef CONFIG_NVME_MULTIPATH
+static const char *const mpath_policy_name[] = {
+	[NVME_MPATH_ACTIVE_STANDBY] = "active-standby",
+	[NVME_MPATH_ROUND_ROBIN] = "round-robin",
+};
+#endif
+
 /*
  * nvme_wq - hosts nvme related works that are not reset or delete
  * nvme_reset_wq - hosts nvme reset works
@@ -2603,12 +2610,51 @@ static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,
 }
 static DEVICE_ATTR_RO(nsid);
 
+#ifdef CONFIG_NVME_MULTIPATH
+static ssize_t mpath_policy_show(struct device *dev,
+	struct device_attribute *attr,
+	char *buf)
+{
+	int i, len = 0;
+	struct nvme_ns_head *head = dev_to_ns_head(dev);
+
+	for (i = 0;i < ARRAY_SIZE(mpath_policy_name);i++) {
+		if (i == head->mpath_policy)
+			len += sprintf(buf + len, "[%s] ", mpath_policy_name[i]);
+		else
+			len += sprintf(buf + len, "%s ", mpath_policy_name[i]);
+	}
+	len += sprintf(buf + len, "\n");
+	return len;
+}
+static ssize_t mpath_policy_store(struct device *dev,
+	struct device_attribute *attr, const char *buf,
+	size_t count)
+{
+	int i;
+	struct nvme_ns_head *head = dev_to_ns_head(dev);
+
+	for (i = 0;i < ARRAY_SIZE(mpath_policy_name);i++) {
+		if (strncmp(buf, mpath_policy_name[i], count - 1) == 0) {
+			head->mpath_policy = i;
+			dev_info(dev, "change mpath policy to %s\n", mpath_policy_name[i]);
+		}
+	}
+	return count;
+}
+static DEVICE_ATTR(mpath_policy, S_IRUGO | S_IWUSR, mpath_policy_show, \
+	mpath_policy_store);
+#endif
+
 static struct attribute *nvme_ns_id_attrs[] = {
 	&dev_attr_wwid.attr,
 	&dev_attr_uuid.attr,
 	&dev_attr_nguid.attr,
 	&dev_attr_eui.attr,
 	&dev_attr_nsid.attr,
+#ifdef CONFIG_NVME_MULTIPATH
+	&dev_attr_mpath_policy.attr,
+#endif
 	NULL,
 };
 
@@ -2818,6 +2864,9 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
 	head->subsys = ctrl->subsys;
 	head->ns_id = nsid;
 	kref_init(&head->ref);
+#ifdef CONFIG_NVME_MULTIPATH
+	head->mpath_policy = NVME_MPATH_ACTIVE_STANDBY;
+#endif
 
 	nvme_report_ns_ids(ctrl, nsid, id, &head->ids);
 
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 060f69e03427..6b6a15ccb542 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -75,6 +75,42 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
 	return ns;
 }
 
+inline struct nvme_ns *nvme_find_path_rr(struct nvme_ns_head *head)
+{
+	struct nvme_ns *prev_ns = srcu_dereference(head->current_path, &head->srcu);
+	struct nvme_ns *ns, *cand_ns = NULL;
+	bool after_prev_ns = false;
+
+	/*
+	 * Active-active round-robin path selector
+	 * Choose the path that is NVME_CTRL_LIVE and next to the previous path
+	 */
+
+	/* Case 1. If there is no previous path, choose the first LIVE path */
+	if (!prev_ns) {
+		ns = __nvme_find_path(head);
+		return ns;
+	}
+
+	list_for_each_entry_rcu(ns, &head->list, siblings) {
+		/*
+		 * Case 2-1. Choose the first LIVE path from the next path of
+		 * previous path to end
+		 */
+		if (after_prev_ns && ns->ctrl->state == NVME_CTRL_LIVE) {
+			rcu_assign_pointer(head->current_path, ns);
+			return ns;
+		}
+		/* Case 2-2. Mark the first LIVE path from start to previous path */
+		if (!cand_ns && ns->ctrl->state == NVME_CTRL_LIVE)
+			cand_ns = ns;
+		if (ns == prev_ns)
+			after_prev_ns = true;
+	}
+	rcu_assign_pointer(head->current_path, cand_ns);
+	return cand_ns;
+}
+
 static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
 		struct bio *bio)
 {
@@ -85,7 +121,14 @@ static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
 	int srcu_idx;
 
 	srcu_idx = srcu_read_lock(&head->srcu);
-	ns = nvme_find_path(head);
+	switch (head->mpath_policy) {
+	case NVME_MPATH_ROUND_ROBIN:
+		ns = nvme_find_path_rr(head);
+		break;
+	case NVME_MPATH_ACTIVE_STANDBY:
+	default:
+		ns = nvme_find_path(head);
+	}
 	if (likely(ns)) {
 		bio->bi_disk = ns->disk;
 		bio->bi_opf |= REQ_NVME_MPATH;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index d733b14ede9d..15e1163bbf2b 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -128,6 +128,13 @@ enum nvme_ctrl_state {
 	NVME_CTRL_DEAD,
 };
 
+#ifdef CONFIG_NVME_MULTIPATH
+enum nvme_mpath_policy {
+	NVME_MPATH_ACTIVE_STANDBY,
+	NVME_MPATH_ROUND_ROBIN, /* active-active round-robin */
+};
+#endif
+
 struct nvme_ctrl {
 	enum nvme_ctrl_state state;
 	bool identified;
@@ -250,6 +257,7 @@ struct nvme_ns_head {
 	struct bio_list		requeue_list;
 	spinlock_t		requeue_lock;
 	struct work_struct	requeue_work;
+	enum nvme_mpath_policy	mpath_policy;
 #endif
 	struct list_head	list;
 	struct srcu_struct      srcu;
-- 
2.16.2


* Re: [PATCH] nvme-multipath: implement active-active round-robin path selector
  2018-03-27  4:38 [PATCH] nvme-multipath: implement active-active round-robin path selector Baegjae Sung
@ 2018-03-28  8:06 ` Christoph Hellwig
  2018-03-28 19:47   ` Keith Busch
  2018-04-04 12:36   ` Sagi Grimberg
  2018-04-04 12:39 ` Sagi Grimberg
  1 sibling, 2 replies; 9+ messages in thread
From: Christoph Hellwig @ 2018-03-28  8:06 UTC (permalink / raw)
  To: Baegjae Sung; +Cc: keith.busch, axboe, hch, sagi, linux-nvme, linux-kernel

For PCIe devices the right policy is not a round robin but to use
the pcie device closer to the node.  I did a prototype for that
long ago and the concept can work.  Can you look into that and
also make that policy used automatically for PCIe devices?


* Re: [PATCH] nvme-multipath: implement active-active round-robin path selector
  2018-03-28  8:06 ` Christoph Hellwig
@ 2018-03-28 19:47   ` Keith Busch
  2018-03-29  8:56     ` Christoph Hellwig
  2018-03-30  4:57     ` Baegjae Sung
  2018-04-04 12:36   ` Sagi Grimberg
  1 sibling, 2 replies; 9+ messages in thread
From: Keith Busch @ 2018-03-28 19:47 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Baegjae Sung, axboe, sagi, linux-nvme, linux-kernel

On Wed, Mar 28, 2018 at 10:06:46AM +0200, Christoph Hellwig wrote:
> For PCIe devices the right policy is not a round robin but to use
> the pcie device closer to the node.  I did a prototype for that
> long ago and the concept can work.  Can you look into that and
> also make that policy used automatically for PCIe devices?

Yeah, that is especially true if you have multiple storage-accessing
threads scheduled on different nodes. On the other hand, round-robin
may still benefit if both paths are connected to different root ports
on the same node (who would do that?!).

But I wasn't aware people used dual-ported PCIe NVMe connected to a
single host (a single path from each of two hosts seems more common).
If that's a thing, we should get some NUMA awareness. I couldn't find
your prototype, though. I had one stashed locally from a while back and
hope it resembles what you had in mind:
---
struct nvme_ns *nvme_find_path_numa(struct nvme_ns_head *head)
{
        int distance, current = INT_MAX, node = cpu_to_node(smp_processor_id());
        struct nvme_ns *ns, *path = NULL;

        list_for_each_entry_rcu(ns, &head->list, siblings) {
                if (ns->ctrl->state != NVME_CTRL_LIVE)
                        continue;
                if (ns->disk->node_id == node)
                        return ns;

                distance = node_distance(node, ns->disk->node_id);
                if (distance < current) {
                        current = distance;
                        path = ns;
                }
        }
        return path;
}
--


* Re: [PATCH] nvme-multipath: implement active-active round-robin path selector
  2018-03-28 19:47   ` Keith Busch
@ 2018-03-29  8:56     ` Christoph Hellwig
  2018-03-30  4:57     ` Baegjae Sung
  1 sibling, 0 replies; 9+ messages in thread
From: Christoph Hellwig @ 2018-03-29  8:56 UTC (permalink / raw)
  To: Keith Busch
  Cc: Christoph Hellwig, Baegjae Sung, axboe, sagi, linux-nvme, linux-kernel

On Wed, Mar 28, 2018 at 01:47:41PM -0600, Keith Busch wrote:
> single host (single path from two hosts seems more common). If that's a
> thing, we should get some numa awareness. I couldn't find your prototype,
> though. I had one stashed locally from a while back and hope it resembles
> what you had in mind:

Mine was way older, from before the current data structures.  Back then
I used ->map_queues hacks that I'm glad I never posted :)

> ---
> struct nvme_ns *nvme_find_path_numa(struct nvme_ns_head *head)
> {
>         int distance, current = INT_MAX, node = cpu_to_node(smp_processor_id());
>         struct nvme_ns *ns, *path = NULL;
> 
>         list_for_each_entry_rcu(ns, &head->list, siblings) {
>                 if (ns->ctrl->state != NVME_CTRL_LIVE)
>                         continue;
>                 if (ns->disk->node_id == node)
>                         return ns;
> 
>                 distance = node_distance(node, ns->disk->node_id);
>                 if (distance < current) {
>                         current = distance;
>                         path = ns;
>                 }
>         }
>         return path;

This is roughly what I'd do now.  The other important change would be
to have a per-node cache of the current path so that we don't redo the
distance scan in the hot path.
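
Something like the sketch below, assuming current_path becomes a
per-node array in nvme_ns_head (it is a single pointer today) and
reusing the nvme_find_path_numa() scan above as the slow path (a
hypothetical sketch, not tested code):

---
/*
 * Hypothetical: cache the selected path per NUMA node and fall back to
 * the distance scan only when the cached path is missing or not live.
 */
struct nvme_ns *nvme_find_path_cached(struct nvme_ns_head *head)
{
	int node = numa_node_id();
	struct nvme_ns *ns;

	ns = srcu_dereference(head->current_path[node], &head->srcu);
	if (likely(ns && ns->ctrl->state == NVME_CTRL_LIVE))
		return ns;

	ns = nvme_find_path_numa(head);	/* slow path: NUMA distance scan */
	if (ns)
		rcu_assign_pointer(head->current_path[node], ns);
	return ns;
}
--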


* Re: [PATCH] nvme-multipath: implement active-active round-robin path selector
  2018-03-28 19:47   ` Keith Busch
  2018-03-29  8:56     ` Christoph Hellwig
@ 2018-03-30  4:57     ` Baegjae Sung
  2018-03-30  7:06       ` Christoph Hellwig
  1 sibling, 1 reply; 9+ messages in thread
From: Baegjae Sung @ 2018-03-30  4:57 UTC (permalink / raw)
  To: Keith Busch
  Cc: Christoph Hellwig, axboe, sagi, linux-nvme, linux-kernel, Eric Chang

2018-03-29 4:47 GMT+09:00 Keith Busch <keith.busch@intel.com>:
> On Wed, Mar 28, 2018 at 10:06:46AM +0200, Christoph Hellwig wrote:
>> For PCIe devices the right policy is not a round robin but to use
>> the pcie device closer to the node.  I did a prototype for that
>> long ago and the concept can work.  Can you look into that and
>> also make that policy used automatically for PCIe devices?
>
> Yeah, that is especially true if you've multiple storage accessing
> threads scheduled on different nodes. On the other hand, round-robin
> may still benefit if both paths are connected to different root ports
> on the same node (who would do that?!).
>
> But I wasn't aware people use dual-ported PCIe NVMe connected to a
> single host (single path from two hosts seems more common). If that's a
> thing, we should get some numa awareness. I couldn't find your prototype,
> though. I had one stashed locally from a while back and hope it resembles
> what you had in mind:

Our prototype uses a dual-ported PCIe NVMe SSD connected to a single
host. The host's HBA is connected to two switches, and the two switches
are connected to the dual-port NVMe SSD. In this environment,
active-active round-robin path selection utilizes the full performance
of the dual-port NVMe SSD, and the setup can also survive a single
switch failure. You can see the prototype in the link below.
https://youtu.be/u_ou-AQsvOs?t=307 (presentation at OCP Summit 2018)

I agree that active-standby selection of the closest path is the right
policy if multiple nodes access the storage system through multiple
paths. However, I believe that NVMe multipath needs to provide multiple
path-selection policies. Some people may want to use multiple paths
simultaneously (active-active) if they use a small number of nodes and
want to utilize the full capability of the device. If the capability of
the paths is the same, round-robin can be the right policy. If the
capability of the paths differs, a more adaptive method would be needed
(e.g., checking path conditions to balance IO, as sketched below).
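
As a rough illustration only, such an adaptive selector could look like
the sketch below. It assumes a per-namespace in-flight counter
(ns->nr_inflight) that the driver does not maintain today, so this is a
hypothetical sketch rather than working code:

---
/*
 * Hypothetical sketch: pick the LIVE path with the fewest outstanding
 * requests. Assumes an atomic_t nr_inflight counter in struct nvme_ns
 * that would have to be added and updated on submission/completion.
 */
static struct nvme_ns *nvme_find_path_adaptive(struct nvme_ns_head *head)
{
	struct nvme_ns *ns, *best = NULL;
	unsigned int depth, best_depth = UINT_MAX;

	list_for_each_entry_rcu(ns, &head->list, siblings) {
		if (ns->ctrl->state != NVME_CTRL_LIVE)
			continue;
		depth = atomic_read(&ns->nr_inflight);
		if (depth < best_depth) {
			best_depth = depth;
			best = ns;
		}
	}
	return best;
}
--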

We are moving to NVMe over Fabrics for our next prototype, so I think
we will have a chance to discuss this policy issue in more detail. I
will continue to follow it.

> ---
> struct nvme_ns *nvme_find_path_numa(struct nvme_ns_head *head)
> {
>         int distance, current = INT_MAX, node = cpu_to_node(smp_processor_id());
>         struct nvme_ns *ns, *path = NULL;
>
>         list_for_each_entry_rcu(ns, &head->list, siblings) {
>                 if (ns->ctrl->state != NVME_CTRL_LIVE)
>                         continue;
>                 if (ns->disk->node_id == node)
>                         return ns;
>
>                 distance = node_distance(node, ns->disk->node_id);
>                 if (distance < current) {
>                         current = distance;
>                         path = ns;
>                 }
>         }
>         return path;
> }
> --


* Re: [PATCH] nvme-multipath: implement active-active round-robin path selector
  2018-03-30  4:57     ` Baegjae Sung
@ 2018-03-30  7:06       ` Christoph Hellwig
       [not found]         ` <e22676055aca4e8d936b7479d6301d85@skt-tnetpmx2.SKT.AD>
  0 siblings, 1 reply; 9+ messages in thread
From: Christoph Hellwig @ 2018-03-30  7:06 UTC (permalink / raw)
  To: Baegjae Sung
  Cc: Keith Busch, Christoph Hellwig, axboe, sagi, linux-nvme,
	linux-kernel, Eric Chang

On Fri, Mar 30, 2018 at 01:57:25PM +0900, Baegjae Sung wrote:
> Our prototype uses dual-ported PCIe NVMe connected to a single host. The
> host's HBA is connected to two switches,

What "HBA"?  We are talking about NVMe here..


* Re: [PATCH] nvme-multipath: implement active-active round-robin path selector
  2018-03-28  8:06 ` Christoph Hellwig
  2018-03-28 19:47   ` Keith Busch
@ 2018-04-04 12:36   ` Sagi Grimberg
  1 sibling, 0 replies; 9+ messages in thread
From: Sagi Grimberg @ 2018-04-04 12:36 UTC (permalink / raw)
  To: Christoph Hellwig, Baegjae Sung
  Cc: keith.busch, axboe, linux-nvme, linux-kernel


> For PCIe devices the right policy is not a round robin but to use
> the pcie device closer to the node.  I did a prototype for that
> long ago and the concept can work.  Can you look into that and
> also make that policy used automatically for PCIe devices?

I think that active/active makes sense for fabrics (link throughput
aggregation), but also for dual-ported PCI devices (given that this is
a real use case).

I agree that the default can be home-node path selection.


* Re: [PATCH] nvme-multipath: implement active-active round-robin path selector
  2018-03-27  4:38 [PATCH] nvme-multipath: implement active-active round-robin path selector Baegjae Sung
  2018-03-28  8:06 ` Christoph Hellwig
@ 2018-04-04 12:39 ` Sagi Grimberg
  1 sibling, 0 replies; 9+ messages in thread
From: Sagi Grimberg @ 2018-04-04 12:39 UTC (permalink / raw)
  To: Baegjae Sung, keith.busch, axboe, hch; +Cc: linux-nvme, linux-kernel


> @@ -85,7 +121,14 @@ static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
>   	int srcu_idx;
>   
>   	srcu_idx = srcu_read_lock(&head->srcu);
> -	ns = nvme_find_path(head);
> +	switch (head->mpath_policy) {
> +	case NVME_MPATH_ROUND_ROBIN:
> +		ns = nvme_find_path_rr(head);
> +		break;
> +	case NVME_MPATH_ACTIVE_STANDBY:
> +	default:
> +		ns = nvme_find_path(head);
> +	}

If we grow multiple path selectors, it would be more elegant to use a
callout mechanism.
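
Something like the hypothetical ops table below (not part of this
patch), so nvme_ns_head_make_request() would simply call through
head->mpath_policy instead of the switch statement:

---
/* Hypothetical sketch of per-policy callouts, not in the patch */
struct nvme_mpath_selector {
	const char *name;
	struct nvme_ns *(*find_path)(struct nvme_ns_head *head);
};

static const struct nvme_mpath_selector nvme_mpath_selectors[] = {
	[NVME_MPATH_ACTIVE_STANDBY] = {
		.name		= "active-standby",
		.find_path	= nvme_find_path,
	},
	[NVME_MPATH_ROUND_ROBIN] = {
		.name		= "round-robin",
		.find_path	= nvme_find_path_rr,
	},
};

/* ... and in the I/O path: */
	ns = nvme_mpath_selectors[head->mpath_policy].find_path(head);
--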


* Re: [PATCH] nvme-multipath: implement active-active round-robin path selector
       [not found]         ` <e22676055aca4e8d936b7479d6301d85@skt-tnetpmx2.SKT.AD>
@ 2018-04-04 14:30           ` Keith Busch
  0 siblings, 0 replies; 9+ messages in thread
From: Keith Busch @ 2018-04-04 14:30 UTC (permalink / raw)
  To: Eric H. Chang
  Cc: Christoph Hellwig, Baegjae Sung, axboe, sagi, linux-nvme, linux-kernel

On Fri, Mar 30, 2018 at 09:04:46AM +0000, Eric H. Chang wrote:
> We internally call the PCIe retimer an HBA. It's not a real Host Bus
> Adapter that translates the interface from PCIe to SATA or SAS. Sorry
> for the confusion.

Please don't call a PCIe retimer an "HBA"! :)

While your experiment is set up to benefit from round-robin, my only
concern is that it will perform oddly in a real-world scenario with IO
threads executing on different nodes. Christoph's proposal will
naturally utilize both paths optimally there, whereas round-robin will
saturate the node interlinks.

Not that I'm against having the choice; your setup probably does
represent real use. But if we're going to have multiple choices, user
documentation on nvme path selectors will be useful here.


