* [PATCH] nvme-multipath: implement active-active round-robin path selector
@ 2018-03-27 4:38 Baegjae Sung
2018-03-28 8:06 ` Christoph Hellwig
2018-04-04 12:39 ` Sagi Grimberg
0 siblings, 2 replies; 9+ messages in thread
From: Baegjae Sung @ 2018-03-27 4:38 UTC (permalink / raw)
To: keith.busch, axboe, hch, sagi, baegjae; +Cc: linux-nvme, linux-kernel
Some storage environments (e.g., dual-port NVMe SSD) provide higher
performance when using multiple paths simultaneously. Choosing a
path from multiple paths in a round-robin fashion is a simple and
efficient way to meet these requirements.
We implement the active-active round-robin path selector that
chooses the path that is NVME_CTRL_LIVE and next to the previous
path. By maintaining the structure of the active-standby path
selector, we can easily switch between the active-standby path
selector and the active-active round-robin path selector.
Example usage)
# cat /sys/block/nvme0n1/mpath_policy
[active-standby] round-robin
# echo round-robin > /sys/block/nvme0n1/mpath_policy
# cat /sys/block/nvme0n1/mpath_policy
active-standby [round-robin]
Below are the results from a physical dual-port NVMe SSD using fio.
(MB/s) active-standby round-robin
Random Read (4k) 1,672 2,640
Sequential Read (128k) 1,707 3,414
Random Write (4k) 1,450 1,728
Sequential Write (128k) 1,481 2,708
A single thread was used for sequential workloads and 16 threads
were used for random workloads. The queue depth for each thread
was 64.
Signed-off-by: Baegjae Sung <baegjae@gmail.com>
---
drivers/nvme/host/core.c | 49 +++++++++++++++++++++++++++++++++++++++++++
drivers/nvme/host/multipath.c | 45 ++++++++++++++++++++++++++++++++++++++-
drivers/nvme/host/nvme.h | 8 +++++++
3 files changed, 101 insertions(+), 1 deletion(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 7aeca5db7916..cc91e8b247d0 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -68,6 +68,13 @@ static bool streams;
module_param(streams, bool, 0644);
MODULE_PARM_DESC(streams, "turn on support for Streams write directives");
+#ifdef CONFIG_NVME_MULTIPATH
+static const char *const mpath_policy_name[] = {
+ [NVME_MPATH_ACTIVE_STANDBY] = "active-standby",
+ [NVME_MPATH_ROUND_ROBIN] = "round-robin",
+};
+#endif
+
/*
* nvme_wq - hosts nvme related works that are not reset or delete
* nvme_reset_wq - hosts nvme reset works
@@ -2603,12 +2610,51 @@ static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,
}
static DEVICE_ATTR_RO(nsid);
+#ifdef CONFIG_NVME_MULTIPATH
+static ssize_t mpath_policy_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ int i, len = 0;
+ struct nvme_ns_head *head = dev_to_ns_head(dev);
+
+ for (i = 0;i < ARRAY_SIZE(mpath_policy_name);i++) {
+ if (i == head->mpath_policy)
+ len += sprintf(buf + len, "[%s] ", mpath_policy_name[i]);
+ else
+ len += sprintf(buf + len, "%s ", mpath_policy_name[i]);
+ }
+ len += sprintf(buf + len, "\n");
+ return len;
+}
+static ssize_t mpath_policy_store(struct device *dev,
+ struct device_attribute *attr, const char *buf,
+ size_t count)
+{
+ int i;
+ struct nvme_ns_head *head = dev_to_ns_head(dev);
+
+ for (i = 0;i < ARRAY_SIZE(mpath_policy_name);i++) {
+ if (strncmp(buf, mpath_policy_name[i], count - 1) == 0) {
+ head->mpath_policy = i;
+ dev_info(dev, "change mpath policy to %s\n", mpath_policy_name[i]);
+ }
+ }
+ return count;
+}
+static DEVICE_ATTR(mpath_policy, S_IRUGO | S_IWUSR, mpath_policy_show, \
+ mpath_policy_store);
+#endif
+
static struct attribute *nvme_ns_id_attrs[] = {
&dev_attr_wwid.attr,
&dev_attr_uuid.attr,
&dev_attr_nguid.attr,
&dev_attr_eui.attr,
&dev_attr_nsid.attr,
+#ifdef CONFIG_NVME_MULTIPATH
+ &dev_attr_mpath_policy.attr,
+#endif
NULL,
};
@@ -2818,6 +2864,9 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
head->subsys = ctrl->subsys;
head->ns_id = nsid;
kref_init(&head->ref);
+#ifdef CONFIG_NVME_MULTIPATH
+ head->mpath_policy = NVME_MPATH_ACTIVE_STANDBY;
+#endif
nvme_report_ns_ids(ctrl, nsid, id, &head->ids);
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 060f69e03427..6b6a15ccb542 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -75,6 +75,42 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
return ns;
}
+inline struct nvme_ns *nvme_find_path_rr(struct nvme_ns_head *head)
+{
+ struct nvme_ns *prev_ns = srcu_dereference(head->current_path, &head->srcu);
+ struct nvme_ns *ns, *cand_ns = NULL;
+ bool after_prev_ns = false;
+
+ /*
+ * Active-active round-robin path selector
+ * Choose the path that is NVME_CTRL_LIVE and next to the previous path
+ */
+
+ /* Case 1. If there is no previous path, choose the first LIVE path */
+ if (!prev_ns) {
+ ns = __nvme_find_path(head);
+ return ns;
+ }
+
+ list_for_each_entry_rcu(ns, &head->list, siblings) {
+ /*
+ * Case 2-1. Choose the first LIVE path from the next path of
+ * previous path to end
+ */
+ if (after_prev_ns && ns->ctrl->state == NVME_CTRL_LIVE) {
+ rcu_assign_pointer(head->current_path, ns);
+ return ns;
+ }
+ /* Case 2-2. Mark the first LIVE path from start to previous path */
+ if (!cand_ns && ns->ctrl->state == NVME_CTRL_LIVE)
+ cand_ns = ns;
+ if (ns == prev_ns)
+ after_prev_ns = true;
+ }
+ rcu_assign_pointer(head->current_path, cand_ns);
+ return cand_ns;
+}
+
static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
struct bio *bio)
{
@@ -85,7 +121,14 @@ static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
int srcu_idx;
srcu_idx = srcu_read_lock(&head->srcu);
- ns = nvme_find_path(head);
+ switch (head->mpath_policy) {
+ case NVME_MPATH_ROUND_ROBIN:
+ ns = nvme_find_path_rr(head);
+ break;
+ case NVME_MPATH_ACTIVE_STANDBY:
+ default:
+ ns = nvme_find_path(head);
+ }
if (likely(ns)) {
bio->bi_disk = ns->disk;
bio->bi_opf |= REQ_NVME_MPATH;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index d733b14ede9d..15e1163bbf2b 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -128,6 +128,13 @@ enum nvme_ctrl_state {
NVME_CTRL_DEAD,
};
+#ifdef CONFIG_NVME_MULTIPATH
+enum nvme_mpath_policy {
+ NVME_MPATH_ACTIVE_STANDBY,
+ NVME_MPATH_ROUND_ROBIN, /* active-active round-robin */
+};
+#endif
+
struct nvme_ctrl {
enum nvme_ctrl_state state;
bool identified;
@@ -250,6 +257,7 @@ struct nvme_ns_head {
struct bio_list requeue_list;
spinlock_t requeue_lock;
struct work_struct requeue_work;
+ enum nvme_mpath_policy mpath_policy;
#endif
struct list_head list;
struct srcu_struct srcu;
--
2.16.2
^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [PATCH] nvme-multipath: implement active-active round-robin path selector
2018-03-27 4:38 [PATCH] nvme-multipath: implement active-active round-robin path selector Baegjae Sung
@ 2018-03-28 8:06 ` Christoph Hellwig
2018-03-28 19:47 ` Keith Busch
2018-04-04 12:36 ` Sagi Grimberg
2018-04-04 12:39 ` Sagi Grimberg
1 sibling, 2 replies; 9+ messages in thread
From: Christoph Hellwig @ 2018-03-28 8:06 UTC (permalink / raw)
To: Baegjae Sung; +Cc: keith.busch, axboe, hch, sagi, linux-nvme, linux-kernel
For PCIe devices the right policy is not a round robin but to use
the pcie device closer to the node. I did a prototype for that
long ago and the concept can work. Can you look into that and
also make that policy used automatically for PCIe devices?
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] nvme-multipath: implement active-active round-robin path selector
2018-03-28 8:06 ` Christoph Hellwig
@ 2018-03-28 19:47 ` Keith Busch
2018-03-29 8:56 ` Christoph Hellwig
2018-03-30 4:57 ` Baegjae Sung
2018-04-04 12:36 ` Sagi Grimberg
1 sibling, 2 replies; 9+ messages in thread
From: Keith Busch @ 2018-03-28 19:47 UTC (permalink / raw)
To: Christoph Hellwig; +Cc: Baegjae Sung, axboe, sagi, linux-nvme, linux-kernel
On Wed, Mar 28, 2018 at 10:06:46AM +0200, Christoph Hellwig wrote:
> For PCIe devices the right policy is not a round robin but to use
> the pcie device closer to the node. I did a prototype for that
> long ago and the concept can work. Can you look into that and
> also make that policy used automatically for PCIe devices?
Yeah, that is especially true if you've multiple storage accessing
threads scheduled on different nodes. On the other hand, round-robin
may still benefit if both paths are connected to different root ports
on the same node (who would do that?!).
But I wasn't aware people use dual-ported PCIe NVMe connected to a
single host (single path from two hosts seems more common). If that's a
thing, we should get some numa awareness. I couldn't find your prototype,
though. I had one stashed locally from a while back and hope it resembles
what you had in mind:
---
struct nvme_ns *nvme_find_path_numa(struct nvme_ns_head *head)
{
int distance, current = INT_MAX, node = cpu_to_node(smp_processor_id());
struct nvme_ns *ns, *path = NULL;
list_for_each_entry_rcu(ns, &head->list, siblings) {
if (ns->ctrl->state != NVME_CTRL_LIVE)
continue;
if (ns->disk->node_id == node)
return ns;
distance = node_distance(node, ns->disk->node_id);
if (distance < current) {
current = distance;
path = ns;
}
}
return path;
}
--
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] nvme-multipath: implement active-active round-robin path selector
2018-03-28 19:47 ` Keith Busch
@ 2018-03-29 8:56 ` Christoph Hellwig
2018-03-30 4:57 ` Baegjae Sung
1 sibling, 0 replies; 9+ messages in thread
From: Christoph Hellwig @ 2018-03-29 8:56 UTC (permalink / raw)
To: Keith Busch
Cc: Christoph Hellwig, Baegjae Sung, axboe, sagi, linux-nvme, linux-kernel
On Wed, Mar 28, 2018 at 01:47:41PM -0600, Keith Busch wrote:
> single host (single path from two hosts seems more common). If that's a
> thing, we should get some numa awareness. I couldn't find your prototype,
> though. I had one stashed locally from a while back and hope it resembles
> what you had in mind:
Mine was way older before the current data structures. Back then I
used ->map_queues hacks that I'm glad I never posted :)
> ---
> struct nvme_ns *nvme_find_path_numa(struct nvme_ns_head *head)
> {
> int distance, current = INT_MAX, node = cpu_to_node(smp_processor_id());
> struct nvme_ns *ns, *path = NULL;
>
> list_for_each_entry_rcu(ns, &head->list, siblings) {
> if (ns->ctrl->state != NVME_CTRL_LIVE)
> continue;
> if (ns->disk->node_id == node)
> return ns;
>
> distance = node_distance(node, ns->disk->node_id);
> if (distance < current) {
> current = distance;
> path = ns;
> }
> }
> return path;
This is roughly what I'd do now. The other important change would
be to have a per-node cache of the current path so that we don't do
it in the hot path.
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] nvme-multipath: implement active-active round-robin path selector
2018-03-28 19:47 ` Keith Busch
2018-03-29 8:56 ` Christoph Hellwig
@ 2018-03-30 4:57 ` Baegjae Sung
2018-03-30 7:06 ` Christoph Hellwig
1 sibling, 1 reply; 9+ messages in thread
From: Baegjae Sung @ 2018-03-30 4:57 UTC (permalink / raw)
To: Keith Busch
Cc: Christoph Hellwig, axboe, sagi, linux-nvme, linux-kernel, Eric Chang
2018-03-29 4:47 GMT+09:00 Keith Busch <keith.busch@intel.com>:
> On Wed, Mar 28, 2018 at 10:06:46AM +0200, Christoph Hellwig wrote:
>> For PCIe devices the right policy is not a round robin but to use
>> the pcie device closer to the node. I did a prototype for that
>> long ago and the concept can work. Can you look into that and
>> also make that policy used automatically for PCIe devices?
>
> Yeah, that is especially true if you've multiple storage accessing
> threads scheduled on different nodes. On the other hand, round-robin
> may still benefit if both paths are connected to different root ports
> on the same node (who would do that?!).
>
> But I wasn't aware people use dual-ported PCIe NVMe connected to a
> single host (single path from two hosts seems more common). If that's a
> thing, we should get some numa awareness. I couldn't find your prototype,
> though. I had one stashed locally from a while back and hope it resembles
> what you had in mind:
Our prototype uses dual-ported PCIe NVMe connected to a single host. The
host's HBA is connected to two switches, and the two switches are connected
to a dual-port NVMe SSD. In this environment, active-active round-robin path
selection is good to utilize the full performance of a dual-port NVMe SSD.
You can also fail over a single switch failure. You can see the prototype
in link below.
https://youtu.be/u_ou-AQsvOs?t=307 (presentation in OCP Summit 2018)
I agree that active-standby closer path selection is the right policy
if multiple
nodes attempt to access the storage system through multiple paths.
However, I believe that NVMe multipath needs to provide multiple policies for
path selection. Some people may want to use multiple paths simultaneously
(active-active) if they use a small number of nodes and want to utilize full
capability. If the capability of paths is same, the round-robin can be
the right
policy. If the capability of paths is different, a more adoptive
method would be
needed (e.g., checking path condition to balance IO).
We are moving to the NVMe fabrics for our next prototype. So, I think we will
have a chance to discuss about this policy issue in more detail. I will continue
to follow this issue.
> ---
> struct nvme_ns *nvme_find_path_numa(struct nvme_ns_head *head)
> {
> int distance, current = INT_MAX, node = cpu_to_node(smp_processor_id());
> struct nvme_ns *ns, *path = NULL;
>
> list_for_each_entry_rcu(ns, &head->list, siblings) {
> if (ns->ctrl->state != NVME_CTRL_LIVE)
> continue;
> if (ns->disk->node_id == node)
> return ns;
>
> distance = node_distance(node, ns->disk->node_id);
> if (distance < current) {
> current = distance;
> path = ns;
> }
> }
> return path;
> }
> --
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] nvme-multipath: implement active-active round-robin path selector
2018-03-30 4:57 ` Baegjae Sung
@ 2018-03-30 7:06 ` Christoph Hellwig
[not found] ` <e22676055aca4e8d936b7479d6301d85@skt-tnetpmx2.SKT.AD>
0 siblings, 1 reply; 9+ messages in thread
From: Christoph Hellwig @ 2018-03-30 7:06 UTC (permalink / raw)
To: Baegjae Sung
Cc: Keith Busch, Christoph Hellwig, axboe, sagi, linux-nvme,
linux-kernel, Eric Chang
On Fri, Mar 30, 2018 at 01:57:25PM +0900, Baegjae Sung wrote:
> Our prototype uses dual-ported PCIe NVMe connected to a single host. The
> host's HBA is connected to two switches,
What "HBA"? We are talking about NVMe here..
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] nvme-multipath: implement active-active round-robin path selector
2018-03-28 8:06 ` Christoph Hellwig
2018-03-28 19:47 ` Keith Busch
@ 2018-04-04 12:36 ` Sagi Grimberg
1 sibling, 0 replies; 9+ messages in thread
From: Sagi Grimberg @ 2018-04-04 12:36 UTC (permalink / raw)
To: Christoph Hellwig, Baegjae Sung
Cc: keith.busch, axboe, linux-nvme, linux-kernel
> For PCIe devices the right policy is not a round robin but to use
> the pcie device closer to the node. I did a prototype for that
> long ago and the concept can work. Can you look into that and
> also make that policy used automatically for PCIe devices?
I think that active/active makes sense for fabrics (link throughput
aggregation) but also for dual-ported pci devices (given that
this is a real use-case).
I agree that the default can be a home-node path selection.
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] nvme-multipath: implement active-active round-robin path selector
2018-03-27 4:38 [PATCH] nvme-multipath: implement active-active round-robin path selector Baegjae Sung
2018-03-28 8:06 ` Christoph Hellwig
@ 2018-04-04 12:39 ` Sagi Grimberg
1 sibling, 0 replies; 9+ messages in thread
From: Sagi Grimberg @ 2018-04-04 12:39 UTC (permalink / raw)
To: Baegjae Sung, keith.busch, axboe, hch; +Cc: linux-nvme, linux-kernel
> @@ -85,7 +121,14 @@ static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
> int srcu_idx;
>
> srcu_idx = srcu_read_lock(&head->srcu);
> - ns = nvme_find_path(head);
> + switch (head->mpath_policy) {
> + case NVME_MPATH_ROUND_ROBIN:
> + ns = nvme_find_path_rr(head);
> + break;
> + case NVME_MPATH_ACTIVE_STANDBY:
> + default:
> + ns = nvme_find_path(head);
> + }
If we grow multiple path selectors, would be more elegant to
use a callout mechanism.
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] nvme-multipath: implement active-active round-robin path selector
[not found] ` <e22676055aca4e8d936b7479d6301d85@skt-tnetpmx2.SKT.AD>
@ 2018-04-04 14:30 ` Keith Busch
0 siblings, 0 replies; 9+ messages in thread
From: Keith Busch @ 2018-04-04 14:30 UTC (permalink / raw)
To: Eric H. Chang
Cc: Christoph Hellwig, Baegjae Sung, axboe, sagi, linux-nvme, linux-kernel
On Fri, Mar 30, 2018 at 09:04:46AM +0000, Eric H. Chang wrote:
> We internally call PCIe-retimer as HBA. It's not a real Host Bus Adapter that translates the interface from PCIe to SATA or SAS. Sorry for the confusion.
Please don't call a PCIe retimer an "HBA"! :)
While your experiment is setup to benefit from round-robin, my only
concern is it has odd performance in a real world scenario with
IO threads executing in different nodes. Christoph's proposal will
naturally utilize both paths optimally there, where round-robin will
saturate node interlinks.
Not that I'm against having the choice; your setup probably does represent
real use. But if we're going to have multiple choice, user documentation
on nvme path selectors will be useful here.
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2018-04-04 14:27 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-03-27 4:38 [PATCH] nvme-multipath: implement active-active round-robin path selector Baegjae Sung
2018-03-28 8:06 ` Christoph Hellwig
2018-03-28 19:47 ` Keith Busch
2018-03-29 8:56 ` Christoph Hellwig
2018-03-30 4:57 ` Baegjae Sung
2018-03-30 7:06 ` Christoph Hellwig
[not found] ` <e22676055aca4e8d936b7479d6301d85@skt-tnetpmx2.SKT.AD>
2018-04-04 14:30 ` Keith Busch
2018-04-04 12:36 ` Sagi Grimberg
2018-04-04 12:39 ` Sagi Grimberg
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).