* [PATCHv2 0/3] nvme: NUMA locality for fabrics
From: Hannes Reinecke @ 2018-10-26 12:57 UTC
Hi all,
here's a patchset to leverage NUMA locality information for fabric controllers.
This is the second attempt at doing so; after discussing it with hch we
came to the conclusion that the manual configuration in the initial
submission would only lead to more confusion and suboptimal setups.
So here is now a version with automatic NUMA balancing, where we attempt
to split the submitting CPUs/cores evenly across the available
controllers.
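(Illustration with hypothetical numbers: on a four-node host with a
dual-ported HBA, i.e. two controllers, the balancing leaves each
controller 'local' to two nodes, so every node has exactly one primary
path; see patch 3 for details.)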
With this patchset I'm seeing a performance increase from
262k IOPS to 344k IOPS, measured against a NetApp AF700.
As usual, comments and reviews are welcome.
Hannes Reinecke (3):
nvme: NUMA locality information for fabrics
nvme-multipath: Select paths based on NUMA locality
nvme-multipath: automatic NUMA path balancing
drivers/nvme/host/core.c | 41 +++++++++++++-
drivers/nvme/host/fc.c | 5 +-
drivers/nvme/host/multipath.c | 122 ++++++++++++++++++++++++++++++++++++++++--
drivers/nvme/host/nvme.h | 3 ++
drivers/nvme/host/pci.c | 2 +-
drivers/nvme/host/rdma.c | 6 ++-
6 files changed, 169 insertions(+), 10 deletions(-)
--
2.16.4
* [PATCH 1/3] nvme: NUMA locality information for fabrics
From: Hannes Reinecke @ 2018-10-26 12:57 UTC
From: Hannes Reinecke <hare@suse.com>
Add a new field 'node_id' to the nvme_ctrl structure to hold the
NUMA locality information of the underlying hardware.
With that we can allocate the memory structures on the same NUMA
node as the underlying hardware.
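As a minimal illustrative sketch (not part of the patch; 'hw_dev' stands
in for the transport's underlying struct device), a transport records the
node once at initialization and node-aware allocations follow from it:

	/* Record the NUMA node of the underlying hardware. */
	ctrl->node_id = dev_to_node(hw_dev);

	/* Later allocations can then be made node-local: */
	ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, ctrl->node_id);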
Signed-off-by: Hannes Reinecke <hare@suse.com>
---
drivers/nvme/host/core.c | 4 +++-
drivers/nvme/host/fc.c | 5 +++--
drivers/nvme/host/multipath.c | 4 ++--
drivers/nvme/host/nvme.h | 1 +
drivers/nvme/host/rdma.c | 5 +++--
5 files changed, 12 insertions(+), 7 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 65c42448e904..ad7abeb7442b 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2766,6 +2766,7 @@ static ssize_t field##_show(struct device *dev, \
static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
nvme_show_int_function(cntlid);
+nvme_show_int_function(node_id);
static ssize_t nvme_sysfs_delete(struct device *dev,
struct device_attribute *attr, const char *buf,
@@ -2845,6 +2846,7 @@ static struct attribute *nvme_dev_attrs[] = {
&dev_attr_subsysnqn.attr,
&dev_attr_address.attr,
&dev_attr_state.attr,
+ &dev_attr_node_id.attr,
NULL
};
@@ -3055,7 +3057,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
struct gendisk *disk;
struct nvme_id_ns *id;
char disk_name[DISK_NAME_LEN];
- int node = dev_to_node(ctrl->dev), flags = GENHD_FL_EXT_DEVT;
+ int node = ctrl->node_id, flags = GENHD_FL_EXT_DEVT;
ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
if (!ns)
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index e52b9d3c0bd6..500d9d6ec7ce 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2433,7 +2433,7 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl)
ctrl->tag_set.ops = &nvme_fc_mq_ops;
ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size;
ctrl->tag_set.reserved_tags = 1; /* fabric connect */
- ctrl->tag_set.numa_node = NUMA_NO_NODE;
+ ctrl->tag_set.numa_node = ctrl->ctrl.node_id;
ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
ctrl->tag_set.cmd_size =
struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv,
@@ -3000,6 +3000,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
ctrl->ctrl.opts = opts;
ctrl->ctrl.nr_reconnects = 0;
+ ctrl->ctrl.node_id = dev_to_node(lport->dev);
INIT_LIST_HEAD(&ctrl->ctrl_list);
ctrl->lport = lport;
ctrl->rport = rport;
@@ -3038,7 +3039,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
ctrl->admin_tag_set.ops = &nvme_fc_admin_mq_ops;
ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
ctrl->admin_tag_set.reserved_tags = 2; /* fabric connect + Keep-Alive */
- ctrl->admin_tag_set.numa_node = NUMA_NO_NODE;
+ ctrl->admin_tag_set.numa_node = ctrl->ctrl.node_id;
ctrl->admin_tag_set.cmd_size =
struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv,
ctrl->lport->ops->fcprqst_priv_sz);
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 5e3cc8c59a39..123292187b40 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -141,7 +141,7 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
test_bit(NVME_NS_ANA_PENDING, &ns->flags))
continue;
- distance = node_distance(node, dev_to_node(ns->ctrl->dev));
+ distance = node_distance(node, ns->ctrl->node_id);
switch (ns->ana_state) {
case NVME_ANA_OPTIMIZED:
@@ -276,7 +276,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath)
return 0;
- q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, NULL);
+ q = blk_alloc_queue_node(GFP_KERNEL, ctrl->node_id, NULL);
if (!q)
goto out;
q->queuedata = head;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 9fefba039d1e..55347a547d84 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -153,6 +153,7 @@ struct nvme_ctrl {
struct request_queue *connect_q;
struct device *dev;
int instance;
+ int node_id;
struct blk_mq_tag_set *tagset;
struct blk_mq_tag_set *admin_tagset;
struct list_head namespaces;
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index d181cafedc58..31712d77b372 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -693,7 +693,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
set->ops = &nvme_rdma_admin_mq_ops;
set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
set->reserved_tags = 2; /* connect + keep-alive */
- set->numa_node = NUMA_NO_NODE;
+ set->numa_node = nctrl->node_id;
set->cmd_size = sizeof(struct nvme_rdma_request) +
SG_CHUNK_SIZE * sizeof(struct scatterlist);
set->driver_data = ctrl;
@@ -706,7 +706,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
set->ops = &nvme_rdma_mq_ops;
set->queue_depth = nctrl->sqsize + 1;
set->reserved_tags = 1; /* fabric connect */
- set->numa_node = NUMA_NO_NODE;
+ set->numa_node = nctrl->node_id;
set->flags = BLK_MQ_F_SHOULD_MERGE;
set->cmd_size = sizeof(struct nvme_rdma_request) +
SG_CHUNK_SIZE * sizeof(struct scatterlist);
@@ -762,6 +762,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
return error;
ctrl->device = ctrl->queues[0].device;
+ ctrl->ctrl.node_id = dev_to_node(ctrl->device->dev->dma_device);
ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev);
--
2.16.4
* [PATCH 2/3] nvme-multipath: Select paths based on NUMA locality
From: Hannes Reinecke @ 2018-10-26 12:57 UTC
This patch creates a per-controller map to hold the NUMA locality
information. With that we can route I/O to the controller 'nearest'
to the issuing CPU and thereby decrease latency.
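The map is exposed through a new 'node_map' sysfs attribute; as a
hypothetical illustration, on a two-node host with the controller
attached to node 1 one would see something like:

	# cat /sys/class/nvme/nvme0/node_map
	20 10

i.e. node 0 is at REMOTE_DISTANCE (20) and node 1 at LOCAL_DISTANCE (10),
so I/O submitted from node 1 will prefer this controller.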
Signed-off-by: Hannes Reinecke <hare@suse.com>
---
drivers/nvme/host/core.c | 37 ++++++++++++++++++++++++++++++++++++-
drivers/nvme/host/fc.c | 2 +-
drivers/nvme/host/multipath.c | 9 +++++++--
drivers/nvme/host/nvme.h | 2 ++
drivers/nvme/host/pci.c | 2 +-
drivers/nvme/host/rdma.c | 3 ++-
6 files changed, 49 insertions(+), 6 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index ad7abeb7442b..02a697c6b983 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2204,6 +2204,21 @@ static int nvme_active_ctrls(struct nvme_subsystem *subsys)
return count;
}
+void nvme_set_ctrl_node(struct nvme_ctrl *ctrl, int node_id)
+{
+ int node;
+
+ ctrl->node_id = node_id;
+ if (node_id == NUMA_NO_NODE)
+ return;
+ ctrl->node_map = kzalloc(num_possible_nodes() * sizeof(int),
+ GFP_KERNEL);
+ if (ctrl->node_map)
+ for_each_node(node)
+ ctrl->node_map[node] = node_distance(node, node_id);
+}
+EXPORT_SYMBOL_GPL(nvme_set_ctrl_node);
+
static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
{
struct nvme_subsystem *subsys, *found;
@@ -2834,6 +2849,23 @@ static ssize_t nvme_sysfs_show_address(struct device *dev,
}
static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL);
+static ssize_t nvme_sysfs_show_node_map(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+ int node;
+ ssize_t offset = 0;
+
+ for_each_node(node)
+ offset += snprintf(buf + offset, PAGE_SIZE - offset,
+ "%d ", ctrl->node_map[node]);
+ offset += snprintf(buf + offset, PAGE_SIZE - offset, "\n");
+
+ return offset;
+}
+static DEVICE_ATTR(node_map, S_IRUGO, nvme_sysfs_show_node_map, NULL);
+
static struct attribute *nvme_dev_attrs[] = {
&dev_attr_reset_controller.attr,
&dev_attr_rescan_controller.attr,
@@ -2847,6 +2879,7 @@ static struct attribute *nvme_dev_attrs[] = {
&dev_attr_address.attr,
&dev_attr_state.attr,
&dev_attr_node_id.attr,
+ &dev_attr_node_map.attr,
NULL
};
@@ -2860,7 +2893,8 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
return 0;
if (a == &dev_attr_address.attr && !ctrl->ops->get_address)
return 0;
-
+ if (a == &dev_attr_node_map.attr && !ctrl->node_map)
+ return 0;
return a->mode;
}
@@ -3507,6 +3541,7 @@ static void nvme_free_ctrl(struct device *dev)
ida_simple_remove(&nvme_instance_ida, ctrl->instance);
kfree(ctrl->effects);
+ kfree(ctrl->node_map);
nvme_mpath_uninit(ctrl);
if (subsys) {
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 500d9d6ec7ce..ce01140450c4 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -3000,7 +3000,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
ctrl->ctrl.opts = opts;
ctrl->ctrl.nr_reconnects = 0;
- ctrl->ctrl.node_id = dev_to_node(lport->dev);
+ nvme_set_ctrl_node(&ctrl->ctrl, dev_to_node(lport->dev));
INIT_LIST_HEAD(&ctrl->ctrl_list);
ctrl->lport = lport;
ctrl->rport = rport;
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 123292187b40..a589a1a7b6ce 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -141,7 +141,8 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
test_bit(NVME_NS_ANA_PENDING, &ns->flags))
continue;
- distance = node_distance(node, ns->ctrl->node_id);
+ distance = ns->ctrl->node_map ?
+ ns->ctrl->node_map[node] : INT_MAX;
switch (ns->ana_state) {
case NVME_ANA_OPTIMIZED:
@@ -163,8 +164,11 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
if (!found)
found = fallback;
- if (found)
+ if (found) {
+ dev_dbg(disk_to_dev(head->disk), "none: node %d path %s\n",
+ node, found->disk->disk_name);
rcu_assign_pointer(head->current_path[node], found);
+ }
return found;
}
@@ -528,6 +532,7 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id)
nvme_mpath_set_live(ns);
mutex_unlock(&ns->head->lock);
}
+ nvme_mpath_clear_current_path(ns);
}
void nvme_mpath_remove_disk(struct nvme_ns_head *head)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 55347a547d84..0027f5d41ff8 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -154,6 +154,7 @@ struct nvme_ctrl {
struct device *dev;
int instance;
int node_id;
+ int *node_map;
struct blk_mq_tag_set *tagset;
struct blk_mq_tag_set *admin_tagset;
struct list_head namespaces;
@@ -437,6 +438,7 @@ void nvme_unfreeze(struct nvme_ctrl *ctrl);
void nvme_wait_freeze(struct nvme_ctrl *ctrl);
void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout);
void nvme_start_freeze(struct nvme_ctrl *ctrl);
+void nvme_set_ctrl_node(struct nvme_ctrl *ctrl, int node);
#define NVME_QID_ANY -1
struct request *nvme_alloc_request(struct request_queue *q,
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 4e023cd007e1..461116bd5a03 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2509,7 +2509,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
result = -ENOMEM;
goto release_pools;
}
-
+ nvme_set_ctrl_node(&dev->ctrl, node);
result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
quirks);
if (result)
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 31712d77b372..e9d34a9305b8 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -762,7 +762,8 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
return error;
ctrl->device = ctrl->queues[0].device;
- ctrl->ctrl.node_id = dev_to_node(ctrl->device->dev->dma_device);
+ nvme_set_ctrl_node(&ctrl->ctrl,
+ dev_to_node(ctrl->device->dev->dma_device));
ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev);
--
2.16.4
* [PATCH 3/3] nvme-multipath: automatic NUMA path balancing
From: Hannes Reinecke @ 2018-10-26 12:57 UTC
In order to utilize both paths of a dual-ported HBA we cannot rely
on the NUMA affinity alone, but rather have to distribute the
locality information across nodes to get the best possible result.
This patch implements a two-pass algorithm for assigning NUMA
locality information:
1. Distribute the existing locality information so that no node has
more than one 'local' controller.
2. Assign a 'local' controller to each of the remaining nodes,
so that the overall weight (i.e. the sum of all locality information)
per controller is minimal.
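As a worked example (hypothetical four-node host with default SLIT
distances 10/20 and a dual-ported HBA, i.e. two controllers on node 0):

	before:       ctrl A: 10 20 20 20
	              ctrl B: 10 20 20 20
	after pass 1: ctrl A: 10 20 20 20
	              ctrl B: 20 10 20 20
	after pass 2: ctrl A: 10 20 10 20
	              ctrl B: 20 10 20 10

so each node ends up with exactly one 'local' (primary) controller.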
Signed-off-by: Hannes Reinecke <hare@suse.com>
---
drivers/nvme/host/multipath.c | 111 ++++++++++++++++++++++++++++++++++++++++++
1 file changed, 111 insertions(+)
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index a589a1a7b6ce..9e4183401539 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -262,6 +262,115 @@ static void nvme_requeue_work(struct work_struct *work)
}
}
+void nvme_mpath_distribute_paths(struct nvme_subsystem *subsys, int num_ctrls,
+ struct nvme_ctrl *ctrl, int node_id)
+{
+ int node;
+ int found_node = NUMA_NO_NODE;
+ int max = LOCAL_DISTANCE * num_ctrls;
+
+ for_each_node(node) {
+ struct nvme_ctrl *c;
+ int sum = 0;
+
+ list_for_each_entry(c, &subsys->ctrls, subsys_entry)
+ sum += c->node_map[node];
+ if (sum > max) {
+ max = sum;
+ found_node = node;
+ }
+ }
+ if (found_node != NUMA_NO_NODE) {
+ ctrl->node_map[found_node] = LOCAL_DISTANCE;
+ ctrl->node_map[node_id] = REMOTE_DISTANCE;
+ }
+}
+
+void nvme_mpath_balance_node(struct nvme_subsystem *subsys,
+ int num_ctrls, int node_id)
+{
+ struct nvme_ctrl *found = NULL, *ctrl;
+ int max = LOCAL_DISTANCE * num_ctrls, node;
+
+ list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
+ int sum = 0;
+
+ for_each_node(node)
+ sum += ctrl->node_map[node];
+ if (sum > max) {
+ max = sum;
+ found = ctrl;
+ }
+ }
+ if (found)
+ found->node_map[node_id] = LOCAL_DISTANCE;
+}
+
+void nvme_mpath_balance_subsys(struct nvme_subsystem *subsys)
+{
+ struct nvme_ctrl *ctrl;
+ int num_ctrls = 0;
+ int node;
+
+ mutex_lock(&subsys->lock);
+
+ /*
+ * 1. Reset the NUMA distance:
+ * During creation the NUMA distance is only set
+ * per controller, so after connecting the other
+ * controllers the NUMA information on the existing
+ * ones is incorrect.
+ */
+ list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
+ for_each_node(node)
+ ctrl->node_map[node] =
+ node_distance(node, ctrl->node_id);
+ num_ctrls++;
+ }
+
+ /*
+ * 2. Distribute optimal paths:
+ * Only one primary path per node.
+ * Additional primary paths are moved to unassigned nodes.
+ */
+ for_each_node(node) {
+ bool optimal = false;
+
+ list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
+ if (ctrl->node_map[node] == LOCAL_DISTANCE) {
+ if (!optimal) {
+ optimal = true;
+ continue;
+ }
+ nvme_mpath_distribute_paths(subsys, num_ctrls,
+ ctrl, node);
+ }
+ }
+ }
+
+ /*
+ * 3. Balance unassigned nodes:
+ * Each unassigned node should have one primary path;
+ * the primary path is assigned to the ctrl with the
+ * minimal weight (i.e. the sum of distances over all nodes)
+ */
+ for_each_node(node) {
+ bool optimal = false;
+
+ list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
+ if (ctrl->node_map[node] == LOCAL_DISTANCE) {
+ optimal = true;
+ break;
+ }
+ }
+ if (optimal)
+ continue;
+ nvme_mpath_balance_node(subsys, num_ctrls, node);
+ }
+
+ mutex_unlock(&subsys->lock);
+}
+
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
{
struct request_queue *q;
@@ -553,6 +662,8 @@ int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
{
int error;
+ nvme_mpath_balance_subsys(ctrl->subsys);
+
if (!nvme_ctrl_use_ana(ctrl))
return 0;
--
2.16.4
* [PATCH 1/3] nvme: NUMA locality information for fabrics
From: Sagi Grimberg @ 2018-10-30 18:35 UTC
> static ssize_t nvme_sysfs_delete(struct device *dev,
> struct device_attribute *attr, const char *buf,
> @@ -2845,6 +2846,7 @@ static struct attribute *nvme_dev_attrs[] = {
> &dev_attr_subsysnqn.attr,
> &dev_attr_address.attr,
> &dev_attr_state.attr,
> + &dev_attr_node_id.attr,
Maybe call it numa_node to align with other devices' sysfs attributes?
Otherwise looks fine,
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
* [PATCH 2/3] nvme-multipath: Select paths based on NUMA locality
From: Sagi Grimberg @ 2018-10-30 18:39 UTC
> @@ -163,8 +164,11 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
>
> if (!found)
> found = fallback;
> - if (found)
> + if (found) {
> + dev_dbg(disk_to_dev(head->disk), "none: node %d path %s\n",
> + node, found->disk->disk_name);
none?
> rcu_assign_pointer(head->current_path[node], found);
> + }
> return found;
> }
>
> @@ -528,6 +532,7 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id)
> nvme_mpath_set_live(ns);
> mutex_unlock(&ns->head->lock);
> }
> + nvme_mpath_clear_current_path(ns);
Didn't I see this in a different patch from Keith?
Please document why this is added.
* [PATCH 1/3] nvme: NUMA locality information for fabrics
From: Hannes Reinecke @ 2018-11-08 9:35 UTC
On 11/8/18 10:22 AM, Christoph Hellwig wrote:
> This patch looks good, but the description seems extremely misleading.
>
> I've rewritten it as below for inclusion in nvme-4.21, let me know
> what you think:
>
> ---
> From 01d2c22c9d6cd2f0cb2ac2ba5cf98c2ac2d8624e Mon Sep 17 00:00:00 2001
> From: Hannes Reinecke <hare@suse.com>
> Date: Fri, 2 Nov 2018 10:56:39 +0100
> Subject: nvme: add a numa_node field to struct nvme_ctrl
>
> Instead of directly poking into the struct device add a new numa_node
> field to struct nvme_ctrl. This allows fabrics drivers where ctrl->dev
> is a virtual device to support NUMA affinity as well.
>
> Also expose the field as a sysfs attribute, and populate it for the
> RDMA and FC transports.
>
> Signed-off-by: Hannes Reinecke <hare@suse.com>
> Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
> drivers/nvme/host/core.c | 4 +++-
> drivers/nvme/host/fc.c | 5 +++--
> drivers/nvme/host/multipath.c | 4 ++--
> drivers/nvme/host/nvme.h | 1 +
> drivers/nvme/host/rdma.c | 5 +++--
> 5 files changed, 12 insertions(+), 7 deletions(-)
Works for me.
Cheers,
Hannes
* [PATCH 1/3] nvme: NUMA locality information for fabrics
From: Christoph Hellwig @ 2018-11-08 9:22 UTC
This patch looks good, but the description seems extremely misleading.
I've rewritten it as below for inclusion in nvme-4.21, let me know
what you think:
---
From 01d2c22c9d6cd2f0cb2ac2ba5cf98c2ac2d8624e Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.com>
Date: Fri, 2 Nov 2018 10:56:39 +0100
Subject: nvme: add a numa_node field to struct nvme_ctrl
Instead of directly poking into the struct device add a new numa_node
field to struct nvme_ctrl. This allows fabrics drivers where ctrl->dev
is a virtual device to support NUMA affinity as well.
Also expose the field as a sysfs attribute, and populate it for the
RDMA and FC transports.
Signed-off-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
drivers/nvme/host/core.c | 4 +++-
drivers/nvme/host/fc.c | 5 +++--
drivers/nvme/host/multipath.c | 4 ++--
drivers/nvme/host/nvme.h | 1 +
drivers/nvme/host/rdma.c | 5 +++--
5 files changed, 12 insertions(+), 7 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 41e00404f7a5..80b5a8c60b91 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2764,6 +2764,7 @@ static ssize_t field##_show(struct device *dev, \
static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
nvme_show_int_function(cntlid);
+nvme_show_int_function(numa_node);
static ssize_t nvme_sysfs_delete(struct device *dev,
struct device_attribute *attr, const char *buf,
@@ -2843,6 +2844,7 @@ static struct attribute *nvme_dev_attrs[] = {
&dev_attr_subsysnqn.attr,
&dev_attr_address.attr,
&dev_attr_state.attr,
+ &dev_attr_numa_node.attr,
NULL
};
@@ -3053,7 +3055,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
struct gendisk *disk;
struct nvme_id_ns *id;
char disk_name[DISK_NAME_LEN];
- int node = dev_to_node(ctrl->dev), flags = GENHD_FL_EXT_DEVT;
+ int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT;
ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
if (!ns)
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 0b70c8bab045..a22ff6fb82bc 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2433,7 +2433,7 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl)
ctrl->tag_set.ops = &nvme_fc_mq_ops;
ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size;
ctrl->tag_set.reserved_tags = 1; /* fabric connect */
- ctrl->tag_set.numa_node = NUMA_NO_NODE;
+ ctrl->tag_set.numa_node = ctrl->ctrl.numa_node;
ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
ctrl->tag_set.cmd_size =
struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv,
@@ -3000,6 +3000,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
ctrl->ctrl.opts = opts;
ctrl->ctrl.nr_reconnects = 0;
+ ctrl->ctrl.numa_node = dev_to_node(lport->dev);
INIT_LIST_HEAD(&ctrl->ctrl_list);
ctrl->lport = lport;
ctrl->rport = rport;
@@ -3038,7 +3039,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
ctrl->admin_tag_set.ops = &nvme_fc_admin_mq_ops;
ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
ctrl->admin_tag_set.reserved_tags = 2; /* fabric connect + Keep-Alive */
- ctrl->admin_tag_set.numa_node = NUMA_NO_NODE;
+ ctrl->admin_tag_set.numa_node = ctrl->ctrl.numa_node;
ctrl->admin_tag_set.cmd_size =
struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv,
ctrl->lport->ops->fcprqst_priv_sz);
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 5e3cc8c59a39..8e03cda770c5 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -141,7 +141,7 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
test_bit(NVME_NS_ANA_PENDING, &ns->flags))
continue;
- distance = node_distance(node, dev_to_node(ns->ctrl->dev));
+ distance = node_distance(node, ns->ctrl->numa_node);
switch (ns->ana_state) {
case NVME_ANA_OPTIMIZED:
@@ -276,7 +276,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath)
return 0;
- q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, NULL);
+ q = blk_alloc_queue_node(GFP_KERNEL, ctrl->numa_node, NULL);
if (!q)
goto out;
q->queuedata = head;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index cee79cb388af..f608fc11d329 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -153,6 +153,7 @@ struct nvme_ctrl {
struct request_queue *connect_q;
struct device *dev;
int instance;
+ int numa_node;
struct blk_mq_tag_set *tagset;
struct blk_mq_tag_set *admin_tagset;
struct list_head namespaces;
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index d181cafedc58..4468d672ced9 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -693,7 +693,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
set->ops = &nvme_rdma_admin_mq_ops;
set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
set->reserved_tags = 2; /* connect + keep-alive */
- set->numa_node = NUMA_NO_NODE;
+ set->numa_node = nctrl->numa_node;
set->cmd_size = sizeof(struct nvme_rdma_request) +
SG_CHUNK_SIZE * sizeof(struct scatterlist);
set->driver_data = ctrl;
@@ -706,7 +706,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
set->ops = &nvme_rdma_mq_ops;
set->queue_depth = nctrl->sqsize + 1;
set->reserved_tags = 1; /* fabric connect */
- set->numa_node = NUMA_NO_NODE;
+ set->numa_node = nctrl->numa_node;
set->flags = BLK_MQ_F_SHOULD_MERGE;
set->cmd_size = sizeof(struct nvme_rdma_request) +
SG_CHUNK_SIZE * sizeof(struct scatterlist);
@@ -762,6 +762,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
return error;
ctrl->device = ctrl->queues[0].device;
+ ctrl->ctrl.numa_node = dev_to_node(ctrl->device->dev->dma_device);
ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev);
--
2.19.1
* [PATCH 1/3] nvme: NUMA locality information for fabrics
From: Hannes Reinecke @ 2018-11-02 9:56 UTC
From: Hannes Reinecke <hare@suse.com>
Add a new field 'numa_node' to the nvme_ctrl structure to hold the
NUMA locality information of the underlying hardware.
With that we can allocate the memory structures on the same NUMA
node as the underlying hardware.
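The field is also exposed as a sysfs attribute; for illustration (the
exact value depends on the system), a controller attached to node 1
would show:

	# cat /sys/class/nvme/nvme0/numa_node
	1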
Signed-off-by: Hannes Reinecke <hare@suse.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
---
drivers/nvme/host/core.c | 4 +++-
drivers/nvme/host/fc.c | 5 +++--
drivers/nvme/host/multipath.c | 4 ++--
drivers/nvme/host/nvme.h | 1 +
drivers/nvme/host/rdma.c | 5 +++--
5 files changed, 12 insertions(+), 7 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 2e65be8b1387..6dfcb72aa907 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2766,6 +2766,7 @@ static ssize_t field##_show(struct device *dev, \
static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
nvme_show_int_function(cntlid);
+nvme_show_int_function(numa_node);
static ssize_t nvme_sysfs_delete(struct device *dev,
struct device_attribute *attr, const char *buf,
@@ -2845,6 +2846,7 @@ static struct attribute *nvme_dev_attrs[] = {
&dev_attr_subsysnqn.attr,
&dev_attr_address.attr,
&dev_attr_state.attr,
+ &dev_attr_numa_node.attr,
NULL
};
@@ -3055,7 +3057,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
struct gendisk *disk;
struct nvme_id_ns *id;
char disk_name[DISK_NAME_LEN];
- int node = dev_to_node(ctrl->dev), flags = GENHD_FL_EXT_DEVT;
+ int node = ctrl->numa_node, flags = GENHD_FL_EXT_DEVT;
ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
if (!ns)
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 0b70c8bab045..a22ff6fb82bc 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2433,7 +2433,7 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl)
ctrl->tag_set.ops = &nvme_fc_mq_ops;
ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size;
ctrl->tag_set.reserved_tags = 1; /* fabric connect */
- ctrl->tag_set.numa_node = NUMA_NO_NODE;
+ ctrl->tag_set.numa_node = ctrl->ctrl.numa_node;
ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
ctrl->tag_set.cmd_size =
struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv,
@@ -3000,6 +3000,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
ctrl->ctrl.opts = opts;
ctrl->ctrl.nr_reconnects = 0;
+ ctrl->ctrl.numa_node = dev_to_node(lport->dev);
INIT_LIST_HEAD(&ctrl->ctrl_list);
ctrl->lport = lport;
ctrl->rport = rport;
@@ -3038,7 +3039,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
ctrl->admin_tag_set.ops = &nvme_fc_admin_mq_ops;
ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
ctrl->admin_tag_set.reserved_tags = 2; /* fabric connect + Keep-Alive */
- ctrl->admin_tag_set.numa_node = NUMA_NO_NODE;
+ ctrl->admin_tag_set.numa_node = ctrl->ctrl.numa_node;
ctrl->admin_tag_set.cmd_size =
struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv,
ctrl->lport->ops->fcprqst_priv_sz);
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 5e3cc8c59a39..8e03cda770c5 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -141,7 +141,7 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
test_bit(NVME_NS_ANA_PENDING, &ns->flags))
continue;
- distance = node_distance(node, dev_to_node(ns->ctrl->dev));
+ distance = node_distance(node, ns->ctrl->numa_node);
switch (ns->ana_state) {
case NVME_ANA_OPTIMIZED:
@@ -276,7 +276,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath)
return 0;
- q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, NULL);
+ q = blk_alloc_queue_node(GFP_KERNEL, ctrl->numa_node, NULL);
if (!q)
goto out;
q->queuedata = head;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index cee79cb388af..f608fc11d329 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -153,6 +153,7 @@ struct nvme_ctrl {
struct request_queue *connect_q;
struct device *dev;
int instance;
+ int numa_node;
struct blk_mq_tag_set *tagset;
struct blk_mq_tag_set *admin_tagset;
struct list_head namespaces;
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index d181cafedc58..4468d672ced9 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -693,7 +693,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
set->ops = &nvme_rdma_admin_mq_ops;
set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
set->reserved_tags = 2; /* connect + keep-alive */
- set->numa_node = NUMA_NO_NODE;
+ set->numa_node = nctrl->numa_node;
set->cmd_size = sizeof(struct nvme_rdma_request) +
SG_CHUNK_SIZE * sizeof(struct scatterlist);
set->driver_data = ctrl;
@@ -706,7 +706,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
set->ops = &nvme_rdma_mq_ops;
set->queue_depth = nctrl->sqsize + 1;
set->reserved_tags = 1; /* fabric connect */
- set->numa_node = NUMA_NO_NODE;
+ set->numa_node = nctrl->numa_node;
set->flags = BLK_MQ_F_SHOULD_MERGE;
set->cmd_size = sizeof(struct nvme_rdma_request) +
SG_CHUNK_SIZE * sizeof(struct scatterlist);
@@ -762,6 +762,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
return error;
ctrl->device = ctrl->queues[0].device;
+ ctrl->ctrl.numa_node = dev_to_node(ctrl->device->dev->dma_device);
ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev);
--
2.16.4
Thread overview: 9+ messages
2018-10-26 12:57 [PATCHv2 0/3] nvme: NUMA locality for fabrics Hannes Reinecke
2018-10-26 12:57 ` [PATCH 1/3] nvme: NUMA locality information " Hannes Reinecke
2018-10-30 18:35 ` Sagi Grimberg
2018-10-26 12:57 ` [PATCH 2/3] nvme-multipath: Select paths based on NUMA locality Hannes Reinecke
2018-10-30 18:39 ` Sagi Grimberg
2018-10-26 12:57 ` [PATCH 3/3] nvme-multipath: automatic NUMA path balancing Hannes Reinecke
2018-11-02 9:56 [PATCHv3 0/3] nvme: NUMA locality for fabrics Hannes Reinecke
2018-11-02 9:56 ` [PATCH 1/3] nvme: NUMA locality information " Hannes Reinecke
2018-11-08 9:22 ` Christoph Hellwig
2018-11-08 9:35 ` Hannes Reinecke