* [PATCH 0/2] nvme: NUMA locality for fabrics
@ 2018-10-05  9:29 Hannes Reinecke
  2018-10-05  9:29 ` [PATCH 1/2] nvme: NUMA locality information " Hannes Reinecke
  2018-10-05  9:29 ` [PATCH 2/2] nvme-multipath: manual NUMA configuration Hannes Reinecke
  0 siblings, 2 replies; 14+ messages in thread
From: Hannes Reinecke @ 2018-10-05  9:29 UTC (permalink / raw)


Hi all,

[Resend, as I forgot to include the nvme ml]

here's a patchset to leverage NUMA locality information for fabric controllers.
As there are systems for which no 'best' configuration exists (e.g. a symmetric
NUMA machine with 4 nodes and 2 HBAs), it also exports the node mapping to
sysfs so that the admin can tweak it to achieve the best performance.

With this patchset I'm seeing a performance increase from
262k IOPS to 344k IOPS, measured against a NetApp AF700.

As usual, comments and reviews are welcome.

Hannes Reinecke (2):
  nvme: NUMA locality information for fabrics
  nvme-multipath: manual NUMA configuration

 drivers/nvme/host/core.c      | 68 +++++++++++++++++++++++++++++++++++++++++--
 drivers/nvme/host/fc.c        |  5 ++--
 drivers/nvme/host/multipath.c | 11 +++++--
 drivers/nvme/host/nvme.h      |  3 ++
 drivers/nvme/host/pci.c       |  2 +-
 drivers/nvme/host/rdma.c      |  6 ++--
 6 files changed, 85 insertions(+), 10 deletions(-)

-- 
2.16.4


* [PATCH 1/2] nvme: NUMA locality information for fabrics
  2018-10-05  9:29 [PATCH 0/2] nvme: NUMA locality for fabrics Hannes Reinecke
@ 2018-10-05  9:29 ` Hannes Reinecke
  2018-10-08 10:04   ` Christoph Hellwig
  2018-10-05  9:29 ` [PATCH 2/2] nvme-multipath: manual NUMA configuration Hannes Reinecke
  1 sibling, 1 reply; 14+ messages in thread
From: Hannes Reinecke @ 2018-10-05  9:29 UTC (permalink / raw)


From: Hannes Reinecke <hare@suse.com>

Add a new field 'node_id' to the nvme_ctrl structure to hold the
NUMA locality information of the underlying hardware.
With that we can allocate the memory structures on the same NUMA
node as the hardware itself.

Signed-off-by: Hannes Reinecke <hare@suse.com>
---
 drivers/nvme/host/core.c      | 4 +++-
 drivers/nvme/host/fc.c        | 5 +++--
 drivers/nvme/host/multipath.c | 4 ++--
 drivers/nvme/host/nvme.h      | 1 +
 drivers/nvme/host/pci.c       | 1 +
 drivers/nvme/host/rdma.c      | 5 +++--
 6 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 2db33a752e2b..21c1d317415a 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2766,6 +2766,7 @@ static ssize_t  field##_show(struct device *dev,				\
 static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL);
 
 nvme_show_int_function(cntlid);
+nvme_show_int_function(node_id);
 
 static ssize_t nvme_sysfs_delete(struct device *dev,
 				struct device_attribute *attr, const char *buf,
@@ -2845,6 +2846,7 @@ static struct attribute *nvme_dev_attrs[] = {
 	&dev_attr_subsysnqn.attr,
 	&dev_attr_address.attr,
 	&dev_attr_state.attr,
+	&dev_attr_node_id.attr,
 	NULL
 };
 
@@ -3055,7 +3057,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 	struct gendisk *disk;
 	struct nvme_id_ns *id;
 	char disk_name[DISK_NAME_LEN];
-	int node = dev_to_node(ctrl->dev), flags = GENHD_FL_EXT_DEVT;
+	int node = ctrl->node_id, flags = GENHD_FL_EXT_DEVT;
 
 	ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
 	if (!ns)
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 9d201b35397d..7f246dd04bc5 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2422,7 +2422,7 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl)
 	ctrl->tag_set.ops = &nvme_fc_mq_ops;
 	ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size;
 	ctrl->tag_set.reserved_tags = 1; /* fabric connect */
-	ctrl->tag_set.numa_node = NUMA_NO_NODE;
+	ctrl->tag_set.numa_node = ctrl->ctrl.node_id;
 	ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
 	ctrl->tag_set.cmd_size = sizeof(struct nvme_fc_fcp_op) +
 					(SG_CHUNK_SIZE *
@@ -2990,6 +2990,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
 
 	ctrl->ctrl.opts = opts;
 	ctrl->ctrl.nr_reconnects = 0;
+	ctrl->ctrl.node_id = dev_to_node(lport->dev);
 	INIT_LIST_HEAD(&ctrl->ctrl_list);
 	ctrl->lport = lport;
 	ctrl->rport = rport;
@@ -3028,7 +3029,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
 	ctrl->admin_tag_set.ops = &nvme_fc_admin_mq_ops;
 	ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
 	ctrl->admin_tag_set.reserved_tags = 2; /* fabric connect + Keep-Alive */
-	ctrl->admin_tag_set.numa_node = NUMA_NO_NODE;
+	ctrl->admin_tag_set.numa_node = ctrl->ctrl.node_id;
 	ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_fc_fcp_op) +
 					(SG_CHUNK_SIZE *
 						sizeof(struct scatterlist)) +
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 52987052b7fc..8d456c6f0e02 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -141,7 +141,7 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
 		    test_bit(NVME_NS_ANA_PENDING, &ns->flags))
 			continue;
 
-		distance = node_distance(node, dev_to_node(ns->ctrl->dev));
+		distance = node_distance(node, ns->ctrl->node_id);
 
 		switch (ns->ana_state) {
 		case NVME_ANA_OPTIMIZED:
@@ -276,7 +276,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
 	if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath)
 		return 0;
 
-	q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, NULL);
+	q = blk_alloc_queue_node(GFP_KERNEL, ctrl->node_id, NULL);
 	if (!q)
 		goto out;
 	q->queuedata = head;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 9fefba039d1e..55347a547d84 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -153,6 +153,7 @@ struct nvme_ctrl {
 	struct request_queue *connect_q;
 	struct device *dev;
 	int instance;
+	int node_id;
 	struct blk_mq_tag_set *tagset;
 	struct blk_mq_tag_set *admin_tagset;
 	struct list_head namespaces;
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index d668682f91df..b5d37aacf212 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2517,6 +2517,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 
 	dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
 
+	dev->ctrl.node_id = node;
 	nvme_get_ctrl(&dev->ctrl);
 	async_schedule(nvme_async_probe, dev);
 
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index dc042017c293..707e158561ba 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -686,7 +686,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
 		set->ops = &nvme_rdma_admin_mq_ops;
 		set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
 		set->reserved_tags = 2; /* connect + keep-alive */
-		set->numa_node = NUMA_NO_NODE;
+		set->numa_node = nctrl->node_id;
 		set->cmd_size = sizeof(struct nvme_rdma_request) +
 			SG_CHUNK_SIZE * sizeof(struct scatterlist);
 		set->driver_data = ctrl;
@@ -699,7 +699,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
 		set->ops = &nvme_rdma_mq_ops;
 		set->queue_depth = nctrl->sqsize + 1;
 		set->reserved_tags = 1; /* fabric connect */
-		set->numa_node = NUMA_NO_NODE;
+		set->numa_node = nctrl->node_id;
 		set->flags = BLK_MQ_F_SHOULD_MERGE;
 		set->cmd_size = sizeof(struct nvme_rdma_request) +
 			SG_CHUNK_SIZE * sizeof(struct scatterlist);
@@ -755,6 +755,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
 		return error;
 
 	ctrl->device = ctrl->queues[0].device;
+	ctrl->ctrl.node_id = dev_to_node(ctrl->device->dev->dma_device);
 
 	ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev);
 
-- 
2.16.4


* [PATCH 2/2] nvme-multipath: manual NUMA configuration
  2018-10-05  9:29 [PATCH 0/2] nvme: NUMA locality for fabrics Hannes Reinecke
  2018-10-05  9:29 ` [PATCH 1/2] nvme: NUMA locality information " Hannes Reinecke
@ 2018-10-05  9:29 ` Hannes Reinecke
  2018-10-08 10:05   ` Christoph Hellwig
  1 sibling, 1 reply; 14+ messages in thread
From: Hannes Reinecke @ 2018-10-05  9:29 UTC (permalink / raw)


Not every NUMA configuration allows for an automatic setup based on
the distance between nodes, and the admin might have specific ideas
on how things should be set up.
This patch adds a new sysfs attribute 'node_map' to allow specifying
the NUMA mapping for each controller.
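
For illustration only (the values below are hypothetical; the defaults
are simply taken from node_distance(), as set up in the core.c hunk
below): the attribute takes one integer per possible node, in node
order, separated by spaces, and the path selector prefers the
controller with the smallest value for the node the I/O originates on.

	/*
	 * Hypothetical example for a 4-node machine with two fabric
	 * controllers; lower value == preferred.  Writing "10 10 40 40"
	 * to nvme0's node_map and "40 40 10 10" to nvme1's would steer
	 * nodes 0 and 1 to nvme0, and nodes 2 and 3 to nvme1.
	 */
	int nvme0_node_map[4] = { 10, 10, 40, 40 };
	int nvme1_node_map[4] = { 40, 40, 10, 10 };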

Signed-off-by: Hannes Reinecke <hare@suse.com>
---
 drivers/nvme/host/core.c      | 64 ++++++++++++++++++++++++++++++++++++++++++-
 drivers/nvme/host/fc.c        |  2 +-
 drivers/nvme/host/multipath.c |  9 ++++--
 drivers/nvme/host/nvme.h      |  2 ++
 drivers/nvme/host/pci.c       |  3 +-
 drivers/nvme/host/rdma.c      |  3 +-
 6 files changed, 76 insertions(+), 7 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 21c1d317415a..57271c0a765f 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2204,6 +2204,21 @@ static int nvme_active_ctrls(struct nvme_subsystem *subsys)
 	return count;
 }
 
+void nvme_set_ctrl_node(struct nvme_ctrl *ctrl, int node_id)
+{
+	int node;
+
+	ctrl->node_id = node_id;
+	if (node_id == NUMA_NO_NODE)
+		return;
+	ctrl->node_map = kzalloc(num_possible_nodes() * sizeof(int),
+				 GFP_KERNEL);
+	if (ctrl->node_map)
+		for_each_node(node)
+			ctrl->node_map[node] = node_distance(node, node_id);
+}
+EXPORT_SYMBOL_GPL(nvme_set_ctrl_node);
+
 static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
 {
 	struct nvme_subsystem *subsys, *found;
@@ -2834,6 +2849,50 @@ static ssize_t nvme_sysfs_show_address(struct device *dev,
 }
 static DEVICE_ATTR(address, S_IRUGO, nvme_sysfs_show_address, NULL);
 
+static ssize_t nvme_sysfs_store_node_map(struct device *dev,
+					 struct device_attribute *attr,
+					 const char *buf, size_t count)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+	int i = 0, node_id, ret;
+	char *ptr, *node_str, *node_ptr;
+
+	ptr = node_str = kstrdup(buf, GFP_KERNEL);
+	if (!ptr)
+		return -ENOMEM;
+	while (ptr && i < num_possible_nodes()) {
+		printk("%s\n", ptr);
+		node_ptr = strsep(&ptr, " ");
+		ret = kstrtoint(node_ptr, 0, &node_id);
+		if (ret < 0)
+			break;
+		ctrl->node_map[i] = node_id;
+		i++;
+	}
+	kfree(node_str);
+	if (ptr)
+		ret = -EINVAL;
+	return ret < 0 ? ret : count;
+}
+
+static ssize_t nvme_sysfs_show_node_map(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+	int node;
+	ssize_t offset = 0;
+
+	for_each_node(node)
+		offset += snprintf(buf + offset, PAGE_SIZE - offset,
+				   "%d ", ctrl->node_map[node]);
+	offset += snprintf(buf + offset, PAGE_SIZE - offset, "\n");
+
+	return offset;
+}
+static DEVICE_ATTR(node_map, S_IRUGO | S_IWUSR, nvme_sysfs_show_node_map,
+		   nvme_sysfs_store_node_map);
+
 static struct attribute *nvme_dev_attrs[] = {
 	&dev_attr_reset_controller.attr,
 	&dev_attr_rescan_controller.attr,
@@ -2847,6 +2906,7 @@ static struct attribute *nvme_dev_attrs[] = {
 	&dev_attr_address.attr,
 	&dev_attr_state.attr,
 	&dev_attr_node_id.attr,
+	&dev_attr_node_map.attr,
 	NULL
 };
 
@@ -2860,7 +2920,8 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
 		return 0;
 	if (a == &dev_attr_address.attr && !ctrl->ops->get_address)
 		return 0;
-
+	if (a == &dev_attr_node_map.attr && !ctrl->node_map)
+		return 0;
 	return a->mode;
 }
 
@@ -3507,6 +3568,7 @@ static void nvme_free_ctrl(struct device *dev)
 
 	ida_simple_remove(&nvme_instance_ida, ctrl->instance);
 	kfree(ctrl->effects);
+	kfree(ctrl->node_map);
 	nvme_mpath_uninit(ctrl);
 
 	if (subsys) {
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 7f246dd04bc5..9ceea2c91249 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2990,7 +2990,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
 
 	ctrl->ctrl.opts = opts;
 	ctrl->ctrl.nr_reconnects = 0;
-	ctrl->ctrl.node_id = dev_to_node(lport->dev);
+	nvme_set_ctrl_node(&ctrl->ctrl, dev_to_node(lport->dev));
 	INIT_LIST_HEAD(&ctrl->ctrl_list);
 	ctrl->lport = lport;
 	ctrl->rport = rport;
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 8d456c6f0e02..b9c617cbe2c4 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -141,7 +141,8 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
 		    test_bit(NVME_NS_ANA_PENDING, &ns->flags))
 			continue;
 
-		distance = node_distance(node, ns->ctrl->node_id);
+		distance = ns->ctrl->node_map ?
+			ns->ctrl->node_map[node] : INT_MAX;
 
 		switch (ns->ana_state) {
 		case NVME_ANA_OPTIMIZED:
@@ -163,8 +164,11 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
 
 	if (!found)
 		found = fallback;
-	if (found)
+	if (found) {
+		dev_dbg(disk_to_dev(head->disk), "none: node %d path %s\n",
+			node, found->disk->disk_name);
 		rcu_assign_pointer(head->current_path[node], found);
+	}
 	return found;
 }
 
@@ -519,6 +523,7 @@ void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id)
 		nvme_mpath_set_live(ns);
 		mutex_unlock(&ns->head->lock);
 	}
+	nvme_mpath_clear_current_path(ns);
 }
 
 void nvme_mpath_remove_disk(struct nvme_ns_head *head)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 55347a547d84..0027f5d41ff8 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -154,6 +154,7 @@ struct nvme_ctrl {
 	struct device *dev;
 	int instance;
 	int node_id;
+	int *node_map;
 	struct blk_mq_tag_set *tagset;
 	struct blk_mq_tag_set *admin_tagset;
 	struct list_head namespaces;
@@ -437,6 +438,7 @@ void nvme_unfreeze(struct nvme_ctrl *ctrl);
 void nvme_wait_freeze(struct nvme_ctrl *ctrl);
 void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout);
 void nvme_start_freeze(struct nvme_ctrl *ctrl);
+void nvme_set_ctrl_node(struct nvme_ctrl *ctrl, int node);
 
 #define NVME_QID_ANY -1
 struct request *nvme_alloc_request(struct request_queue *q,
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index b5d37aacf212..3df9d380e59b 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2509,7 +2509,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 		result = -ENOMEM;
 		goto release_pools;
 	}
-
+	nvme_set_ctrl_node(&dev->ctrl, node);
 	result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
 			quirks);
 	if (result)
@@ -2517,7 +2517,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 
 	dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
 
-	dev->ctrl.node_id = node;
 	nvme_get_ctrl(&dev->ctrl);
 	async_schedule(nvme_async_probe, dev);
 
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 707e158561ba..70375acff812 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -755,7 +755,8 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
 		return error;
 
 	ctrl->device = ctrl->queues[0].device;
-	ctrl->ctrl.node_id = dev_to_node(ctrl->device->dev->dma_device);
+	nvme_set_ctrl_node(&ctrl->ctrl,
+			   dev_to_node(ctrl->device->dev->dma_device));
 
 	ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev);
 
-- 
2.16.4


* [PATCH 1/2] nvme: NUMA locality information for fabrics
  2018-10-05  9:29 ` [PATCH 1/2] nvme: NUMA locality information " Hannes Reinecke
@ 2018-10-08 10:04   ` Christoph Hellwig
  2018-10-08 10:22     ` Matias Bjørling
  2018-10-08 10:24     ` Hannes Reinecke
  0 siblings, 2 replies; 14+ messages in thread
From: Christoph Hellwig @ 2018-10-08 10:04 UTC (permalink / raw)


> @@ -3055,7 +3057,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
>  	struct gendisk *disk;
>  	struct nvme_id_ns *id;
>  	char disk_name[DISK_NAME_LEN];
> -	int node = dev_to_node(ctrl->dev), flags = GENHD_FL_EXT_DEVT;
> +	int node = ctrl->node_id, flags = GENHD_FL_EXT_DEVT;

I think we can just kill the local node variable now.

> @@ -276,7 +276,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
>  	if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath)
>  		return 0;
>  
> -	q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, NULL);
> +	q = blk_alloc_queue_node(GFP_KERNEL, ctrl->node_id, NULL);

This looks a little odd.  We create the mpath disk here, and just
need to pass in the ctrl to do some initialization.  Why would we
allocate the multipath node on the node of the controller?

If there is a good reason it needs to go into a comment, if not
we should drop this hunk.

> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index d668682f91df..b5d37aacf212 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -2517,6 +2517,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>  
>  	dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
>  
> +	dev->ctrl.node_id = node;
>  	nvme_get_ctrl(&dev->ctrl);
>  	async_schedule(nvme_async_probe, dev);

This changes behavior as node might remain NUMA_NO_NODE when the code
above sets the device node to first_memory_node now when
it would otherwise be NUMA_NO_NODE.  Then again that code makes no
sense to me to start with - Matias added it with the blk-mq conversion,
so maybe he remembers why we do it to start with?  If we want to fix
up nodes like that it seems like we should do it in the driver core
or the PCI core.
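
For reference, the probe-time fixup being discussed looks roughly like
this (paraphrased from nvme_probe(), so treat it as a sketch rather
than a verbatim quote of the tree under review):

	node = dev_to_node(&pdev->dev);
	if (node == NUMA_NO_NODE)
		set_dev_node(&pdev->dev, first_memory_node);

	dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
	/* the local 'node' can still be NUMA_NO_NODE at this point */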


* [PATCH 2/2] nvme-multipath: manual NUMA configuration
  2018-10-05  9:29 ` [PATCH 2/2] nvme-multipath: manual NUMA configuration Hannes Reinecke
@ 2018-10-08 10:05   ` Christoph Hellwig
  2018-10-08 10:19     ` Hannes Reinecke
  0 siblings, 1 reply; 14+ messages in thread
From: Christoph Hellwig @ 2018-10-08 10:05 UTC (permalink / raw)


On Fri, Oct 05, 2018 at 11:29:15AM +0200, Hannes Reinecke wrote:
> Not every NUMA configuration allows for an automatic setup based on
> the distance between nodes, and the admin might have specific ideas
> on how things should be set up.
> This patch adds a new sysfs attribute 'node_map' to allow specifying
> the NUMA mapping for each controller.

This is a rather vague rationale for a lot of annoying boilerplate
code.  I'd like to see better arguments for even looking into adding
magic knobs.


* [PATCH 2/2] nvme-multipath: manual NUMA configuration
  2018-10-08 10:05   ` Christoph Hellwig
@ 2018-10-08 10:19     ` Hannes Reinecke
  0 siblings, 0 replies; 14+ messages in thread
From: Hannes Reinecke @ 2018-10-08 10:19 UTC (permalink / raw)


On 10/8/18 12:05 PM, Christoph Hellwig wrote:
> On Fri, Oct 05, 2018 at 11:29:15AM +0200, Hannes Reinecke wrote:
>> Not every NUMA configuration allows for an automatic setup based on
>> the distance between nodes, and the admin might have specific ideas
>> on how things should be set up.
>> This patch adds a new sysfs attribute 'node_map' to allow specifying
>> the NUMA mapping for each controller.
> 
> This is a rather vague rationale for a lot of annoying boilerplate
> code.  I'd like to see better arguments for even looking into adding
> magic knobs.
> 
Two reasons:
- With setting the correct affinity I'm getting a 25% performance boost
   (mainly as we can now utilize both paths properly)
- I haven't found a way to distribute the NUMA nodes optimally,
  given the various possible HW combinations.

It's pretty trivial if you have only two NUMA nodes and two PCI
devices/functions to distribute, but it becomes less clear for e.g. 4 NUMA
nodes or two PCI functions on the same device (a dual-port FC HBA, say).

I have a system with this NUMA distance table:

10 21 21 21
21 10 21 21
21 21 10 21
21 21 21 10

and two (FC) HBAs on node 0 and 3.
So your algorithm gives me this distribution:

hba0: node 0 1 2
hba1: node 3

which clearly is suboptimal. We surely could require a PCI layout
where the first HBA has to be on node 0 and the second on node 2, but
that really is a bit daft and just papers over the fact that
our algorithm sucks.
And given that the two HBAs can be placed on virtually any node, I found
it too time-consuming to come up with an algorithm that fits
all needs.
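
To make the distribution above concrete, here is a minimal userspace
sketch of the nearest-node pick (illustrative only, not kernel code):

	#include <stdio.h>

	/* the node_distance() table quoted above */
	static const int dist[4][4] = {
		{ 10, 21, 21, 21 },
		{ 21, 10, 21, 21 },
		{ 21, 21, 10, 21 },
		{ 21, 21, 21, 10 },
	};

	int main(void)
	{
		const int hba_node[2] = { 0, 3 };	/* HBA locality */

		for (int node = 0; node < 4; node++) {
			int best = 0;

			/*
			 * Pick the HBA with the smallest distance; ties
			 * stay with the first HBA, which is why nodes 1
			 * and 2 both end up on hba0.
			 */
			for (int hba = 1; hba < 2; hba++)
				if (dist[node][hba_node[hba]] <
				    dist[node][hba_node[best]])
					best = hba;
			printf("node %d -> hba%d\n", node, best);
		}
		return 0;
	}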
And that is the simple case; if one has a dual-ported HBA the NUMA
locality for both paths will be identical, but we will _still_ get a
performance boost just by being able to utilize both paths.

Additionally, with this patch we do provide static load-balancing 
without the need to add yet another framework.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		   Teamlead Storage & Networking
hare@suse.de			               +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)


* [PATCH 1/2] nvme: NUMA locality information for fabrics
  2018-10-08 10:04   ` Christoph Hellwig
@ 2018-10-08 10:22     ` Matias Bjørling
  2018-10-08 10:27       ` Hannes Reinecke
  2018-10-09  6:13       ` Christoph Hellwig
  2018-10-08 10:24     ` Hannes Reinecke
  1 sibling, 2 replies; 14+ messages in thread
From: Matias Bjørling @ 2018-10-08 10:22 UTC (permalink / raw)




On 10/08/2018 12:04 PM, Christoph Hellwig wrote:
>> @@ -3055,7 +3057,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
>>   	struct gendisk *disk;
>>   	struct nvme_id_ns *id;
>>   	char disk_name[DISK_NAME_LEN];
>> -	int node = dev_to_node(ctrl->dev), flags = GENHD_FL_EXT_DEVT;
>> +	int node = ctrl->node_id, flags = GENHD_FL_EXT_DEVT;
> 
> I think we can just kill the local node variable now.
> 
>> @@ -276,7 +276,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
>>   	if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath)
>>   		return 0;
>>   
>> -	q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, NULL);
>> +	q = blk_alloc_queue_node(GFP_KERNEL, ctrl->node_id, NULL);
> 
> This looks a little odd.  We create the mpath disk here, and just
> need to pass in the ctrl to do some initialization.  Why would we
> allocate the multipath node on the node of the controller?
> 
> If there is a good reason it needs to go into a comment, if not
> we should drop this hunk.
> 
>> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
>> index d668682f91df..b5d37aacf212 100644
>> --- a/drivers/nvme/host/pci.c
>> +++ b/drivers/nvme/host/pci.c
>> @@ -2517,6 +2517,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>>   
>>   	dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
>>   
>> +	dev->ctrl.node_id = node;
>>   	nvme_get_ctrl(&dev->ctrl);
>>   	async_schedule(nvme_async_probe, dev);
> 
> This changes behavior as node might remain NUMA_NO_NODE when the code
> above sets the device node to first_memory_node now when
> it would otherwise be NUMA_NO_NODE.  Then again that code makes no
> sense to me to start with - Matias added it with the blk-mq conversion,
> so maybe he remembers why we do it to start with?  If we want to fix
> up nodes like that it seems like we should do it in the driver core
> or the PCI core.
> 

Back in the day it was carried over to make sure the structures were
allocated from the same node as the PCIe device.


* [PATCH 1/2] nvme: NUMA locality information for fabrics
  2018-10-08 10:04   ` Christoph Hellwig
  2018-10-08 10:22     ` Matias Bjørling
@ 2018-10-08 10:24     ` Hannes Reinecke
  1 sibling, 0 replies; 14+ messages in thread
From: Hannes Reinecke @ 2018-10-08 10:24 UTC (permalink / raw)


On 10/8/18 12:04 PM, Christoph Hellwig wrote:
>> @@ -3055,7 +3057,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
>>   	struct gendisk *disk;
>>   	struct nvme_id_ns *id;
>>   	char disk_name[DISK_NAME_LEN];
>> -	int node = dev_to_node(ctrl->dev), flags = GENHD_FL_EXT_DEVT;
>> +	int node = ctrl->node_id, flags = GENHD_FL_EXT_DEVT;
> 
> I think we can just kill the local node variable now.
> 
Indeed.

>> @@ -276,7 +276,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
>>   	if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath)
>>   		return 0;
>>   
>> -	q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, NULL);
>> +	q = blk_alloc_queue_node(GFP_KERNEL, ctrl->node_id, NULL);
> 
> This looks a little odd.  We create the mpath disk here, and just
> need to pass in the ctrl to do some initialization.  Why would we
> allocate the multipath node on the node of the controller?
> 
> If there is a good reason it needs to go into a comment, if not
> we should drop this hunk.
> 
Well, as we _do_ have an argument for passing in the NUMA locality I
thought it'd be a good time to also use it :-)
But actually I have debated with myself whether it's the correct thing 
to do.
So yeah, I guess it can be dropped.

>> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
>> index d668682f91df..b5d37aacf212 100644
>> --- a/drivers/nvme/host/pci.c
>> +++ b/drivers/nvme/host/pci.c
>> @@ -2517,6 +2517,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>>   
>>   	dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
>>   
>> +	dev->ctrl.node_id = node;
>>   	nvme_get_ctrl(&dev->ctrl);
>>   	async_schedule(nvme_async_probe, dev);
> 
> This changes behavior as node might remain NUMA_NO_NODE when the code
> above sets the device node to first_memory_node now when
> it would otherwise be NUMA_NO_NODE.  Then again that code makes no
> sense to me to start with - Matias added it with the blk-mq conversion,
> so maybe he remembers why we do it to start with?  If we want to fix
> up nodes like that it seems like we should do it in the driver core
> or the PCI core.
> 
This was primarily for orthogonality; I don't have any issues with 
dropping it.

Will be resending the patchset.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		   Teamlead Storage & Networking
hare@suse.de			               +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)


* [PATCH 1/2] nvme: NUMA locality information for fabrics
  2018-10-08 10:22     ` Matias Bjørling
@ 2018-10-08 10:27       ` Hannes Reinecke
  2018-10-08 10:29         ` Matias Bjørling
  2018-10-09  6:13       ` Christoph Hellwig
  1 sibling, 1 reply; 14+ messages in thread
From: Hannes Reinecke @ 2018-10-08 10:27 UTC (permalink / raw)


On 10/8/18 12:22 PM, Matias Bjørling wrote:
> 
> 
> On 10/08/2018 12:04 PM, Christoph Hellwig wrote:
[ .. ]
>>> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
>>> index d668682f91df..b5d37aacf212 100644
>>> --- a/drivers/nvme/host/pci.c
>>> +++ b/drivers/nvme/host/pci.c
>>> @@ -2517,6 +2517,7 @@ static int nvme_probe(struct pci_dev *pdev, 
>>> const struct pci_device_id *id)
>>> 	dev_info(dev->ctrl.device, "pci function %s\n", 
>>> dev_name(&pdev->dev));
>>> +	dev->ctrl.node_id = node;
>>> 	nvme_get_ctrl(&dev->ctrl);
>>> 	async_schedule(nvme_async_probe, dev);
>>
>> This changes behavior as node might remain NUMA_NO_NODE when the code
>> above sets the device node to first_memory_node now when
>> it would otherwise be NUMA_NO_NODE.  Then again that code makes no
>> sense to me to start with - Matias added it with the blk-mq conversion,
>> so maybe he remembers why we do it to start with?  If we want to fix
>> up nodes like that it seems like we should do it in the driver core
>> or the PCI core.
>>
> 
> Back in the day it was carried over to make sure the structures were
> allocated from the same node as the PCIe device.
> 
Which really is what this patch is trying to do.
Or am I wrong?

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		               zSeries & Storage
hare@suse.com			               +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)


* [PATCH 1/2] nvme: NUMA locality information for fabrics
  2018-10-08 10:27       ` Hannes Reinecke
@ 2018-10-08 10:29         ` Matias Bjørling
  2018-10-08 23:29           ` Sagi Grimberg
  0 siblings, 1 reply; 14+ messages in thread
From: Matias Bjørling @ 2018-10-08 10:29 UTC (permalink / raw)




On 10/08/2018 12:27 PM, Hannes Reinecke wrote:
> On 10/8/18 12:22 PM, Matias Bjørling wrote:
>>
>>
>> On 10/08/2018 12:04 PM, Christoph Hellwig wrote:
> [ .. ]
>>>> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
>>>> index d668682f91df..b5d37aacf212 100644
>>>> --- a/drivers/nvme/host/pci.c
>>>> +++ b/drivers/nvme/host/pci.c
>>>> @@ -2517,6 +2517,7 @@ static int nvme_probe(struct pci_dev *pdev, 
>>>> const struct pci_device_id *id)
>>>> 	dev_info(dev->ctrl.device, "pci function %s\n", 
>>>> dev_name(&pdev->dev));
>>>> +	dev->ctrl.node_id = node;
>>>> 	nvme_get_ctrl(&dev->ctrl);
>>>> 	async_schedule(nvme_async_probe, dev);
>>>
>>> This changes behavior as node might remain NUMA_NO_NODE when the code
>>> above sets the device node to first_memory_node now when
>>> it would otherwise be NUMA_NO_NODE.  Then again that code makes no
>>> sense to me to start with - Matias added it with the blk-mq conversion,
>>> so maybe he remembers why we do it to start with?  If we want to fix
>>> up nodes like that it seems like we should do it in the driver core
>>> or the PCI core.
>>>
>>
>> Back in the day it was carried over to make sure the structures were
>> allocated from the same node as the PCIe device.
>>
> Which really is what this patch is trying to do.
> Or am I wrong?
> 

Yep. It was missing for fabrics.

> Cheers,
> 
> Hannes


* [PATCH 1/2] nvme: NUMA locality information for fabrics
  2018-10-08 10:29         ` Matias Bjørling
@ 2018-10-08 23:29           ` Sagi Grimberg
  2018-10-08 23:31             ` Sagi Grimberg
  2018-10-09  6:14             ` Hannes Reinecke
  0 siblings, 2 replies; 14+ messages in thread
From: Sagi Grimberg @ 2018-10-08 23:29 UTC (permalink / raw)



>>> Back in the day it was carried over to make sure the structures were
>>> allocated from the same node as the PCIe device.
>>>
>> Which really is what this patch is trying to do.
>> Or am I wrong?
>>
> 
> Yep. It was missing for fabrics.

It's not a trivial trade-off what the memory should be local
to. Device locality will not necessarily result in better
performance than application locality.

For example, if you run X threads all on the far node (from
the device) it would be very costly if all memory
accesses had to cross the QPI bus...


* [PATCH 1/2] nvme: NUMA locality information for fabrics
  2018-10-08 23:29           ` Sagi Grimberg
@ 2018-10-08 23:31             ` Sagi Grimberg
  2018-10-09  6:14             ` Hannes Reinecke
  1 sibling, 0 replies; 14+ messages in thread
From: Sagi Grimberg @ 2018-10-08 23:31 UTC (permalink / raw)




On 10/08/2018 04:29 PM, Sagi Grimberg wrote:
> 
>>>> Back in the day it was carried over to make sure the structures were
>>>> allocated from the same node as the PCIe device.
>>>>
>>> Which really is what this patch is trying to do.
>>> Or am I wrong?
>>>
>>
>> Yep. It was missing for fabrics.
> 
> It's not a trivial trade-off what the memory should be local
> to. Device locality will not necessarily result in better
> performance than application locality.
> 
> For example, if you run X threads all on the far node (from
> the device) it would be very costly if all memory
> accesses had to cross the QPI bus...

Not to mention that the data buffers (which will dominate the
DMA activity) will often be allocated with application locality.


* [PATCH 1/2] nvme: NUMA locality information for fabrics
  2018-10-08 10:22     ` Matias Bjørling
  2018-10-08 10:27       ` Hannes Reinecke
@ 2018-10-09  6:13       ` Christoph Hellwig
  1 sibling, 0 replies; 14+ messages in thread
From: Christoph Hellwig @ 2018-10-09  6:13 UTC (permalink / raw)


On Mon, Oct 08, 2018 at 12:22:26PM +0200, Matias Bjørling wrote:
>> This changes behavior as node might remain NUMA_NO_NODE when the code
>> above sets the device node to first_memory_node now when
>> it would otherwise be NUMA_NO_NODE.  Then again that code makes no
>> sense to me to start with - Matias added it with the blk-mq conversion,
>> so maybe he remembers why we do it to start with?  If we want to fix
>> up nodes like that it seems like we should do it in the driver core
>> or the PCI core.
>>
>
> Back in the day it was carried over to make sure the structures were
> allocated from the same node as the PCIe device.

Well, the PCI device will only get allocated from a node if it has
a node set, so overriding it doesn't make much sense.

Now it seems like various architectures have a stub pcibus_to_node
and thus always return NUMA_NO_NODE, but I don't really see a good
reason to override it.  Either we live with that, or move the
set_dev_node hack into the PCI core.
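
A rough sketch of what the latter could look like (hypothetical, not
something posted in this thread; it assumes pci_device_add() is the
right place for such a fallback):

	/* in pci_device_add(), next to the existing node assignment */
	set_dev_node(&dev->dev, pcibus_to_node(bus));
	if (dev_to_node(&dev->dev) == NUMA_NO_NODE)
		set_dev_node(&dev->dev, first_memory_node);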


* [PATCH 1/2] nvme: NUMA locality information for fabrics
  2018-10-08 23:29           ` Sagi Grimberg
  2018-10-08 23:31             ` Sagi Grimberg
@ 2018-10-09  6:14             ` Hannes Reinecke
  1 sibling, 0 replies; 14+ messages in thread
From: Hannes Reinecke @ 2018-10-09  6:14 UTC (permalink / raw)


On 10/9/18 1:29 AM, Sagi Grimberg wrote:
> 
>>>> Back in the day it was carried over to make sure the structures were
>>>> allocated from the same node as the PCIe device.
>>>>
>>> Which really is what this patch is trying to do.
>>> Or am I wrong?
>>>
>>
>> Yep. It was missing for fabrics.
> 
> It's not a trivial trade-off what the memory should be local
> to. Device locality will not necessarily result in better
> performance than application locality.
> 
> For example, if you run X threads all on the far node (from
> the device) it would be very costly if all memory
> accesses had to cross the QPI bus...

Oh, yes. I fully concur.
But this patchset is trying to establish an _optimal_ path for I/O, i.e.
ensure that I/O originating on the node the HBA is local to will
be handled optimally.
I/O _not_ originating on that node might suffer a performance penalty, yes.

And I'm somewhat at a loss how we can achieve application locality here;
the only place where we _can_ influence locality is when creating the
queue, and that typically is done via nvme-cli.
So any application would need to be tweaked to run on the same node
where the nvme-cli program has been run; but as we need to tweak the
application anyway we might as well look at the 'node' attribute for the
queue and use that for tweaking.
So I somewhat fail to see your point ...

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		   Teamlead Storage & Networking
hare@suse.de			               +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)

