All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] nvme: take node locality into account when selecting a path
@ 2018-09-27 23:05 Christoph Hellwig
  2018-09-28 14:12 ` Keith Busch
                   ` (2 more replies)
  0 siblings, 3 replies; 13+ messages in thread
From: Christoph Hellwig @ 2018-09-27 23:05 UTC (permalink / raw)


Make current_path an array with an entry for every possible node, and
cache the best path on a per-node basis.  Take the node distance into
account when selecting it.  This is primarily useful for dual-ported PCIe
devices which are connected to PCIe root ports on different sockets.

Signed-off-by: Christoph Hellwig <hch at lst.de>
---
 drivers/nvme/host/core.c      |  7 ++++-
 drivers/nvme/host/multipath.c | 50 +++++++++++++++++++++++++++--------
 drivers/nvme/host/nvme.h      | 25 +++++++-----------
 3 files changed, 54 insertions(+), 28 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 41a6180649d2..eb10b263de8f 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2900,9 +2900,14 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
 		unsigned nsid, struct nvme_id_ns *id)
 {
 	struct nvme_ns_head *head;
+	size_t size = sizeof(*head);
 	int ret = -ENOMEM;
 
-	head = kzalloc(sizeof(*head), GFP_KERNEL);
+#ifdef CONFIG_NVME_MULTIPATH
+	size += num_possible_nodes() * sizeof(struct nvme_ns *);
+#endif
+
+	head = kzalloc(size, GFP_KERNEL);
 	if (!head)
 		goto out;
 	ret = ida_simple_get(&ctrl->subsys->ns_ida, 1, 0, GFP_KERNEL);
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 5a9562881d4e..82c8911aff6f 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -110,29 +110,55 @@ static const char *nvme_ana_state_names[] = {
 	[NVME_ANA_CHANGE]		= "change",
 };
 
-static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head)
+void nvme_mpath_clear_current_path(struct nvme_ns *ns)
 {
-	struct nvme_ns *ns, *fallback = NULL;
+	struct nvme_ns_head *head = ns->head;
+	int node;
+
+	if (!head)
+		return;
+
+	for_each_node(node) {
+		if (ns == rcu_access_pointer(head->current_path[node]))
+			rcu_assign_pointer(head->current_path[node], NULL);
+	}
+}
+
+static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
+{
+	int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
+	struct nvme_ns *found = NULL, *fallback = NULL, *ns;
 
 	list_for_each_entry_rcu(ns, &head->list, siblings) {
 		if (ns->ctrl->state != NVME_CTRL_LIVE ||
 		    test_bit(NVME_NS_ANA_PENDING, &ns->flags))
 			continue;
+
+		distance = node_distance(node, dev_to_node(ns->ctrl->dev));
+
 		switch (ns->ana_state) {
 		case NVME_ANA_OPTIMIZED:
-			rcu_assign_pointer(head->current_path, ns);
-			return ns;
+			if (distance < found_distance) {
+				found_distance = distance;
+				found = ns;
+			}
+			break;
 		case NVME_ANA_NONOPTIMIZED:
-			fallback = ns;
+			if (distance < fallback_distance) {
+				fallback_distance = distance;
+				fallback = ns;
+			}
 			break;
 		default:
 			break;
 		}
 	}
 
-	if (fallback)
-		rcu_assign_pointer(head->current_path, fallback);
-	return fallback;
+	if (!found)
+		found = fallback;
+	if (found)
+		rcu_assign_pointer(head->current_path[node], found);
+	return found;
 }
 
 static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
@@ -143,10 +169,12 @@ static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
 
 inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
 {
-	struct nvme_ns *ns = srcu_dereference(head->current_path, &head->srcu);
+	int node = numa_node_id();
+	struct nvme_ns *ns;
 
+	ns = srcu_dereference(head->current_path[node], &head->srcu);
 	if (unlikely(!ns || !nvme_path_is_optimized(ns)))
-		ns = __nvme_find_path(head);
+		ns = __nvme_find_path(head, node);
 	return ns;
 }
 
@@ -193,7 +221,7 @@ static bool nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc)
 	int srcu_idx;
 
 	srcu_idx = srcu_read_lock(&head->srcu);
-	ns = srcu_dereference(head->current_path, &head->srcu);
+	ns = srcu_dereference(head->current_path[numa_node_id()], &head->srcu);
 	if (likely(ns && nvme_path_is_optimized(ns)))
 		found = ns->queue->poll_fn(q, qc);
 	srcu_read_unlock(&head->srcu, srcu_idx);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index bb4a2003c097..5eec5b9205f3 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -277,14 +277,6 @@ struct nvme_ns_ids {
  * only ever has a single entry for private namespaces.
  */
 struct nvme_ns_head {
-#ifdef CONFIG_NVME_MULTIPATH
-	struct gendisk		*disk;
-	struct nvme_ns __rcu	*current_path;
-	struct bio_list		requeue_list;
-	spinlock_t		requeue_lock;
-	struct work_struct	requeue_work;
-	struct mutex		lock;
-#endif
 	struct list_head	list;
 	struct srcu_struct      srcu;
 	struct nvme_subsystem	*subsys;
@@ -293,6 +285,14 @@ struct nvme_ns_head {
 	struct list_head	entry;
 	struct kref		ref;
 	int			instance;
+#ifdef CONFIG_NVME_MULTIPATH
+	struct gendisk		*disk;
+	struct bio_list		requeue_list;
+	spinlock_t		requeue_lock;
+	struct work_struct	requeue_work;
+	struct mutex		lock;
+	struct nvme_ns __rcu	*current_path[];
+#endif
 };
 
 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
@@ -474,14 +474,7 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head);
 int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id);
 void nvme_mpath_uninit(struct nvme_ctrl *ctrl);
 void nvme_mpath_stop(struct nvme_ctrl *ctrl);
-
-static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns)
-{
-	struct nvme_ns_head *head = ns->head;
-
-	if (head && ns == rcu_access_pointer(head->current_path))
-		rcu_assign_pointer(head->current_path, NULL);
-}
+void nvme_mpath_clear_current_path(struct nvme_ns *ns);
 struct nvme_ns *nvme_find_path(struct nvme_ns_head *head);
 
 static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
-- 
2.19.0

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH] nvme: take node locality into account when selecting a path
  2018-09-27 23:05 [PATCH] nvme: take node locality into account when selecting a path Christoph Hellwig
@ 2018-09-28 14:12 ` Keith Busch
  2018-09-28 22:31 ` Sagi Grimberg
  2018-09-29 12:09 ` Hannes Reinecke
  2 siblings, 0 replies; 13+ messages in thread
From: Keith Busch @ 2018-09-28 14:12 UTC (permalink / raw)


On Thu, Sep 27, 2018@04:05:57PM -0700, Christoph Hellwig wrote:
> Make current_path an array with an entry for every possible node, and
> cache the best path on a per-node basis.  Take the node distance into
> account when selecting it.  This is primarily useful for dual-ported PCIe
> devices which are connected to PCIe root ports on different sockets.
> 
> Signed-off-by: Christoph Hellwig <hch at lst.de>

Looks great!

Reviewed-by: Keith Busch <keith.busch at intel.com>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH] nvme: take node locality into account when selecting a path
  2018-09-27 23:05 [PATCH] nvme: take node locality into account when selecting a path Christoph Hellwig
  2018-09-28 14:12 ` Keith Busch
@ 2018-09-28 22:31 ` Sagi Grimberg
  2018-09-30 23:01   ` Christoph Hellwig
  2018-09-29 12:09 ` Hannes Reinecke
  2 siblings, 1 reply; 13+ messages in thread
From: Sagi Grimberg @ 2018-09-28 22:31 UTC (permalink / raw)



> +static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
> +{
> +	int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
> +	struct nvme_ns *found = NULL, *fallback = NULL, *ns;
>   
>   	list_for_each_entry_rcu(ns, &head->list, siblings) {
>   		if (ns->ctrl->state != NVME_CTRL_LIVE ||
>   		    test_bit(NVME_NS_ANA_PENDING, &ns->flags))
>   			continue;
> +
> +		distance = node_distance(node, dev_to_node(ns->ctrl->dev));

Can we make this useful also for fabrics who have device affinity
knowledge? ctrl->dev is useless for fabrics...

Perhaps make it a ctrl->ops->devnode() or something?

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH] nvme: take node locality into account when selecting a path
  2018-09-27 23:05 [PATCH] nvme: take node locality into account when selecting a path Christoph Hellwig
  2018-09-28 14:12 ` Keith Busch
  2018-09-28 22:31 ` Sagi Grimberg
@ 2018-09-29 12:09 ` Hannes Reinecke
  2018-09-30 22:59   ` Christoph Hellwig
  2 siblings, 1 reply; 13+ messages in thread
From: Hannes Reinecke @ 2018-09-29 12:09 UTC (permalink / raw)


On 9/28/18 1:05 AM, Christoph Hellwig wrote:
> Make current_path an array with an entry for every possible node, and
> cache the best path on a per-node basis.  Take the node distance into
> account when selecting it.  This is primarily useful for dual-ported PCIe
> devices which are connected to PCIe root ports on different sockets.
> 
> Signed-off-by: Christoph Hellwig <hch at lst.de>
> ---
>  drivers/nvme/host/core.c      |  7 ++++-
>  drivers/nvme/host/multipath.c | 50 +++++++++++++++++++++++++++--------
>  drivers/nvme/host/nvme.h      | 25 +++++++-----------
>  3 files changed, 54 insertions(+), 28 deletions(-)
> 
Actually I would love to have some statistic or debug info showing the
NUMA locality mapping somewhere. Otherwise the admin might have a hard
time figuring out what's going on here, plus debugging any I/O routing
issues will be tricky.

But I guess we can do it with a later patch.

So:

Reviewed-by: Hannes Reinecke <hare at suse.com>

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		   Teamlead Storage & Networking
hare at suse.de			               +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH] nvme: take node locality into account when selecting a path
  2018-09-29 12:09 ` Hannes Reinecke
@ 2018-09-30 22:59   ` Christoph Hellwig
  0 siblings, 0 replies; 13+ messages in thread
From: Christoph Hellwig @ 2018-09-30 22:59 UTC (permalink / raw)


On Sat, Sep 29, 2018@02:09:15PM +0200, Hannes Reinecke wrote:
> Actually I would love to have some statistic or debug info showing the
> NUMA locality mapping somewhere. Otherwise the admin might have a hard
> time figuring out what's going on here, plus debugging any I/O routing
> issues will be tricky.

You already see the node of the devices in sysfs, so what else do you
think of?

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH] nvme: take node locality into account when selecting a path
  2018-09-28 22:31 ` Sagi Grimberg
@ 2018-09-30 23:01   ` Christoph Hellwig
  2018-10-01 19:45     ` Sagi Grimberg
  0 siblings, 1 reply; 13+ messages in thread
From: Christoph Hellwig @ 2018-09-30 23:01 UTC (permalink / raw)


On Fri, Sep 28, 2018@03:31:09PM -0700, Sagi Grimberg wrote:
>
>> +static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
>> +{
>> +	int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
>> +	struct nvme_ns *found = NULL, *fallback = NULL, *ns;
>>     	list_for_each_entry_rcu(ns, &head->list, siblings) {
>>   		if (ns->ctrl->state != NVME_CTRL_LIVE ||
>>   		    test_bit(NVME_NS_ANA_PENDING, &ns->flags))
>>   			continue;
>> +
>> +		distance = node_distance(node, dev_to_node(ns->ctrl->dev));
>
> Can we make this useful also for fabrics who have device affinity
> knowledge? ctrl->dev is useless for fabrics...
>
> Perhaps make it a ctrl->ops->devnode() or something?

I'd prefer a pointer of field (we could copy the node over) over
an indirect call.  But I'll happily leave that to the person that
actually implements the functionality to fabrics.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH] nvme: take node locality into account when selecting a path
  2018-09-30 23:01   ` Christoph Hellwig
@ 2018-10-01 19:45     ` Sagi Grimberg
  2018-10-02 17:30       ` Hannes Reinecke
  0 siblings, 1 reply; 13+ messages in thread
From: Sagi Grimberg @ 2018-10-01 19:45 UTC (permalink / raw)



>>> +static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
>>> +{
>>> +	int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
>>> +	struct nvme_ns *found = NULL, *fallback = NULL, *ns;
>>>      	list_for_each_entry_rcu(ns, &head->list, siblings) {
>>>    		if (ns->ctrl->state != NVME_CTRL_LIVE ||
>>>    		    test_bit(NVME_NS_ANA_PENDING, &ns->flags))
>>>    			continue;
>>> +
>>> +		distance = node_distance(node, dev_to_node(ns->ctrl->dev));
>>
>> Can we make this useful also for fabrics who have device affinity
>> knowledge? ctrl->dev is useless for fabrics...
>>
>> Perhaps make it a ctrl->ops->devnode() or something?
> 
> I'd prefer a pointer of field (we could copy the node over) over
> an indirect call.  But I'll happily leave that to the person that
> actually implements the functionality to fabrics.

Fair enough... I can follow up on that.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH] nvme: take node locality into account when selecting a path
  2018-10-01 19:45     ` Sagi Grimberg
@ 2018-10-02 17:30       ` Hannes Reinecke
  2018-10-02 17:39         ` Christoph Hellwig
  0 siblings, 1 reply; 13+ messages in thread
From: Hannes Reinecke @ 2018-10-02 17:30 UTC (permalink / raw)


On 10/1/18 9:45 PM, Sagi Grimberg wrote:
> 
>>>> +static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, 
>>>> int node)
>>>> +{
>>>> +	int found_distance = INT_MAX, fallback_distance = INT_MAX, 
>>>> distance;
>>>> +	struct nvme_ns *found = NULL, *fallback = NULL, *ns;
>>>>  	list_for_each_entry_rcu(ns, &head->list, siblings) {
>>>>  		if (ns->ctrl->state != NVME_CTRL_LIVE ||
>>>>  		    test_bit(NVME_NS_ANA_PENDING, &ns->flags))
>>>>  			continue;
>>>> +
>>>> +??????? distance = node_distance(node, dev_to_node(ns->ctrl->dev));
>>>
>>> Can we make this useful also for fabrics who have device affinity
>>> knowledge? ctrl->dev is useless for fabrics...
>>>
>>> Perhaps make it a ctrl->ops->devnode() or something?
>>
>> I'd prefer a pointer of field (we could copy the node over) over
>> an indirect call.  But I'll happily leave that to the person that
>> actually implements the functionality to fabrics.
> 
> Fair enough... I can follow up on that.
> 
Something like this?

Cheers,

Hannes


-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-nvme-use-NUMA-locality-information-for-fabrics.patch
Type: text/x-patch
Size: 4330 bytes
Desc: not available
URL: <http://lists.infradead.org/pipermail/linux-nvme/attachments/20181002/2ea1784e/attachment-0001.bin>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH] nvme: take node locality into account when selecting a path
  2018-10-02 17:30       ` Hannes Reinecke
@ 2018-10-02 17:39         ` Christoph Hellwig
  2018-10-03  8:56           ` Hannes Reinecke
  0 siblings, 1 reply; 13+ messages in thread
From: Christoph Hellwig @ 2018-10-02 17:39 UTC (permalink / raw)


On Tue, Oct 02, 2018@07:30:02PM +0200, Hannes Reinecke wrote:
>> Fair enough... I can follow up on that.
>>
> Something like this?

As said I'd rather avoid the indirect call if at all possible.

Please either add a numa_id field to struct nvme_ctrl, or a
locality_dev or something.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH] nvme: take node locality into account when selecting a path
  2018-10-02 17:39         ` Christoph Hellwig
@ 2018-10-03  8:56           ` Hannes Reinecke
  2018-10-03 12:43             ` Christoph Hellwig
  2018-10-04  1:30             ` Sagi Grimberg
  0 siblings, 2 replies; 13+ messages in thread
From: Hannes Reinecke @ 2018-10-03  8:56 UTC (permalink / raw)


On 10/2/18 7:39 PM, Christoph Hellwig wrote:
> On Tue, Oct 02, 2018@07:30:02PM +0200, Hannes Reinecke wrote:
>>> Fair enough... I can follow up on that.
>>>
>> Something like this?
> 
> As said I'd rather avoid the indirect call if at all possible.
> 
> Please either add a numa_id field to struct nvme_ctrl, or a
> locality_dev or something.
> 
Ah. So that should be more like it.

Cheers,

Hannes


-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-nvme-NUMA-locality-information-for-fabrics.patch
Type: text/x-patch
Size: 5801 bytes
Desc: not available
URL: <http://lists.infradead.org/pipermail/linux-nvme/attachments/20181003/104480f2/attachment-0001.bin>

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH] nvme: take node locality into account when selecting a path
  2018-10-03  8:56           ` Hannes Reinecke
@ 2018-10-03 12:43             ` Christoph Hellwig
  2018-10-04  1:30             ` Sagi Grimberg
  1 sibling, 0 replies; 13+ messages in thread
From: Christoph Hellwig @ 2018-10-03 12:43 UTC (permalink / raw)


Yes, something like that.  Let me know when this passes some basic
FC and RDMA testing and we can merge it.

On Wed, Oct 03, 2018@10:56:12AM +0200, Hannes Reinecke wrote:
> On 10/2/18 7:39 PM, Christoph Hellwig wrote:
>> On Tue, Oct 02, 2018@07:30:02PM +0200, Hannes Reinecke wrote:
>>>> Fair enough... I can follow up on that.
>>>>
>>> Something like this?
>>
>> As said I'd rather avoid the indirect call if at all possible.
>>
>> Please either add a numa_id field to struct nvme_ctrl, or a
>> locality_dev or something.
>>
> Ah. So that should be more like it.
>
> Cheers,
>
> Hannes
>
>

> >From 478db61eab3f7a178a0c1f2e5c88c742cf5006ab Mon Sep 17 00:00:00 2001
> From: Hannes Reinecke <hare at suse.com>
> Date: Wed, 3 Oct 2018 10:53:05 +0200
> Subject: [PATCH] nvme: NUMA locality information for fabrics
> 
> Add a new field 'node_id' to the nvme_ctrl structure to hold the
> NUMA locality information of the underlying hardware.
> With that we can allocate the memory structures on the same NUMA
> node as the underlying hardware.
> 
> Signed-off-by: Hannes Reinecke <hare at suse.com>
> ---
>  drivers/nvme/host/core.c      | 2 +-
>  drivers/nvme/host/fc.c        | 5 +++--
>  drivers/nvme/host/multipath.c | 4 ++--
>  drivers/nvme/host/nvme.h      | 1 +
>  drivers/nvme/host/pci.c       | 1 +
>  drivers/nvme/host/rdma.c      | 6 +++---
>  6 files changed, 11 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 2db33a752e2b..0ec56e4916ea 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -3055,7 +3055,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
>  	struct gendisk *disk;
>  	struct nvme_id_ns *id;
>  	char disk_name[DISK_NAME_LEN];
> -	int node = dev_to_node(ctrl->dev), flags = GENHD_FL_EXT_DEVT;
> +	int node = ctrl->node_id, flags = GENHD_FL_EXT_DEVT;
>  
>  	ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
>  	if (!ns)
> diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
> index 9d201b35397d..7f246dd04bc5 100644
> --- a/drivers/nvme/host/fc.c
> +++ b/drivers/nvme/host/fc.c
> @@ -2422,7 +2422,7 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl)
>  	ctrl->tag_set.ops = &nvme_fc_mq_ops;
>  	ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size;
>  	ctrl->tag_set.reserved_tags = 1; /* fabric connect */
> -	ctrl->tag_set.numa_node = NUMA_NO_NODE;
> +	ctrl->tag_set.numa_node = ctrl->ctrl.node_id;
>  	ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
>  	ctrl->tag_set.cmd_size = sizeof(struct nvme_fc_fcp_op) +
>  					(SG_CHUNK_SIZE *
> @@ -2990,6 +2990,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
>  
>  	ctrl->ctrl.opts = opts;
>  	ctrl->ctrl.nr_reconnects = 0;
> +	ctrl->ctrl.node_id = dev_to_node(lport->dev);
>  	INIT_LIST_HEAD(&ctrl->ctrl_list);
>  	ctrl->lport = lport;
>  	ctrl->rport = rport;
> @@ -3028,7 +3029,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
>  	ctrl->admin_tag_set.ops = &nvme_fc_admin_mq_ops;
>  	ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
>  	ctrl->admin_tag_set.reserved_tags = 2; /* fabric connect + Keep-Alive */
> -	ctrl->admin_tag_set.numa_node = NUMA_NO_NODE;
> +	ctrl->admin_tag_set.numa_node = ctrl->ctrl.node_id;
>  	ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_fc_fcp_op) +
>  					(SG_CHUNK_SIZE *
>  						sizeof(struct scatterlist)) +
> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> index 31f29f97374b..2616251b3236 100644
> --- a/drivers/nvme/host/multipath.c
> +++ b/drivers/nvme/host/multipath.c
> @@ -141,7 +141,7 @@ static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
>  		    test_bit(NVME_NS_ANA_PENDING, &ns->flags))
>  			continue;
>  
> -		distance = node_distance(node, dev_to_node(ns->ctrl->dev));
> +		distance = node_distance(node, ns->ctrl->node_id);
>  
>  		switch (ns->ana_state) {
>  		case NVME_ANA_OPTIMIZED:
> @@ -276,7 +276,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
>  	if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath)
>  		return 0;
>  
> -	q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE, NULL);
> +	q = blk_alloc_queue_node(GFP_KERNEL, ctrl->node_id, NULL);
>  	if (!q)
>  		goto out;
>  	q->queuedata = head;
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index 9fefba039d1e..55347a547d84 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -153,6 +153,7 @@ struct nvme_ctrl {
>  	struct request_queue *connect_q;
>  	struct device *dev;
>  	int instance;
> +	int node_id;
>  	struct blk_mq_tag_set *tagset;
>  	struct blk_mq_tag_set *admin_tagset;
>  	struct list_head namespaces;
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index d668682f91df..b5d37aacf212 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -2517,6 +2517,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>  
>  	dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
>  
> +	dev->ctrl.node_id = node;
>  	nvme_get_ctrl(&dev->ctrl);
>  	async_schedule(nvme_async_probe, dev);
>  
> diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
> index dc042017c293..a64b02c13934 100644
> --- a/drivers/nvme/host/rdma.c
> +++ b/drivers/nvme/host/rdma.c
> @@ -686,7 +686,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
>  		set->ops = &nvme_rdma_admin_mq_ops;
>  		set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
>  		set->reserved_tags = 2; /* connect + keep-alive */
> -		set->numa_node = NUMA_NO_NODE;
> +		set->numa_node = nctrl->node_id;
>  		set->cmd_size = sizeof(struct nvme_rdma_request) +
>  			SG_CHUNK_SIZE * sizeof(struct scatterlist);
>  		set->driver_data = ctrl;
> @@ -699,7 +699,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
>  		set->ops = &nvme_rdma_mq_ops;
>  		set->queue_depth = nctrl->sqsize + 1;
>  		set->reserved_tags = 1; /* fabric connect */
> -		set->numa_node = NUMA_NO_NODE;
> +		set->numa_node = nctrl->node_id;
>  		set->flags = BLK_MQ_F_SHOULD_MERGE;
>  		set->cmd_size = sizeof(struct nvme_rdma_request) +
>  			SG_CHUNK_SIZE * sizeof(struct scatterlist);
> @@ -1975,7 +1975,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
>  	ctrl->ctrl.queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */
>  	ctrl->ctrl.sqsize = opts->queue_size - 1;
>  	ctrl->ctrl.kato = opts->kato;
> -
> +	ctrl->ctrl.node_id = NUMA_NO_NODE;
>  	ret = -ENOMEM;
>  	ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues),
>  				GFP_KERNEL);
> -- 
> 2.13.7
> 

---end quoted text---

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH] nvme: take node locality into account when selecting a path
  2018-10-03  8:56           ` Hannes Reinecke
  2018-10-03 12:43             ` Christoph Hellwig
@ 2018-10-04  1:30             ` Sagi Grimberg
  2018-10-04 15:40               ` Hannes Reinecke
  1 sibling, 1 reply; 13+ messages in thread
From: Sagi Grimberg @ 2018-10-04  1:30 UTC (permalink / raw)



> Ah. So that should be more like it.

For it to be useful for RDMA you'll want to add:
--
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index ea325a636588..a926c370e83a 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -755,6 +755,7 @@ static int nvme_rdma_configure_admin_queue(struct 
nvme_rdma_ctrl *ctrl,
                 return error;

         ctrl->device = ctrl->queues[0].device;
+       ctrl->ctrl.node_id = dev_to_node(device->dev->dma_device);

         ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev);
--

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH] nvme: take node locality into account when selecting a path
  2018-10-04  1:30             ` Sagi Grimberg
@ 2018-10-04 15:40               ` Hannes Reinecke
  0 siblings, 0 replies; 13+ messages in thread
From: Hannes Reinecke @ 2018-10-04 15:40 UTC (permalink / raw)


On 10/4/18 3:30 AM, Sagi Grimberg wrote:
> 
>> Ah. So that should be more like it.
> 
> For it to be useful for RDMA you'll want to add:
> -- 
> diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
> index ea325a636588..a926c370e83a 100644
> --- a/drivers/nvme/host/rdma.c
> +++ b/drivers/nvme/host/rdma.c
> @@ -755,6 +755,7 @@ static int nvme_rdma_configure_admin_queue(struct 
> nvme_rdma_ctrl *ctrl,
>                  return error;
> 
>          ctrl->device = ctrl->queues[0].device;
> +       ctrl->ctrl.node_id = dev_to_node(device->dev->dma_device);
> 
>          ctrl->max_fr_pages = 
> nvme_rdma_get_max_fr_pages(ctrl->device->dev);
> -- 
Ah, I knew I could count on you :-)
The mapping to the underlying hardware in RDMA had got me confused.

Will be fixing it up and resend the patch.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		   Teamlead Storage & Networking
hare at suse.de			               +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2018-10-04 15:40 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-09-27 23:05 [PATCH] nvme: take node locality into account when selecting a path Christoph Hellwig
2018-09-28 14:12 ` Keith Busch
2018-09-28 22:31 ` Sagi Grimberg
2018-09-30 23:01   ` Christoph Hellwig
2018-10-01 19:45     ` Sagi Grimberg
2018-10-02 17:30       ` Hannes Reinecke
2018-10-02 17:39         ` Christoph Hellwig
2018-10-03  8:56           ` Hannes Reinecke
2018-10-03 12:43             ` Christoph Hellwig
2018-10-04  1:30             ` Sagi Grimberg
2018-10-04 15:40               ` Hannes Reinecke
2018-09-29 12:09 ` Hannes Reinecke
2018-09-30 22:59   ` Christoph Hellwig

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.