* [PATCH v2] block: fix rdma queue mapping
@ 2018-08-20 20:54 ` Sagi Grimberg
  0 siblings, 0 replies; 42+ messages in thread
From: Sagi Grimberg @ 2018-08-20 20:54 UTC (permalink / raw)
  To: linux-block, linux-rdma, linux-nvme
  Cc: Christoph Hellwig, Steve Wise, Max Gurtovoy

nvme-rdma attempts to map queues based on irq vector affinity.
However, for some devices, completion vector irq affinity is
configurable by the user which can break the existing assumption
that irq vectors are optimally arranged over the host cpu cores.

So we map queues in two stages:
First, map queues according to the completion vector IRQ affinity,
taking the first unmapped cpu in each vector's affinity mask. If the
current irq affinity is arranged such that a vector cannot be assigned
a distinct unmapped cpu, we map it to an unmapped cpu on the same numa
node. If numa affinity cannot be satisfied either, we map it to any
unmapped cpu we can find. Then, map the remaining cpus in the possible
cpumap naively.

Tested-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Sagi Grimberg <sagi@grimberg.me>
---
Changes from v1:
- fixed double semicolon typo

 block/blk-mq-cpumap.c  | 39 +++++++++++---------
 block/blk-mq-rdma.c    | 80 ++++++++++++++++++++++++++++++++++++------
 include/linux/blk-mq.h |  1 +
 3 files changed, 93 insertions(+), 27 deletions(-)

diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 3eb169f15842..34811db8cba9 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -30,30 +30,35 @@ static int get_first_sibling(unsigned int cpu)
 	return cpu;
 }
 
-int blk_mq_map_queues(struct blk_mq_tag_set *set)
+void blk_mq_map_queue_cpu(struct blk_mq_tag_set *set, unsigned int cpu)
 {
 	unsigned int *map = set->mq_map;
 	unsigned int nr_queues = set->nr_hw_queues;
-	unsigned int cpu, first_sibling;
+	unsigned int first_sibling;
 
-	for_each_possible_cpu(cpu) {
-		/*
-		 * First do sequential mapping between CPUs and queues.
-		 * In case we still have CPUs to map, and we have some number of
-		 * threads per cores then map sibling threads to the same queue for
-		 * performace optimizations.
-		 */
-		if (cpu < nr_queues) {
+	/*
+	 * First do sequential mapping between CPUs and queues.
+	 * In case we still have CPUs to map, and we have some number of
+	 * threads per cores then map sibling threads to the same queue for
+	 * performace optimizations.
+	 */
+	if (cpu < nr_queues) {
+		map[cpu] = cpu_to_queue_index(nr_queues, cpu);
+	} else {
+		first_sibling = get_first_sibling(cpu);
+		if (first_sibling == cpu)
 			map[cpu] = cpu_to_queue_index(nr_queues, cpu);
-		} else {
-			first_sibling = get_first_sibling(cpu);
-			if (first_sibling == cpu)
-				map[cpu] = cpu_to_queue_index(nr_queues, cpu);
-			else
-				map[cpu] = map[first_sibling];
-		}
+		else
+			map[cpu] = map[first_sibling];
 	}
+}
+
+int blk_mq_map_queues(struct blk_mq_tag_set *set)
+{
+	unsigned int cpu;
 
+	for_each_possible_cpu(cpu)
+		blk_mq_map_queue_cpu(set, cpu);
 	return 0;
 }
 EXPORT_SYMBOL_GPL(blk_mq_map_queues);
diff --git a/block/blk-mq-rdma.c b/block/blk-mq-rdma.c
index 996167f1de18..3bce60cd5bcf 100644
--- a/block/blk-mq-rdma.c
+++ b/block/blk-mq-rdma.c
@@ -14,6 +14,61 @@
 #include <linux/blk-mq-rdma.h>
 #include <rdma/ib_verbs.h>
 
+static int blk_mq_rdma_map_queue(struct blk_mq_tag_set *set,
+		struct ib_device *dev, int first_vec, unsigned int queue)
+{
+	const struct cpumask *mask;
+	unsigned int cpu;
+	bool mapped = false;
+
+	mask = ib_get_vector_affinity(dev, first_vec + queue);
+	if (!mask)
+		return -ENOTSUPP;
+
+	/* map with an unmapped cpu according to affinity mask */
+	for_each_cpu(cpu, mask) {
+		if (set->mq_map[cpu] == UINT_MAX) {
+			set->mq_map[cpu] = queue;
+			mapped = true;
+			break;
+		}
+	}
+
+	if (!mapped) {
+		int n;
+
+		/* map with an unmapped cpu in the same numa node */
+		for_each_node(n) {
+			const struct cpumask *node_cpumask = cpumask_of_node(n);
+
+			if (!cpumask_intersects(mask, node_cpumask))
+				continue;
+
+			for_each_cpu(cpu, node_cpumask) {
+				if (set->mq_map[cpu] == UINT_MAX) {
+					set->mq_map[cpu] = queue;
+					mapped = true;
+					break;
+				}
+			}
+		}
+	}
+
+	if (!mapped) {
+		/* map with any unmapped cpu we can find */
+		for_each_possible_cpu(cpu) {
+			if (set->mq_map[cpu] == UINT_MAX) {
+				set->mq_map[cpu] = queue;
+				mapped = true;
+				break;
+			}
+		}
+	}
+
+	WARN_ON_ONCE(!mapped);
+	return 0;
+}
+
 /**
  * blk_mq_rdma_map_queues - provide a default queue mapping for rdma device
  * @set:	tagset to provide the mapping for
@@ -21,31 +76,36 @@
  * @first_vec:	first interrupt vectors to use for queues (usually 0)
  *
  * This function assumes the rdma device @dev has at least as many available
- * interrupt vetors as @set has queues.  It will then query it's affinity mask
- * and built queue mapping that maps a queue to the CPUs that have irq affinity
- * for the corresponding vector.
+ * interrupt vetors as @set has queues.  It will then query vector affinity mask
+ * and attempt to build irq affinity aware queue mappings. If optimal affinity
+ * aware mapping cannot be acheived for a given queue, we look for any unmapped
+ * cpu to map it. Lastly, we map naively all other unmapped cpus in the mq_map.
  *
  * In case either the driver passed a @dev with less vectors than
  * @set->nr_hw_queues, or @dev does not provide an affinity mask for a
  * vector, we fallback to the naive mapping.
  */
 int blk_mq_rdma_map_queues(struct blk_mq_tag_set *set,
-		struct ib_device *dev, int first_vec)
+                struct ib_device *dev, int first_vec)
 {
-	const struct cpumask *mask;
 	unsigned int queue, cpu;
 
+	/* reset cpu mapping */
+	for_each_possible_cpu(cpu)
+		set->mq_map[cpu] = UINT_MAX;
+
 	for (queue = 0; queue < set->nr_hw_queues; queue++) {
-		mask = ib_get_vector_affinity(dev, first_vec + queue);
-		if (!mask)
+		if (blk_mq_rdma_map_queue(set, dev, first_vec, queue))
 			goto fallback;
+	}
 
-		for_each_cpu(cpu, mask)
-			set->mq_map[cpu] = queue;
+	/* map any remaining unmapped cpus */
+	for_each_possible_cpu(cpu) {
+		if (set->mq_map[cpu] == UINT_MAX)
+			blk_mq_map_queue_cpu(set, cpu);
 	}
 
 	return 0;
-
 fallback:
 	return blk_mq_map_queues(set);
 }
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index d710e92874cc..6eb09c4de34f 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -285,6 +285,7 @@ int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
 				     unsigned long timeout);
 
 int blk_mq_map_queues(struct blk_mq_tag_set *set);
+void blk_mq_map_queue_cpu(struct blk_mq_tag_set *set, unsigned int cpu);
 void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);
 
 void blk_mq_quiesce_queue_nowait(struct request_queue *q);
-- 
2.17.1

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* Re: [PATCH v2] block: fix rdma queue mapping
  2018-08-20 20:54 ` Sagi Grimberg
@ 2018-08-21  2:04   ` Ming Lei
  -1 siblings, 0 replies; 42+ messages in thread
From: Ming Lei @ 2018-08-21  2:04 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: linux-block, linux-rdma, linux-nvme, Christoph Hellwig,
	Steve Wise, Max Gurtovoy

On Mon, Aug 20, 2018 at 01:54:20PM -0700, Sagi Grimberg wrote:
> nvme-rdma attempts to map queues based on irq vector affinity.
> However, for some devices, completion vector irq affinity is
> configurable by the user which can break the existing assumption
> that irq vectors are optimally arranged over the host cpu cores.
> 
> So we map queues in two stages:
> First map queues according to corresponding to the completion
> vector IRQ affinity taking the first cpu in the vector affinity map.
> if the current irq affinity is arranged such that a vector is not
> assigned to any distinct cpu, we map it to a cpu that is on the same
> node. If numa affinity can not be sufficed, we map it to any unmapped
> cpu we can find. Then, map the remaining cpus in the possible cpumap
> naively.

I guess this way still can't fix the request allocation crash issue
triggered by using blk_mq_alloc_request_hctx(), in which one hw queue may
not be mapped from any online CPU.

Maybe this patch isn't for this issue, but it is closely related.

Thanks,
Ming

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2] block: fix rdma queue mapping
  2018-08-20 20:54 ` Sagi Grimberg
@ 2018-08-22 13:11   ` Christoph Hellwig
  -1 siblings, 0 replies; 42+ messages in thread
From: Christoph Hellwig @ 2018-08-22 13:11 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: linux-block, linux-rdma, linux-nvme, Christoph Hellwig,
	Steve Wise, Max Gurtovoy

On Mon, Aug 20, 2018 at 01:54:20PM -0700, Sagi Grimberg wrote:
> nvme-rdma attempts to map queues based on irq vector affinity.
> However, for some devices, completion vector irq affinity is
> configurable by the user which can break the existing assumption
> that irq vectors are optimally arranged over the host cpu cores.

IFF affinity is configurable we should never use this code,
as it breaks the model entirely.  ib_get_vector_affinity should
never return a valid mask if affinity is configurable.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2] block: fix rdma queue mapping
  2018-08-21  2:04   ` Ming Lei
@ 2018-08-25  2:06     ` Sagi Grimberg
  -1 siblings, 0 replies; 42+ messages in thread
From: Sagi Grimberg @ 2018-08-25  2:06 UTC (permalink / raw)
  To: Ming Lei
  Cc: linux-rdma, Steve Wise, linux-nvme, linux-block, Max Gurtovoy,
	Christoph Hellwig


> I guess this way still can't fix the request allocation crash issue
> triggered by using blk_mq_alloc_request_hctx(), in which one hw queue may
> not be mapped from any online CPU.

Not really. I guess we will need to simply skip queues that are
mapped to an offline cpu.

> Maybe this patch isn't for this issue, but it is closely related.

Yes, another patch is still needed.

Steve, do you still have that patch? I don't seem to
find it anywhere.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2] block: fix rdma queue mapping
  2018-08-22 13:11   ` Christoph Hellwig
@ 2018-08-25  2:17     ` Sagi Grimberg
  -1 siblings, 0 replies; 42+ messages in thread
From: Sagi Grimberg @ 2018-08-25  2:17 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-block, linux-rdma, linux-nvme, Steve Wise, Max Gurtovoy


>> nvme-rdma attempts to map queues based on irq vector affinity.
>> However, for some devices, completion vector irq affinity is
>> configurable by the user which can break the existing assumption
>> that irq vectors are optimally arranged over the host cpu cores.
> 
> IFF affinity is configurable we should never use this code,
> as it breaks the model entirely.  ib_get_vector_affinity should
> never return a valid mask if affinity is configurable.

I agree that the model as initially intended doesn't fit. But it seems
that some users like to write into their nic's
/proc/irq/$IRP/smp_affinity and get mad at us for not letting them do
that when we use managed affinity.

So instead of falling back to the block mapping function we try
to do a little better first:
1. map according to the device vector affinity
2. map vectors that end up without a mapping to cpus that belong
    to the same numa-node
3. map all the rest of the unmapped cpus like the block layer
    would do.

We could have device drivers that don't use managed affinity never
return a valid mask, but that would never allow affinity based mapping,
which is optimal at least for users that do not meddle with device
irq affinity (which is probably the majority of users).

Thoughts?

^ permalink raw reply	[flat|nested] 42+ messages in thread

* RE: [PATCH v2] block: fix rdma queue mapping
  2018-08-25  2:06     ` Sagi Grimberg
@ 2018-08-25 12:18       ` Steve Wise
  -1 siblings, 0 replies; 42+ messages in thread
From: Steve Wise @ 2018-08-25 12:18 UTC (permalink / raw)
  To: 'Sagi Grimberg', 'Ming Lei'
  Cc: linux-rdma, linux-block, 'Christoph Hellwig',
	linux-nvme, 'Max Gurtovoy'

> > I guess this way still can't fix the request allocation crash issue
> > triggered by using blk_mq_alloc_request_hctx(), in which one hw queue may
> > not be mapped from any online CPU.
> 
> Not really. I guess we will need to simply skip queues that are
> mapped to an offline cpu.
> 
> > Maybe this patch isn't for this issue, but it is closely related.
> 
> Yes, another patch is still needed.
> 
> Steve, do you still have that patch? I don't seem to
> find it anywhere.

I have no such patch.  I don't remember this issue.

Steve.


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2] block: fix rdma queue mapping
  2018-08-25 12:18       ` Steve Wise
@ 2018-08-27  3:50         ` Ming Lei
  -1 siblings, 0 replies; 42+ messages in thread
From: Ming Lei @ 2018-08-27  3:50 UTC (permalink / raw)
  To: Steve Wise
  Cc: 'Sagi Grimberg',
	linux-rdma, linux-nvme, linux-block, 'Max Gurtovoy',
	'Christoph Hellwig'

On Sat, Aug 25, 2018 at 07:18:43AM -0500, Steve Wise wrote:
> > > I guess this way still can't fix the request allocation crash issue
> > > triggered by using blk_mq_alloc_request_hctx(), in which one hw queue may
> > > not be mapped from any online CPU.
> > 
> > Not really. I guess we will need to simply skip queues that are
> > mapped to an offline cpu.
> > 
> > > Maybe this patch isn't for this issue, but it is closely related.
> > 
> > Yes, another patch is still needed.
> > 
> > Steve, do you still have that patch? I don't seem to
> > find it anywhere.
> 
> I have no such patch.  I don't remember this issue.

This issue can be reproduced by doing CPU hotplug while running IO,
which triggers the following log:

[  396.629000] smpboot: CPU 2 is now offline
[  396.640759] nvmet: creating controller 1 for subsystem testnqn for NQN nqn.2014-08.org.nvmexpress:uuid:aa7d7d4a-0ee1-4960-ad0d-dbfe768b88d4.
[  396.642036] nvme nvme1: creating 1 I/O queues.
[  396.642480] BUG: unable to handle kernel paging request at 000060fd0a6bff48
[  396.643128] PGD 0 P4D 0
[  396.643364] Oops: 0002 [#1] PREEMPT SMP PTI
[  396.643774] CPU: 3 PID: 7 Comm: kworker/u8:0 Not tainted 4.18.0_2923b27e5424_master+ #1
[  396.644588] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.10.2-2.fc27 04/01/2014
[  396.645597] Workqueue: nvme-reset-wq nvme_loop_reset_ctrl_work [nvme_loop]
[  396.646371] RIP: 0010:blk_mq_get_request+0x2e2/0x375
[  396.646924] Code: 00 00 48 c7 83 28 01 00 00 00 00 00 00 48 c7 83 30 01 00 00 00 00 00 00 48 8b 55 10 74 0c 31 c0 41 f7 c4 00 08 06 00 0f 95 c0 <48> ff 44 c2 48 41 81 e4 00 00 06 00 c7 83 d8 00 00 00 01 00 00 00
[  396.648699] RSP: 0018:ffffc90000c87cc8 EFLAGS: 00010246
[  396.649212] RAX: 0000000000000000 RBX: ffff880276350000 RCX: 0000000000000017
[  396.649931] RDX: 000060fd0a6bff00 RSI: 000000000010e9e6 RDI: 00155563b3000000
[  396.650646] RBP: ffffc90000c87d08 R08: 00000000f461b8ce R09: 000000000000006c
[  396.651393] R10: ffffc90000c87e50 R11: 0000000000000000 R12: 0000000000000023
[  396.652100] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
[  396.652831] FS:  0000000000000000(0000) GS:ffff880277b80000(0000) knlGS:0000000000000000
[  396.653652] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  396.654254] CR2: 000060fd0a6bff48 CR3: 000000000200a006 CR4: 0000000000760ee0
[  396.655051] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  396.655841] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  396.656542] PKRU: 55555554
[  396.656808] Call Trace:
[  396.657094]  blk_mq_alloc_request_hctx+0xd1/0x11c
[  396.657599]  ? pcpu_block_update_hint_alloc+0xa1/0x1a5
[  396.658101]  nvme_alloc_request+0x42/0x71
[  396.658517]  __nvme_submit_sync_cmd+0x2d/0xd1
[  396.658945]  nvmf_connect_io_queue+0x10b/0x162 [nvme_fabrics]
[  396.659543]  ? nvme_loop_connect_io_queues+0x2d/0x52 [nvme_loop]
[  396.660126]  nvme_loop_connect_io_queues+0x2d/0x52 [nvme_loop]
[  396.660743]  nvme_loop_reset_ctrl_work+0x62/0xcf [nvme_loop]
[  396.661296]  process_one_work+0x1c9/0x2f6
[  396.661694]  ? rescuer_thread+0x282/0x282
[  396.662080]  process_scheduled_works+0x27/0x2c
[  396.662510]  worker_thread+0x1e7/0x295
[  396.662960]  kthread+0x115/0x11d
[  396.663334]  ? kthread_park+0x76/0x76
[  396.663752]  ret_from_fork+0x35/0x40
[  396.664169] Modules linked in: nvme_loop nvmet nvme_fabrics null_blk scsi_debug isofs iTCO_wdt iTCO_vendor_support i2c_i801 lpc_ich i2c_core mfd_core ip_tables sr_mod cdrom usb_storage sd_mod ahci libahci libata crc32c_intel virtio_scsi qemu_fw_cfg dm_mirror dm_region_hash dm_log dm_mod [last unloaded: scsi_debug]
[  396.667015] Dumping ftrace buffer:
[  396.667348]    (ftrace buffer empty)
[  396.667758] CR2: 000060fd0a6bff48
[  396.668129] ---[ end trace 887712785c99c2ca ]---


Thanks,
Ming

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2] block: fix rdma queue mapping
  2018-08-25  2:17     ` Sagi Grimberg
@ 2018-10-03 19:05       ` Steve Wise
  -1 siblings, 0 replies; 42+ messages in thread
From: Steve Wise @ 2018-10-03 19:05 UTC (permalink / raw)
  To: Sagi Grimberg, Christoph Hellwig
  Cc: linux-block, linux-rdma, linux-nvme, Max Gurtovoy

On 8/24/2018 9:17 PM, Sagi Grimberg wrote:
> 
>>> nvme-rdma attempts to map queues based on irq vector affinity.
>>> However, for some devices, completion vector irq affinity is
>>> configurable by the user which can break the existing assumption
>>> that irq vectors are optimally arranged over the host cpu cores.
>>
>> IFF affinity is configurable we should never use this code,
>> as it breaks the model entirely.  ib_get_vector_affinity should
>> never return a valid mask if affinity is configurable.
> 
> I agree that the model intended initially doesn't fit. But it seems
> that some users like to write into their nic's
> /proc/irq/$IRP/smp_affinity and get mad at us for not letting them with
> using managed affinity.
> 
> So instead of falling back to the block mapping function we try
> to do a little better first:
> 1. map according to the device vector affinity
> 2. map vectors that end up without a mapping to cpus that belong
>    to the same numa-node
> 3. map all the rest of the unmapped cpus like the block layer
>    would do.
> 
> We could have device drivers that don't use managed affinity to never
> return a valid mask but that would never allow affinity based mapping
> which is optimal at least for users that do not mangle with device
> irq affinity (which is probably the majority of users).
> 
> Thoughts?

Can we please make forward progress on this?

Christoph, Sagi:  it seems you think /proc/irq/$IRP/smp_affinity
shouldn't be allowed if drivers support managed affinity. Is that correct?

Perhaps that can be codified and be a way forward?  IE: Somehow allow
the admin to choose either "managed by the driver/ulps" or "managed by
the system admin directly"?

Or just use Sagi's patch.  Perhaps a WARN_ONCE() if the affinity looks
wonked when set via procfs?  Just thinking out loud...

But as it stands, things are just plain borked if an rdma driver
supports ib_get_vector_affinity() yet the admin changes the affinity via
/proc...

Thanks,

Steve.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2] block: fix rdma queue mapping
  2018-10-03 19:05       ` Steve Wise
@ 2018-10-03 21:14         ` Sagi Grimberg
  -1 siblings, 0 replies; 42+ messages in thread
From: Sagi Grimberg @ 2018-10-03 21:14 UTC (permalink / raw)
  To: Steve Wise, Christoph Hellwig
  Cc: linux-block, linux-rdma, linux-nvme, Max Gurtovoy


> Can we please make forward progress on this?
> 
> Christoph, Sagi:  it seems you think /proc/irq/$IRP/smp_affinity
> shouldn't be allowed if drivers support managed affinity. Is that correct?
> 
> Perhaps that can be codified and be a way forward?  IE: Somehow allow
> the admin to choose either "managed by the driver/ulps" or "managed by
> the system admin directly"?
> 
> Or just use Sagi's patch.  Perhaps a WARN_ONCE() if the affinity looks
> wonked when set via procfs?  Just thinking out loud...

I think we should take my patch. While ideally we'd like to have
a perfect linear irq based queue mapping, people apparently really like
to modify device irq affinity. With this, at least some of the queues
are assigned according to the device affinity. It's not ideal, but at
least better than blindly falling back to the default mapping.

Christoph?

If there is strong resistance to the patch we should at least start
by falling back to the default mapping unconditionally.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2] block: fix rdma queue mapping
  2018-10-03 21:14         ` Sagi Grimberg
@ 2018-10-03 21:21           ` Steve Wise
  -1 siblings, 0 replies; 42+ messages in thread
From: Steve Wise @ 2018-10-03 21:21 UTC (permalink / raw)
  To: Sagi Grimberg, Christoph Hellwig
  Cc: linux-block, linux-rdma, linux-nvme, Max Gurtovoy



On 10/3/2018 4:14 PM, Sagi Grimberg wrote:
> 
>> Can we please make forward progress on this?
>>
>> Christoph, Sagi:  it seems you think /proc/irq/$IRP/smp_affinity
>> shouldn't be allowed if drivers support managed affinity. Is that
>> correct?
>>
>> Perhaps that can be codified and be a way forward?  IE: Somehow allow
>> the admin to choose either "managed by the driver/ulps" or "managed by
>> the system admin directly"?
>>
>> Or just use Sagi's patch.  Perhaps a WARN_ONCE() if the affinity looks
>> wonked when set via procfs?  Just thinking out loud...
> 
> I think we should take my patch. While ideally we'd like to have
> perfect linear irq based queue mapping, people apparently really like
> to modify devices irq affinity. With this, at least some of the queues
> are assigned according to the device affinity. Its not ideal, but at
> least better than blindly falling to default mapping.
> 
> Christoph?
> 
> If there is strong resistance to the patch we should at least start
> by falling to the default mapping unconditionally.

Perhaps we should still add a WARN_ONCE if we don't get a linear
mapping?  IE admins are now tweaking this because in the past the
default affinity could hurt performance, and adjusting it via procfs was
the answer.  But now we're making the devices and ULPs smarter, so we
should maybe let the admin know when we think the ideal affinity is not
being achieved.
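
Something like the following, purely as a sketch (hypothetical helper,
not from any posted patch), run after the map has been built:

/* hypothetical: warn once if the final map differs from the naive
 * linear assignment blk_mq_map_queues() would have produced */
static void blk_mq_rdma_warn_nonlinear(struct blk_mq_tag_set *set)
{
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		if (cpu < set->nr_hw_queues && set->mq_map[cpu] != cpu) {
			WARN_ONCE(1, "cpu %u mapped to queue %u, mapping is not linear\n",
				  cpu, set->mq_map[cpu]);
			return;
		}
	}
}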

Steve.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2] block: fix rdma queue mapping
  2018-10-03 21:14         ` Sagi Grimberg
@ 2018-10-16  1:04           ` Sagi Grimberg
  -1 siblings, 0 replies; 42+ messages in thread
From: Sagi Grimberg @ 2018-10-16  1:04 UTC (permalink / raw)
  To: Steve Wise, Christoph Hellwig
  Cc: linux-block, linux-rdma, linux-nvme, Max Gurtovoy


>> Can we please make forward progress on this?
>>
>> Christoph, Sagi:  it seems you think /proc/irq/$IRP/smp_affinity
>> shouldn't be allowed if drivers support managed affinity. Is that 
>> correct?
>>
>> Perhaps that can be codified and be a way forward?  IE: Somehow allow
>> the admin to choose either "managed by the driver/ulps" or "managed by
>> the system admin directly"?
>>
>> Or just use Sagi's patch.  Perhaps a WARN_ONCE() if the affinity looks
>> wonked when set via procfs?  Just thinking out loud...
> 
> I think we should take my patch. While ideally we'd like to have
> perfect linear irq based queue mapping, people apparently really like
> to modify devices irq affinity. With this, at least some of the queues
> are assigned according to the device affinity. Its not ideal, but at
> least better than blindly falling to default mapping.
> 
> Christoph?
> 
> If there is strong resistance to the patch we should at least start
> by falling to the default mapping unconditionally.

Ping?

Would you prefer we fall back to the naive mapping if the device is not
using managed affinity?

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2] block: fix rdma queue mapping
  2018-10-03 19:05       ` Steve Wise
@ 2018-10-17 16:37         ` Christoph Hellwig
  -1 siblings, 0 replies; 42+ messages in thread
From: Christoph Hellwig @ 2018-10-17 16:37 UTC (permalink / raw)
  To: Steve Wise
  Cc: Sagi Grimberg, Christoph Hellwig, linux-block, linux-rdma,
	linux-nvme, Max Gurtovoy

On Wed, Oct 03, 2018 at 02:05:16PM -0500, Steve Wise wrote:
> Christoph, Sagi:  it seems you think /proc/irq/$IRP/smp_affinity
> shouldn't be allowed if drivers support managed affinity. Is that correct?

Not just shouldn't, but simply can't.

> But as it stands, things are just plain borked if an rdma driver
> supports ib_get_vector_affinity() yet the admin changes the affinity via
> /proc...

I think we need to fix ib_get_vector_affinity to not return anything
if the device doesn't use managed irq affinity.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2] block: fix rdma queue mapping
  2018-10-16  1:04           ` Sagi Grimberg
@ 2018-10-17 16:37             ` Christoph Hellwig
  -1 siblings, 0 replies; 42+ messages in thread
From: Christoph Hellwig @ 2018-10-17 16:37 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Steve Wise, Christoph Hellwig, linux-block, linux-rdma,
	linux-nvme, Max Gurtovoy

On Mon, Oct 15, 2018 at 06:04:46PM -0700, Sagi Grimberg wrote:
> Would you prefer we fallback to the naive mapping if the device is not
> using manged affinity?

Yes.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2] block: fix rdma queue mapping
  2018-10-17 16:37         ` Christoph Hellwig
@ 2018-10-23  6:02           ` Sagi Grimberg
  -1 siblings, 0 replies; 42+ messages in thread
From: Sagi Grimberg @ 2018-10-23  6:02 UTC (permalink / raw)
  To: Christoph Hellwig, Steve Wise
  Cc: linux-block, linux-rdma, linux-nvme, Max Gurtovoy


>> Christoph, Sagi:  it seems you think /proc/irq/$IRP/smp_affinity
>> shouldn't be allowed if drivers support managed affinity. Is that correct?
> 
> Not just shouldn't, but simply can't.
> 
>> But as it stands, things are just plain borked if an rdma driver
>> supports ib_get_vector_affinity() yet the admin changes the affinity via
>> /proc...
> 
> I think we need to fix ib_get_vector_affinity to not return anything
> if the device doesn't use managed irq affinity.

Steve, does iw_cxgb4 use managed affinity?

I'll send a patch for mlx5 to simply not return anything as managed
affinity is not something that the maintainers want to do.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* RE: [PATCH v2] block: fix rdma queue mapping
  2018-10-23  6:02           ` Sagi Grimberg
@ 2018-10-23 13:00             ` Steve Wise
  -1 siblings, 0 replies; 42+ messages in thread
From: Steve Wise @ 2018-10-23 13:00 UTC (permalink / raw)
  To: 'Sagi Grimberg', 'Christoph Hellwig'
  Cc: linux-block, linux-rdma, linux-nvme, 'Max Gurtovoy'

> >> Christoph, Sagi:  it seems you think /proc/irq/$IRP/smp_affinity
> >> shouldn't be allowed if drivers support managed affinity. Is that correct?
> >
> > Not just shouldn't, but simply can't.
> >
> >> But as it stands, things are just plain borked if an rdma driver
> >> supports ib_get_vector_affinity() yet the admin changes the affinity via
> >> /proc...
> >
> > I think we need to fix ib_get_vector_affinity to not return anything
> > if the device doesn't use managed irq affinity.
> 
> Steve, does iw_cxgb4 use managed affinity?
> 
> I'll send a patch for mlx5 to simply not return anything as managed
> affinity is not something that the maintainers want to do.

I'm beginning to think I don't know what "managed affinity" actually is.  Currently iw_cxgb4 doesn't support ib_get_vector_affinity().  I have a patch for it, but ran into this whole issue with nvme failing if someone changes the affinity map via /proc.

Steve.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2] block: fix rdma queue mapping
  2018-10-23 13:00             ` Steve Wise
@ 2018-10-23 21:25               ` Sagi Grimberg
  -1 siblings, 0 replies; 42+ messages in thread
From: Sagi Grimberg @ 2018-10-23 21:25 UTC (permalink / raw)
  To: Steve Wise, 'Christoph Hellwig'
  Cc: linux-block, linux-rdma, linux-nvme, 'Max Gurtovoy'


>>>> Christoph, Sagi:  it seems you think /proc/irq/$IRP/smp_affinity
>>>> shouldn't be allowed if drivers support managed affinity. Is that correct?
>>>
>>> Not just shouldn't, but simply can't.
>>>
>>>> But as it stands, things are just plain borked if an rdma driver
>>>> supports ib_get_vector_affinity() yet the admin changes the affinity via
>>>> /proc...
>>>
>>> I think we need to fix ib_get_vector_affinity to not return anything
>>> if the device doesn't use managed irq affinity.
>>
>> Steve, does iw_cxgb4 use managed affinity?
>>
>> I'll send a patch for mlx5 to simply not return anything as managed
>> affinity is not something that the maintainers want to do.
> 
> I'm beginning to think I don't know what "managed affinity" actually is.  Currently iw_cxgb4 doesn't support ib_get_vector_affinity().  I have a patch for it, but ran into this whole issue with nvme failing if someone changes the affinity map via /proc.

That means that the pci subsystem sets your vector(s) affinity correctly
and keeps it immutable. It also guarantees that you get reserved
vectors rather than a best-effort assignment when cpu cores are
offlined.

You can simply enable it by adding PCI_IRQ_AFFINITY to
pci_alloc_irq_vectors(), or call pci_alloc_irq_vectors_affinity()
to communicate pre/post vectors that don't participate in
affinitization (nvme uses this for the admin queue).

This way you can easily plug ->get_vector_affinity() to return
pci_irq_get_affinity(dev, vector).

The original patch set from hch:
https://lwn.net/Articles/693653/
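
To illustrate, a minimal sketch of the driver side (the "foo" device,
its structures and the vector layout are hypothetical; error handling
is elided):

static int foo_alloc_comp_vectors(struct foo_dev *fdev, unsigned int max_vecs)
{
	/* keep one pre-vector (e.g. async events) out of the spread */
	struct irq_affinity affd = { .pre_vectors = 1 };

	/* PCI_IRQ_AFFINITY lets the core spread and pin the vectors */
	return pci_alloc_irq_vectors_affinity(fdev->pdev, 2, max_vecs + 1,
			PCI_IRQ_MSIX | PCI_IRQ_AFFINITY, &affd);
}

static const struct cpumask *
foo_get_vector_affinity(struct ib_device *ibdev, int comp_vector)
{
	struct foo_dev *fdev = container_of(ibdev, struct foo_dev, ibdev);

	/* +1 skips the non-affinitized pre-vector */
	return pci_irq_get_affinity(fdev->pdev, comp_vector + 1);
}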

^ permalink raw reply	[flat|nested] 42+ messages in thread

* RE: [PATCH v2] block: fix rdma queue mapping
  2018-10-23 21:25               ` Sagi Grimberg
@ 2018-10-23 21:31                 ` Steve Wise
  -1 siblings, 0 replies; 42+ messages in thread
From: Steve Wise @ 2018-10-23 21:31 UTC (permalink / raw)
  To: 'Sagi Grimberg', 'Christoph Hellwig'
  Cc: linux-block, linux-rdma, linux-nvme, 'Max Gurtovoy'



> -----Original Message-----
> From: Sagi Grimberg <sagi@grimberg.me>
> Sent: Tuesday, October 23, 2018 4:25 PM
> To: Steve Wise <swise@opengridcomputing.com>; 'Christoph Hellwig'
> <hch@lst.de>
> Cc: linux-block@vger.kernel.org; linux-rdma@vger.kernel.org; linux-
> nvme@lists.infradead.org; 'Max Gurtovoy' <maxg@mellanox.com>
> Subject: Re: [PATCH v2] block: fix rdma queue mapping
> 
> 
> >>>> Christoph, Sagi:  it seems you think /proc/irq/$IRP/smp_affinity
> >>>> shouldn't be allowed if drivers support managed affinity. Is that
> correct?
> >>>
> >>> Not just shouldn't, but simply can't.
> >>>
> >>>> But as it stands, things are just plain borked if an rdma driver
> >>>> supports ib_get_vector_affinity() yet the admin changes the affinity via
> >>>> /proc...
> >>>
> >>> I think we need to fix ib_get_vector_affinity to not return anything
> >>> if the device doesn't use managed irq affinity.
> >>
> >> Steve, does iw_cxgb4 use managed affinity?
> >>
> >> I'll send a patch for mlx5 to simply not return anything as managed
> >> affinity is not something that the maintainers want to do.
> >
> > I'm beginning to think I don't know what "managed affinity" actually is.
> Currently iw_cxgb4 doesn't support ib_get_vector_affinity().  I have a patch
> for it, but ran into this whole issue with nvme failing if someone changes the
> affinity map via /proc.
> 
> That means that the pci subsystem gets your vector(s) affinity right and
> immutable. It also guarantees that you have reserved vectors and not get
> a best effort assignment when cpu cores are offlined.
> 
> You can simply enable it by adding PCI_IRQ_AFFINITY to
> pci_alloc_irq_vectors() or call pci_alloc_irq_vectors_affinity()
> to communicate post/pre vectors that don't participate in
> affinitization (nvme uses it for admin queue).
> 
> This way you can easily plug ->get_vector_affinity() to return
> pci_irq_get_affinity(dev, vector)
> 
> The original patch set from hch:
> https://lwn.net/Articles/693653/

Thanks for educating me. 😊  I'll have a look into this. 

Steve.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2] block: fix rdma queue mapping
  2018-10-23 21:25               ` Sagi Grimberg
@ 2018-10-24  0:09                 ` Shiraz Saleem
  -1 siblings, 0 replies; 42+ messages in thread
From: Shiraz Saleem @ 2018-10-24  0:09 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: linux-rdma, Steve Wise, linux-nvme, linux-block,
	'Max Gurtovoy', 'Christoph Hellwig'

On Tue, Oct 23, 2018 at 03:25:06PM -0600, Sagi Grimberg wrote:
> 
> >>>> Christoph, Sagi:  it seems you think /proc/irq/$IRP/smp_affinity
> >>>> shouldn't be allowed if drivers support managed affinity. Is that correct?
> >>>
> >>> Not just shouldn't, but simply can't.
> >>>
> >>>> But as it stands, things are just plain borked if an rdma driver
> >>>> supports ib_get_vector_affinity() yet the admin changes the affinity via
> >>>> /proc...
> >>>
> >>> I think we need to fix ib_get_vector_affinity to not return anything
> >>> if the device doesn't use managed irq affinity.
> >>
> >> Steve, does iw_cxgb4 use managed affinity?
> >>
> >> I'll send a patch for mlx5 to simply not return anything as managed
> >> affinity is not something that the maintainers want to do.
> > 
> > I'm beginning to think I don't know what "managed affinity" actually is.  Currently iw_cxgb4 doesn't support ib_get_vector_affinity().  I have a patch for it, but ran into this whole issue with nvme failing if someone changes the affinity map via /proc.
> 
> That means that the pci subsystem gets your vector(s) affinity right and
> immutable. It also guarantees that you have reserved vectors and not get
> a best effort assignment when cpu cores are offlined.
> 
> You can simply enable it by adding PCI_IRQ_AFFINITY to
> pci_alloc_irq_vectors() or call pci_alloc_irq_vectors_affinity()
> to communicate post/pre vectors that don't participate in
> affinitization (nvme uses it for admin queue).
> 
> This way you can easily plug ->get_vector_affinity() to return
> pci_irq_get_affinity(dev, vector)
> 
> The original patch set from hch:
> https://lwn.net/Articles/693653/

Sagi - From what I can tell, i40iw is also exposed to this same issue if the IRQ affinity
is configured by user.


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2] block: fix rdma queue mapping
  2018-10-24  0:09                 ` Shiraz Saleem
@ 2018-10-24  0:37                   ` Sagi Grimberg
  -1 siblings, 0 replies; 42+ messages in thread
From: Sagi Grimberg @ 2018-10-24  0:37 UTC (permalink / raw)
  To: Shiraz Saleem
  Cc: linux-rdma, Steve Wise, linux-nvme, linux-block,
	'Max Gurtovoy', 'Christoph Hellwig'


> Sagi - From what I can tell, i40iw is also exposed to this same issue if the IRQ affinity
> is configured by user.

managed affinity does not allow setting smp_affinity from userspace.
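
As a quick illustration (not from this thread): with managed affinity the
kernel rejects procfs affinity writes, so a userspace attempt like the one
below is expected to fail (typically with EIO). The IRQ number is just a
placeholder for a managed completion vector.

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	/* pass the IRQ number of a managed completion vector */
	char path[64];
	int fd;

	snprintf(path, sizeof(path), "/proc/irq/%s/smp_affinity",
		 argc > 1 ? argv[1] : "42");

	fd = open(path, O_WRONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "2", 1) < 0)	/* rejected for managed vectors */
		fprintf(stderr, "%s: %s\n", path, strerror(errno));
	close(fd);
	return 0;
}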

^ permalink raw reply	[flat|nested] 42+ messages in thread

* RE: [PATCH v2] block: fix rdma queue mapping
  2018-10-24  0:37                   ` Sagi Grimberg
@ 2018-10-29 23:58                     ` Saleem, Shiraz
  -1 siblings, 0 replies; 42+ messages in thread
From: Saleem, Shiraz @ 2018-10-29 23:58 UTC (permalink / raw)
  To: Sagi Grimberg
  Cc: Steve Wise, 'Christoph Hellwig',
	linux-block, linux-rdma, linux-nvme, 'Max Gurtovoy'

>Subject: Re: [PATCH v2] block: fix rdma queue mapping
>
>
>> Sagi - From what I can tell, i40iw is also exposed to this same issue
>> if the IRQ affinity is configured by user.
>
>managed affinity does not allow setting smp_affinity from userspace.

OK. But our device IRQs are not set-up as to be managed affinity based and can be tuned by user.
And it seems that is reasonable approach since we can never tell ahead of time what might be an optimal config. for a particular workload.

Shiraz

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2] block: fix rdma queue mapping
  2018-10-29 23:58                     ` Saleem, Shiraz
@ 2018-10-30 18:26                       ` Sagi Grimberg
  -1 siblings, 0 replies; 42+ messages in thread
From: Sagi Grimberg @ 2018-10-30 18:26 UTC (permalink / raw)
  To: Saleem, Shiraz
  Cc: Steve Wise, 'Christoph Hellwig',
	linux-block, linux-rdma, linux-nvme, 'Max Gurtovoy'


>> Subject: Re: [PATCH v2] block: fix rdma queue mapping
>>
>>
>>> Sagi - From what I can tell, i40iw is also exposed to this same issue
>>> if the IRQ affinity is configured by user.
>>
>> managed affinity does not allow setting smp_affinity from userspace.
> 
> OK. But our device IRQs are not set-up as to be managed affinity based and can be tuned by user.
> And it seems that is reasonable approach since we can never tell ahead of time what might be an optimal config. for a particular workload.

I understand, but that's not necessarily the case: the vast majority of
users don't touch it (or at least don't want to), and managed affinity
gives you the benefits of good-practice linear assignment (among others).

The same argument can be made for a number of pcie devices that do use
managed affinity, but there the choice was to get it right from the start.
Still not sure why (r)nics are different in that respect.

Anyway, I was just asking, not telling you to go and use it.

^ permalink raw reply	[flat|nested] 42+ messages in thread

end of thread, other threads:[~2018-10-31  3:21 UTC | newest]

Thread overview: 42+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-08-20 20:54 [PATCH v2] block: fix rdma queue mapping Sagi Grimberg
2018-08-21  2:04 ` Ming Lei
2018-08-25  2:06   ` Sagi Grimberg
2018-08-25 12:18     ` Steve Wise
2018-08-27  3:50       ` Ming Lei
2018-08-22 13:11 ` Christoph Hellwig
2018-08-25  2:17   ` Sagi Grimberg
2018-10-03 19:05     ` Steve Wise
2018-10-03 21:14       ` Sagi Grimberg
2018-10-03 21:21         ` Steve Wise
2018-10-16  1:04         ` Sagi Grimberg
2018-10-17 16:37           ` Christoph Hellwig
2018-10-17 16:37       ` Christoph Hellwig
2018-10-23  6:02         ` Sagi Grimberg
2018-10-23 13:00           ` Steve Wise
2018-10-23 21:25             ` Sagi Grimberg
2018-10-23 21:31               ` Steve Wise
2018-10-24  0:09               ` Shiraz Saleem
2018-10-24  0:37                 ` Sagi Grimberg
2018-10-29 23:58                   ` Saleem, Shiraz
2018-10-30 18:26                     ` Sagi Grimberg
