[patch v6 0/7] genirq/affinity: Overhaul the multiple interrupt sets support

linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [patch v6 0/7] genirq/affinity: Overhaul the multiple interrupt sets support
@ 2019-02-16 17:13 Thomas Gleixner
  2019-02-16 17:13 ` [patch v6 1/7] genirq/affinity: Code consolidation Thomas Gleixner
                   ` (7 more replies)
  0 siblings, 8 replies; 27+ messages in thread
From: Thomas Gleixner @ 2019-02-16 17:13 UTC (permalink / raw)
  To: LKML
  Cc: Ming Lei, Christoph Hellwig, Bjorn Helgaas, Jens Axboe,
	linux-block, Sagi Grimberg, linux-nvme, linux-pci, Keith Busch,
	Marc Zyngier, Sumit Saxena, Kashyap Desai,
	Shivasharan Srikanteshwara

This is the final update to the series with a few corner cases fixes
vs. V5 which can be found here:

   https://lkml.kernel.org/r/20190214204755.819014197@linutronix.de

The series applies against:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git master

and is also available from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git WIP.irq

Changes vs. V5:

  - Change the invocation for the driver callback so it is invoked even
    when there are no interrupts left to spread out. That ensures that the
    driver can adjust to the situation (in case of NVME a single interrupt)

  - Make sure the callback is invoked in the legacy irq fallback case so
    the driver is not in a stale state from a failed MSI[X} allocation
    attempt.

  - Fix the adjustment logic in the NVME driver as pointed out by Ming and
    Marc, plus another corner case I found during testing.

  - Simplify the unmanaged set support

Thanks,

	tglx

8<-------------
 drivers/nvme/host/pci.c         |  117 +++++++++++++---------------------------
 drivers/pci/msi.c               |   39 +++++--------
 drivers/scsi/be2iscsi/be_main.c |    2 
 include/linux/interrupt.h       |   35 ++++++++---
 include/linux/pci.h             |    4 -
 kernel/irq/affinity.c           |  116 ++++++++++++++++++++++++---------------
 6 files changed, 153 insertions(+), 160 deletions(-)



^ permalink raw reply	[flat|nested] 27+ messages in thread

* [patch v6 1/7] genirq/affinity: Code consolidation
  2019-02-16 17:13 [patch v6 0/7] genirq/affinity: Overhaul the multiple interrupt sets support Thomas Gleixner
@ 2019-02-16 17:13 ` Thomas Gleixner
  2019-02-17 13:36   ` Ming Lei
  2019-02-18 10:25   ` [tip:irq/core] " tip-bot for Thomas Gleixner
  2019-02-16 17:13 ` [patch v6 2/7] genirq/affinity: Store interrupt sets size in struct irq_affinity Thomas Gleixner
                   ` (6 subsequent siblings)
  7 siblings, 2 replies; 27+ messages in thread
From: Thomas Gleixner @ 2019-02-16 17:13 UTC (permalink / raw)
  To: LKML
  Cc: Ming Lei, Christoph Hellwig, Bjorn Helgaas, Jens Axboe,
	linux-block, Sagi Grimberg, linux-nvme, linux-pci, Keith Busch,
	Marc Zyngier, Sumit Saxena, Kashyap Desai,
	Shivasharan Srikanteshwara

All information and calculations in the interrupt affinity spreading code
is strictly unsigned int. Though the code uses int all over the place.

Convert it over to unsigned int.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/interrupt.h |   20 +++++++++-------
 kernel/irq/affinity.c     |   56 ++++++++++++++++++++++------------------------
 2 files changed, 38 insertions(+), 38 deletions(-)

--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -251,10 +251,10 @@ struct irq_affinity_notify {
  * @sets:		Number of affinitized sets
  */
 struct irq_affinity {
-	int	pre_vectors;
-	int	post_vectors;
-	int	nr_sets;
-	int	*sets;
+	unsigned int	pre_vectors;
+	unsigned int	post_vectors;
+	unsigned int	nr_sets;
+	unsigned int	*sets;
 };
 
 /**
@@ -314,9 +314,10 @@ extern int
 irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify);
 
 struct irq_affinity_desc *
-irq_create_affinity_masks(int nvec, const struct irq_affinity *affd);
+irq_create_affinity_masks(unsigned int nvec, const struct irq_affinity *affd);
 
-int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity *affd);
+unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec,
+				       const struct irq_affinity *affd);
 
 #else /* CONFIG_SMP */
 
@@ -350,13 +351,14 @@ irq_set_affinity_notifier(unsigned int i
 }
 
 static inline struct irq_affinity_desc *
-irq_create_affinity_masks(int nvec, const struct irq_affinity *affd)
+irq_create_affinity_masks(unsigned int nvec, const struct irq_affinity *affd)
 {
 	return NULL;
 }
 
-static inline int
-irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity *affd)
+static inline unsigned int
+irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec,
+			  const struct irq_affinity *affd)
 {
 	return maxvec;
 }
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -9,7 +9,7 @@
 #include <linux/cpu.h>
 
 static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
-				int cpus_per_vec)
+				unsigned int cpus_per_vec)
 {
 	const struct cpumask *siblmsk;
 	int cpu, sibl;
@@ -95,15 +95,17 @@ static int get_nodes_in_cpumask(cpumask_
 }
 
 static int __irq_build_affinity_masks(const struct irq_affinity *affd,
-				      int startvec, int numvecs, int firstvec,
+				      unsigned int startvec,
+				      unsigned int numvecs,
+				      unsigned int firstvec,
 				      cpumask_var_t *node_to_cpumask,
 				      const struct cpumask *cpu_mask,
 				      struct cpumask *nmsk,
 				      struct irq_affinity_desc *masks)
 {
-	int n, nodes, cpus_per_vec, extra_vecs, done = 0;
-	int last_affv = firstvec + numvecs;
-	int curvec = startvec;
+	unsigned int n, nodes, cpus_per_vec, extra_vecs, done = 0;
+	unsigned int last_affv = firstvec + numvecs;
+	unsigned int curvec = startvec;
 	nodemask_t nodemsk = NODE_MASK_NONE;
 
 	if (!cpumask_weight(cpu_mask))
@@ -117,18 +119,16 @@ static int __irq_build_affinity_masks(co
 	 */
 	if (numvecs <= nodes) {
 		for_each_node_mask(n, nodemsk) {
-			cpumask_or(&masks[curvec].mask,
-					&masks[curvec].mask,
-					node_to_cpumask[n]);
+			cpumask_or(&masks[curvec].mask, &masks[curvec].mask,
+				   node_to_cpumask[n]);
 			if (++curvec == last_affv)
 				curvec = firstvec;
 		}
-		done = numvecs;
-		goto out;
+		return numvecs;
 	}
 
 	for_each_node_mask(n, nodemsk) {
-		int ncpus, v, vecs_to_assign, vecs_per_node;
+		unsigned int ncpus, v, vecs_to_assign, vecs_per_node;
 
 		/* Spread the vectors per node */
 		vecs_per_node = (numvecs - (curvec - firstvec)) / nodes;
@@ -163,8 +163,6 @@ static int __irq_build_affinity_masks(co
 			curvec = firstvec;
 		--nodes;
 	}
-
-out:
 	return done;
 }
 
@@ -174,13 +172,14 @@ static int __irq_build_affinity_masks(co
  *	2) spread other possible CPUs on these vectors
  */
 static int irq_build_affinity_masks(const struct irq_affinity *affd,
-				    int startvec, int numvecs, int firstvec,
+				    unsigned int startvec, unsigned int numvecs,
+				    unsigned int firstvec,
 				    struct irq_affinity_desc *masks)
 {
-	int curvec = startvec, nr_present, nr_others;
-	int ret = -ENOMEM;
-	cpumask_var_t nmsk, npresmsk;
+	unsigned int curvec = startvec, nr_present, nr_others;
 	cpumask_var_t *node_to_cpumask;
+	cpumask_var_t nmsk, npresmsk;
+	int ret = -ENOMEM;
 
 	if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
 		return ret;
@@ -239,12 +238,10 @@ static int irq_build_affinity_masks(cons
  * Returns the irq_affinity_desc pointer or NULL if allocation failed.
  */
 struct irq_affinity_desc *
-irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
+irq_create_affinity_masks(unsigned int nvecs, const struct irq_affinity *affd)
 {
-	int affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
-	int curvec, usedvecs;
+	unsigned int affvecs, curvec, usedvecs, nr_sets, i;
 	struct irq_affinity_desc *masks = NULL;
-	int i, nr_sets;
 
 	/*
 	 * If there aren't any vectors left after applying the pre/post
@@ -264,16 +261,17 @@ irq_create_affinity_masks(int nvecs, con
 	 * Spread on present CPUs starting from affd->pre_vectors. If we
 	 * have multiple sets, build each sets affinity mask separately.
 	 */
+	affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
 	nr_sets = affd->nr_sets;
 	if (!nr_sets)
 		nr_sets = 1;
 
 	for (i = 0, usedvecs = 0; i < nr_sets; i++) {
-		int this_vecs = affd->sets ? affd->sets[i] : affvecs;
+		unsigned int this_vecs = affd->sets ? affd->sets[i] : affvecs;
 		int ret;
 
 		ret = irq_build_affinity_masks(affd, curvec, this_vecs,
-						curvec, masks);
+					       curvec, masks);
 		if (ret) {
 			kfree(masks);
 			return NULL;
@@ -303,17 +301,17 @@ irq_create_affinity_masks(int nvecs, con
  * @maxvec:	The maximum number of vectors available
  * @affd:	Description of the affinity requirements
  */
-int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity *affd)
+unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec,
+				       const struct irq_affinity *affd)
 {
-	int resv = affd->pre_vectors + affd->post_vectors;
-	int vecs = maxvec - resv;
-	int set_vecs;
+	unsigned int resv = affd->pre_vectors + affd->post_vectors;
+	unsigned int set_vecs;
 
 	if (resv > minvec)
 		return 0;
 
 	if (affd->nr_sets) {
-		int i;
+		unsigned int i;
 
 		for (i = 0, set_vecs = 0;  i < affd->nr_sets; i++)
 			set_vecs += affd->sets[i];
@@ -323,5 +321,5 @@ int irq_calc_affinity_vectors(int minvec
 		put_online_cpus();
 	}
 
-	return resv + min(set_vecs, vecs);
+	return resv + min(set_vecs, maxvec - resv);
 }



^ permalink raw reply	[flat|nested] 27+ messages in thread

* [patch v6 2/7] genirq/affinity: Store interrupt sets size in struct irq_affinity
  2019-02-16 17:13 [patch v6 0/7] genirq/affinity: Overhaul the multiple interrupt sets support Thomas Gleixner
  2019-02-16 17:13 ` [patch v6 1/7] genirq/affinity: Code consolidation Thomas Gleixner
@ 2019-02-16 17:13 ` Thomas Gleixner
  2019-02-18 10:26   ` [tip:irq/core] " tip-bot for Ming Lei
  2019-02-16 17:13 ` [patch v6 3/7] genirq/affinity: Add new callback for (re)calculating interrupt sets Thomas Gleixner
                   ` (5 subsequent siblings)
  7 siblings, 1 reply; 27+ messages in thread
From: Thomas Gleixner @ 2019-02-16 17:13 UTC (permalink / raw)
  To: LKML
  Cc: Ming Lei, Christoph Hellwig, Bjorn Helgaas, Jens Axboe,
	linux-block, Sagi Grimberg, linux-nvme, linux-pci, Keith Busch,
	Marc Zyngier, Sumit Saxena, Kashyap Desai,
	Shivasharan Srikanteshwara

From: Ming Lei <ming.lei@redhat.com>

The interrupt affinity spreading mechanism supports to spread out
affinities for one or more interrupt sets. A interrupt set contains one
or more interrupts. Each set is mapped to a specific functionality of a
device, e.g. general I/O queues and read I/O queus of multiqueue block
devices.

The number of interrupts per set is defined by the driver. It depends on
the total number of available interrupts for the device, which is
determined by the PCI capabilites and the availability of underlying CPU
resources, and the number of queues which the device provides and the
driver wants to instantiate.

The driver passes initial configuration for the interrupt allocation via
a pointer to struct irq_affinity.

Right now the allocation mechanism is complex as it requires to have a
loop in the driver to determine the maximum number of interrupts which
are provided by the PCI capabilities and the underlying CPU resources.
This loop would have to be replicated in every driver which wants to
utilize this mechanism. That's unwanted code duplication and error
prone.

In order to move this into generic facilities it is required to have a
mechanism, which allows the recalculation of the interrupt sets and
their size, in the core code. As the core code does not have any
knowledge about the underlying device, a driver specific callback will
be added to struct affinity_desc, which will be invoked by the core
code. The callback will get the number of available interupts as an
argument, so the driver can calculate the corresponding number and size
of interrupt sets.

To support this, two modifications for the handling of struct irq_affinity
are required:

1) The (optional) interrupt sets size information is contained in a
   separate array of integers and struct irq_affinity contains a
   pointer to it.

   This is cumbersome and as the maximum number of interrupt sets is small,
   there is no reason to have separate storage. Moving the size array into
   struct affinity_desc avoids indirections and makes the code simpler.

2) At the moment the struct irq_affinity pointer which is handed in from
   the driver and passed through to several core functions is marked
   'const'.

   With the upcoming callback to recalculate the number and size of
   interrupt sets, it's necessary to remove the 'const'
   qualifier. Otherwise the callback would not be able to update the data.

Implement #1 and store the interrupt sets size in 'struct irq_affinity'.

No functional change.

[ tglx: Fixed the memcpy() size so it won't copy beyond the size of the
  	source. Fixed the kernel doc comments for struct irq_affinity and
  	de-'This patch'-ed the changelog ]

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

---
 drivers/nvme/host/pci.c   |    7 +++----
 include/linux/interrupt.h |    9 ++++++---
 kernel/irq/affinity.c     |   16 ++++++++++++----
 3 files changed, 21 insertions(+), 11 deletions(-)

--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2081,12 +2081,11 @@ static void nvme_calc_io_queues(struct n
 static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
 {
 	struct pci_dev *pdev = to_pci_dev(dev->dev);
-	int irq_sets[2];
 	struct irq_affinity affd = {
-		.pre_vectors = 1,
-		.nr_sets = ARRAY_SIZE(irq_sets),
-		.sets = irq_sets,
+		.pre_vectors	= 1,
+		.nr_sets	= 2,
 	};
+	unsigned int *irq_sets = affd.set_size;
 	int result = 0;
 	unsigned int irq_queues, this_p_queues;

--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -241,20 +241,23 @@ struct irq_affinity_notify {
 	void (*release)(struct kref *ref);
 };

+#define	IRQ_AFFINITY_MAX_SETS  4
+
 /**
  * struct irq_affinity - Description for automatic irq affinity assignements
  * @pre_vectors:	Don't apply affinity to @pre_vectors at beginning of
  *			the MSI(-X) vector space
  * @post_vectors:	Don't apply affinity to @post_vectors at end of
  *			the MSI(-X) vector space
- * @nr_sets:		Length of passed in *sets array
- * @sets:		Number of affinitized sets
+ * @nr_sets:		The number of interrupt sets for which affinity
+ *			spreading is required
+ * @set_size:		Array holding the size of each interrupt set
  */
 struct irq_affinity {
 	unsigned int	pre_vectors;
 	unsigned int	post_vectors;
 	unsigned int	nr_sets;
-	unsigned int	*sets;
+	unsigned int	set_size[IRQ_AFFINITY_MAX_SETS];
 };

 /**
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -238,9 +238,10 @@ static int irq_build_affinity_masks(cons
  * Returns the irq_affinity_desc pointer or NULL if allocation failed.
  */
 struct irq_affinity_desc *
-irq_create_affinity_masks(unsigned int nvecs, const struct irq_affinity *affd)
+irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
 {
 	unsigned int affvecs, curvec, usedvecs, nr_sets, i;
+	unsigned int set_size[IRQ_AFFINITY_MAX_SETS];
 	struct irq_affinity_desc *masks = NULL;

 	/*
@@ -250,6 +251,9 @@ irq_create_affinity_masks(unsigned int n
 	if (nvecs == affd->pre_vectors + affd->post_vectors)
 		return NULL;

+	if (WARN_ON_ONCE(affd->nr_sets > IRQ_AFFINITY_MAX_SETS))
+		return NULL;
+
 	masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL);
 	if (!masks)
 		return NULL;
@@ -263,11 +267,15 @@ irq_create_affinity_masks(unsigned int n
 	 */
 	affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
 	nr_sets = affd->nr_sets;
-	if (!nr_sets)
+	if (!nr_sets) {
 		nr_sets = 1;
+		set_size[0] = affvecs;
+	} else {
+		memcpy(set_size, affd->set_size, nr_sets * sizeof(unsigned int));
+	}

 	for (i = 0, usedvecs = 0; i < nr_sets; i++) {
-		unsigned int this_vecs = affd->sets ? affd->sets[i] : affvecs;
+		unsigned int this_vecs = set_size[i];
 		int ret;

 		ret = irq_build_affinity_masks(affd, curvec, this_vecs,
@@ -314,7 +322,7 @@ unsigned int irq_calc_affinity_vectors(u
 		unsigned int i;

 		for (i = 0, set_vecs = 0;  i < affd->nr_sets; i++)
-			set_vecs += affd->sets[i];
+			set_vecs += affd->set_size[i];
 	} else {
 		get_online_cpus();
 		set_vecs = cpumask_weight(cpu_possible_mask);

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [patch v6 3/7] genirq/affinity: Add new callback for (re)calculating interrupt sets
  2019-02-16 17:13 [patch v6 0/7] genirq/affinity: Overhaul the multiple interrupt sets support Thomas Gleixner
  2019-02-16 17:13 ` [patch v6 1/7] genirq/affinity: Code consolidation Thomas Gleixner
  2019-02-16 17:13 ` [patch v6 2/7] genirq/affinity: Store interrupt sets size in struct irq_affinity Thomas Gleixner
@ 2019-02-16 17:13 ` Thomas Gleixner
  2019-02-18 10:27   ` [tip:irq/core] " tip-bot for Ming Lei
  2021-06-15 19:57   ` [patch v6 3/7] " Bjorn Helgaas
  2019-02-16 17:13 ` [patch v6 4/7] nvme-pci: Simplify interrupt allocation Thomas Gleixner
                   ` (4 subsequent siblings)
  7 siblings, 2 replies; 27+ messages in thread
From: Thomas Gleixner @ 2019-02-16 17:13 UTC (permalink / raw)
  To: LKML
  Cc: Ming Lei, Christoph Hellwig, Bjorn Helgaas, Jens Axboe,
	linux-block, Sagi Grimberg, linux-nvme, linux-pci, Keith Busch,
	Marc Zyngier, Sumit Saxena, Kashyap Desai,
	Shivasharan Srikanteshwara

From: Ming Lei <ming.lei@redhat.com>

The interrupt affinity spreading mechanism supports to spread out
affinities for one or more interrupt sets. A interrupt set contains one or
more interrupts. Each set is mapped to a specific functionality of a
device, e.g. general I/O queues and read I/O queus of multiqueue block
devices.

The number of interrupts per set is defined by the driver. It depends on
the total number of available interrupts for the device, which is
determined by the PCI capabilites and the availability of underlying CPU
resources, and the number of queues which the device provides and the
driver wants to instantiate.

The driver passes initial configuration for the interrupt allocation via a
pointer to struct irq_affinity.

Right now the allocation mechanism is complex as it requires to have a loop
in the driver to determine the maximum number of interrupts which are
provided by the PCI capabilities and the underlying CPU resources.  This
loop would have to be replicated in every driver which wants to utilize
this mechanism. That's unwanted code duplication and error prone.

In order to move this into generic facilities it is required to have a
mechanism, which allows the recalculation of the interrupt sets and their
size, in the core code. As the core code does not have any knowledge about the
underlying device, a driver specific callback is required in struct
irq_affinity, which can be invoked by the core code. The callback gets the
number of available interupts as an argument, so the driver can calculate the
corresponding number and size of interrupt sets.

At the moment the struct irq_affinity pointer which is handed in from the
driver and passed through to several core functions is marked 'const', but for
the callback to be able to modify the data in the struct it's required to
remove the 'const' qualifier.

Add the optional callback to struct irq_affinity, which allows drivers to
recalculate the number and size of interrupt sets and remove the 'const'
qualifier.

For simple invocations, which do not supply a callback, a default callback
is installed, which just sets nr_sets to 1 and transfers the number of
spreadable vectors to the set_size array at index 0.

This is for now guarded by a check for nr_sets != 0 to keep the NVME driver
working until it is converted to the callback mechanism.

To make sure that the driver configuration is correct under all circumstances
the callback is invoked even when there are no interrupts for queues left,
i.e. the pre/post requirements already exhaust the numner of available
interrupts.

At the PCI layer irq_create_affinity_masks() has to be invoked even for the
case where the legacy interrupt is used. That ensures that the callback is
invoked and the device driver can adjust to that situation.

[ tglx: Fixed the simple case (no sets required). Moved the sanity check
  	for nr_sets after the invocation of the callback so it catches
  	broken drivers. Fixed the kernel doc comments for struct
  	irq_affinity and de-'This patch'-ed the changelog ]

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

---
 drivers/pci/msi.c               |   25 ++++++++++------
 drivers/scsi/be2iscsi/be_main.c |    2 -
 include/linux/interrupt.h       |   10 +++++-
 include/linux/pci.h             |    4 +-
 kernel/irq/affinity.c           |   62 ++++++++++++++++++++++++++++------------
 5 files changed, 71 insertions(+), 32 deletions(-)

Index: b/drivers/pci/msi.c
===================================================================
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -532,7 +532,7 @@ static int populate_msi_sysfs(struct pci
 }
 
 static struct msi_desc *
-msi_setup_entry(struct pci_dev *dev, int nvec, const struct irq_affinity *affd)
+msi_setup_entry(struct pci_dev *dev, int nvec, struct irq_affinity *affd)
 {
 	struct irq_affinity_desc *masks = NULL;
 	struct msi_desc *entry;
@@ -597,7 +597,7 @@ static int msi_verify_entries(struct pci
  * which could have been allocated.
  */
 static int msi_capability_init(struct pci_dev *dev, int nvec,
-			       const struct irq_affinity *affd)
+			       struct irq_affinity *affd)
 {
 	struct msi_desc *entry;
 	int ret;
@@ -669,7 +669,7 @@ static void __iomem *msix_map_region(str
 
 static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
 			      struct msix_entry *entries, int nvec,
-			      const struct irq_affinity *affd)
+			      struct irq_affinity *affd)
 {
 	struct irq_affinity_desc *curmsk, *masks = NULL;
 	struct msi_desc *entry;
@@ -736,7 +736,7 @@ static void msix_program_entries(struct
  * requested MSI-X entries with allocated irqs or non-zero for otherwise.
  **/
 static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries,
-				int nvec, const struct irq_affinity *affd)
+				int nvec, struct irq_affinity *affd)
 {
 	int ret;
 	u16 control;
@@ -932,7 +932,7 @@ int pci_msix_vec_count(struct pci_dev *d
 EXPORT_SYMBOL(pci_msix_vec_count);
 
 static int __pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries,
-			     int nvec, const struct irq_affinity *affd)
+			     int nvec, struct irq_affinity *affd)
 {
 	int nr_entries;
 	int i, j;
@@ -1018,7 +1018,7 @@ int pci_msi_enabled(void)
 EXPORT_SYMBOL(pci_msi_enabled);
 
 static int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec,
-				  const struct irq_affinity *affd)
+				  struct irq_affinity *affd)
 {
 	int nvec;
 	int rc;
@@ -1086,7 +1086,7 @@ EXPORT_SYMBOL(pci_enable_msi);
 
 static int __pci_enable_msix_range(struct pci_dev *dev,
 				   struct msix_entry *entries, int minvec,
-				   int maxvec, const struct irq_affinity *affd)
+				   int maxvec, struct irq_affinity *affd)
 {
 	int rc, nvec = maxvec;
 
@@ -1165,9 +1165,9 @@ EXPORT_SYMBOL(pci_enable_msix_range);
  */
 int pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs,
 				   unsigned int max_vecs, unsigned int flags,
-				   const struct irq_affinity *affd)
+				   struct irq_affinity *affd)
 {
-	static const struct irq_affinity msi_default_affd;
+	struct irq_affinity msi_default_affd = {0};
 	int msix_vecs = -ENOSPC;
 	int msi_vecs = -ENOSPC;
 
@@ -1196,6 +1196,13 @@ int pci_alloc_irq_vectors_affinity(struc
 	/* use legacy irq if allowed */
 	if (flags & PCI_IRQ_LEGACY) {
 		if (min_vecs == 1 && dev->irq) {
+			/*
+			 * Invoke the affinity spreading logic to ensure that
+			 * the device driver can adjust queue configuration
+			 * for the single interrupt case.
+			 */
+			if (affd)
+				irq_create_affinity_masks(1, affd);
 			pci_intx(dev, 1);
 			return 1;
 		}
Index: b/drivers/scsi/be2iscsi/be_main.c
===================================================================
--- a/drivers/scsi/be2iscsi/be_main.c
+++ b/drivers/scsi/be2iscsi/be_main.c
@@ -3566,7 +3566,7 @@ static void be2iscsi_enable_msix(struct
 
 	/* if eqid_count == 1 fall back to INTX */
 	if (enable_msix && nvec > 1) {
-		const struct irq_affinity desc = { .post_vectors = 1 };
+		struct irq_affinity desc = { .post_vectors = 1 };
 
 		if (pci_alloc_irq_vectors_affinity(phba->pcidev, 2, nvec,
 				PCI_IRQ_MSIX | PCI_IRQ_AFFINITY, &desc) < 0) {
Index: b/include/linux/interrupt.h
===================================================================
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -252,12 +252,18 @@ struct irq_affinity_notify {
  * @nr_sets:		The number of interrupt sets for which affinity
  *			spreading is required
  * @set_size:		Array holding the size of each interrupt set
+ * @calc_sets:		Callback for calculating the number and size
+ *			of interrupt sets
+ * @priv:		Private data for usage by @calc_sets, usually a
+ *			pointer to driver/device specific data.
  */
 struct irq_affinity {
 	unsigned int	pre_vectors;
 	unsigned int	post_vectors;
 	unsigned int	nr_sets;
 	unsigned int	set_size[IRQ_AFFINITY_MAX_SETS];
+	void		(*calc_sets)(struct irq_affinity *, unsigned int nvecs);
+	void		*priv;
 };
 
 /**
@@ -317,7 +323,7 @@ extern int
 irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify);
 
 struct irq_affinity_desc *
-irq_create_affinity_masks(unsigned int nvec, const struct irq_affinity *affd);
+irq_create_affinity_masks(unsigned int nvec, struct irq_affinity *affd);
 
 unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec,
 				       const struct irq_affinity *affd);
@@ -354,7 +360,7 @@ irq_set_affinity_notifier(unsigned int i
 }
 
 static inline struct irq_affinity_desc *
-irq_create_affinity_masks(unsigned int nvec, const struct irq_affinity *affd)
+irq_create_affinity_masks(unsigned int nvec, struct irq_affinity *affd)
 {
 	return NULL;
 }
Index: b/include/linux/pci.h
===================================================================
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1393,7 +1393,7 @@ static inline int pci_enable_msix_exact(
 }
 int pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs,
 				   unsigned int max_vecs, unsigned int flags,
-				   const struct irq_affinity *affd);
+				   struct irq_affinity *affd);
 
 void pci_free_irq_vectors(struct pci_dev *dev);
 int pci_irq_vector(struct pci_dev *dev, unsigned int nr);
@@ -1419,7 +1419,7 @@ static inline int pci_enable_msix_exact(
 static inline int
 pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs,
 			       unsigned int max_vecs, unsigned int flags,
-			       const struct irq_affinity *aff_desc)
+			       struct irq_affinity *aff_desc)
 {
 	if ((flags & PCI_IRQ_LEGACY) && min_vecs == 1 && dev->irq)
 		return 1;
Index: b/kernel/irq/affinity.c
===================================================================
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -230,6 +230,12 @@ static int irq_build_affinity_masks(cons
 	return ret;
 }
 
+static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs)
+{
+	affd->nr_sets = 1;
+	affd->set_size[0] = affvecs;
+}
+
 /**
  * irq_create_affinity_masks - Create affinity masks for multiqueue spreading
  * @nvecs:	The total number of vectors
@@ -240,20 +246,46 @@ static int irq_build_affinity_masks(cons
 struct irq_affinity_desc *
 irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
 {
-	unsigned int affvecs, curvec, usedvecs, nr_sets, i;
-	unsigned int set_size[IRQ_AFFINITY_MAX_SETS];
+	unsigned int affvecs, curvec, usedvecs, i;
 	struct irq_affinity_desc *masks = NULL;
 
 	/*
-	 * If there aren't any vectors left after applying the pre/post
-	 * vectors don't bother with assigning affinity.
+	 * Determine the number of vectors which need interrupt affinities
+	 * assigned. If the pre/post request exhausts the available vectors
+	 * then nothing to do here except for invoking the calc_sets()
+	 * callback so the device driver can adjust to the situation. If there
+	 * is only a single vector, then managing the queue is pointless as
+	 * well.
 	 */
-	if (nvecs == affd->pre_vectors + affd->post_vectors)
-		return NULL;
+	if (nvecs > 1 && nvecs > affd->pre_vectors + affd->post_vectors)
+		affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
+	else
+		affvecs = 0;
+
+	/*
+	 * Simple invocations do not provide a calc_sets() callback. Install
+	 * the generic one. The check for affd->nr_sets is a temporary
+	 * workaround and will be removed after the NVME driver is converted
+	 * over.
+	 */
+	if (!affd->nr_sets && !affd->calc_sets)
+		affd->calc_sets = default_calc_sets;
+
+	/*
+	 * If the device driver provided a calc_sets() callback let it
+	 * recalculate the number of sets and their size. The check will go
+	 * away once the NVME driver is converted over.
+	 */
+	if (affd->calc_sets)
+		affd->calc_sets(affd, affvecs);
 
 	if (WARN_ON_ONCE(affd->nr_sets > IRQ_AFFINITY_MAX_SETS))
 		return NULL;
 
+	/* Nothing to assign? */
+	if (!affvecs)
+		return NULL;
+
 	masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL);
 	if (!masks)
 		return NULL;
@@ -261,21 +293,13 @@ irq_create_affinity_masks(unsigned int n
 	/* Fill out vectors at the beginning that don't need affinity */
 	for (curvec = 0; curvec < affd->pre_vectors; curvec++)
 		cpumask_copy(&masks[curvec].mask, irq_default_affinity);
+
 	/*
 	 * Spread on present CPUs starting from affd->pre_vectors. If we
 	 * have multiple sets, build each sets affinity mask separately.
 	 */
-	affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
-	nr_sets = affd->nr_sets;
-	if (!nr_sets) {
-		nr_sets = 1;
-		set_size[0] = affvecs;
-	} else {
-		memcpy(set_size, affd->set_size, nr_sets * sizeof(unsigned int));
-	}
-
-	for (i = 0, usedvecs = 0; i < nr_sets; i++) {
-		unsigned int this_vecs = set_size[i];
+	for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) {
+		unsigned int this_vecs = affd->set_size[i];
 		int ret;
 
 		ret = irq_build_affinity_masks(affd, curvec, this_vecs,
@@ -318,7 +342,9 @@ unsigned int irq_calc_affinity_vectors(u
 	if (resv > minvec)
 		return 0;
 
-	if (affd->nr_sets) {
+	if (affd->calc_sets) {
+		set_vecs = maxvec - resv;
+	} else if (affd->nr_sets) {
 		unsigned int i;
 
 		for (i = 0, set_vecs = 0;  i < affd->nr_sets; i++)



^ permalink raw reply	[flat|nested] 27+ messages in thread

* [patch v6 4/7] nvme-pci: Simplify interrupt allocation
  2019-02-16 17:13 [patch v6 0/7] genirq/affinity: Overhaul the multiple interrupt sets support Thomas Gleixner
                   ` (2 preceding siblings ...)
  2019-02-16 17:13 ` [patch v6 3/7] genirq/affinity: Add new callback for (re)calculating interrupt sets Thomas Gleixner
@ 2019-02-16 17:13 ` Thomas Gleixner
  2019-02-18 10:27   ` [tip:irq/core] " tip-bot for Ming Lei
  2019-02-16 17:13 ` [patch v6 5/7] genirq/affinity: Remove the leftovers of the original set support Thomas Gleixner
                   ` (3 subsequent siblings)
  7 siblings, 1 reply; 27+ messages in thread
From: Thomas Gleixner @ 2019-02-16 17:13 UTC (permalink / raw)
  To: LKML
  Cc: Ming Lei, Christoph Hellwig, Bjorn Helgaas, Jens Axboe,
	linux-block, Sagi Grimberg, linux-nvme, linux-pci, Keith Busch,
	Marc Zyngier, Sumit Saxena, Kashyap Desai,
	Shivasharan Srikanteshwara

From: Ming Lei <ming.lei@redhat.com>

The NVME PCI driver contains a tedious mechanism for interrupt
allocation, which is necessary to adjust the number and size of interrupt
sets to the maximum available number of interrupts which depends on the
underlying PCI capabilities and the available CPU resources.

It works around the former short comings of the PCI and core interrupt
allocation mechanims in combination with interrupt sets.

The PCI interrupt allocation function allows to provide a maximum and a
minimum number of interrupts to be allocated and tries to allocate as
many as possible. This worked without driver interaction as long as there
was only a single set of interrupts to handle.

With the addition of support for multiple interrupt sets in the generic
affinity spreading logic, which is invoked from the PCI interrupt
allocation, the adaptive loop in the PCI interrupt allocation did not
work for multiple interrupt sets. The reason is that depending on the
total number of interrupts which the PCI allocation adaptive loop tries
to allocate in each step, the number and the size of the interrupt sets
need to be adapted as well. Due to the way the interrupt sets support was
implemented there was no way for the PCI interrupt allocation code or the
core affinity spreading mechanism to invoke a driver specific function
for adapting the interrupt sets configuration.

As a consequence the driver had to implement another adaptive loop around
the PCI interrupt allocation function and calling that with maximum and
minimum interrupts set to the same value. This ensured that the
allocation either succeeded or immediately failed without any attempt to
adjust the number of interrupts in the PCI code.

The core code now allows drivers to provide a callback to recalculate the
number and the size of interrupt sets during PCI interrupt allocation,
which in turn allows the PCI interrupt allocation function to be called
in the same way as with a single set of interrupts. The PCI code handles
the adaptive loop and the interrupt affinity spreading mechanism invokes
the driver callback to adapt the interrupt set configuration to the
current loop value. This replaces the adaptive loop in the driver
completely.

Implement the NVME specific callback which adjusts the interrupt sets
configuration and remove the adaptive allocation loop.

[ tglx: Simplify the callback further and restore the dropped adjustment of
  	number of sets ]

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

---
 drivers/nvme/host/pci.c |  116 ++++++++++++++++--------------------------------
 1 file changed, 39 insertions(+), 77 deletions(-)

Index: b/drivers/nvme/host/pci.c
===================================================================
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2041,41 +2041,42 @@ static int nvme_setup_host_mem(struct nv
 	return ret;
 }
 
-/* irq_queues covers admin queue */
-static void nvme_calc_io_queues(struct nvme_dev *dev, unsigned int irq_queues)
+/*
+ * nirqs is the number of interrupts available for write and read
+ * queues. The core already reserved an interrupt for the admin queue.
+ */
+static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs)
 {
-	unsigned int this_w_queues = write_queues;
-
-	WARN_ON(!irq_queues);
-
-	/*
-	 * Setup read/write queue split, assign admin queue one independent
-	 * irq vector if irq_queues is > 1.
-	 */
-	if (irq_queues <= 2) {
-		dev->io_queues[HCTX_TYPE_DEFAULT] = 1;
-		dev->io_queues[HCTX_TYPE_READ] = 0;
-		return;
-	}
-
-	/*
-	 * If 'write_queues' is set, ensure it leaves room for at least
-	 * one read queue and one admin queue
-	 */
-	if (this_w_queues >= irq_queues)
-		this_w_queues = irq_queues - 2;
+	struct nvme_dev *dev = affd->priv;
+	unsigned int nr_read_queues;
 
 	/*
-	 * If 'write_queues' is set to zero, reads and writes will share
-	 * a queue set.
-	 */
-	if (!this_w_queues) {
-		dev->io_queues[HCTX_TYPE_DEFAULT] = irq_queues - 1;
-		dev->io_queues[HCTX_TYPE_READ] = 0;
+	 * If there is no interupt available for queues, ensure that
+	 * the default queue is set to 1. The affinity set size is
+	 * also set to one, but the irq core ignores it for this case.
+	 *
+	 * If only one interrupt is available or 'write_queue' == 0, combine
+	 * write and read queues.
+	 *
+	 * If 'write_queues' > 0, ensure it leaves room for at least one read
+	 * queue.
+	 */
+	if (!nrirqs) {
+		nrirqs = 1;
+		nr_read_queues = 0;
+	} else if (nrirqs == 1 || !write_queues) {
+		nr_read_queues = 0;
+	} else if (write_queues >= nrirqs) {
+		nr_read_queues = 1;
 	} else {
-		dev->io_queues[HCTX_TYPE_DEFAULT] = this_w_queues;
-		dev->io_queues[HCTX_TYPE_READ] = irq_queues - this_w_queues - 1;
+		nr_read_queues = nrirqs - write_queues;
 	}
+
+	dev->io_queues[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
+	affd->set_size[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
+	dev->io_queues[HCTX_TYPE_READ] = nr_read_queues;
+	affd->set_size[HCTX_TYPE_READ] = nr_read_queues;
+	affd->nr_sets = nr_read_queues ? 2 : 1;
 }
 
 static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
@@ -2083,10 +2084,9 @@ static int nvme_setup_irqs(struct nvme_d
 	struct pci_dev *pdev = to_pci_dev(dev->dev);
 	struct irq_affinity affd = {
 		.pre_vectors	= 1,
-		.nr_sets	= 2,
+		.calc_sets	= nvme_calc_irq_sets,
+		.priv		= dev,
 	};
-	unsigned int *irq_sets = affd.set_size;
-	int result = 0;
 	unsigned int irq_queues, this_p_queues;
 
 	/*
@@ -2102,51 +2102,12 @@ static int nvme_setup_irqs(struct nvme_d
 	}
 	dev->io_queues[HCTX_TYPE_POLL] = this_p_queues;
 
-	/*
-	 * For irq sets, we have to ask for minvec == maxvec. This passes
-	 * any reduction back to us, so we can adjust our queue counts and
-	 * IRQ vector needs.
-	 */
-	do {
-		nvme_calc_io_queues(dev, irq_queues);
-		irq_sets[0] = dev->io_queues[HCTX_TYPE_DEFAULT];
-		irq_sets[1] = dev->io_queues[HCTX_TYPE_READ];
-		if (!irq_sets[1])
-			affd.nr_sets = 1;
-
-		/*
-		 * If we got a failure and we're down to asking for just
-		 * 1 + 1 queues, just ask for a single vector. We'll share
-		 * that between the single IO queue and the admin queue.
-		 * Otherwise, we assign one independent vector to admin queue.
-		 */
-		if (irq_queues > 1)
-			irq_queues = irq_sets[0] + irq_sets[1] + 1;
+	/* Initialize for the single interrupt case */
+	dev->io_queues[HCTX_TYPE_DEFAULT] = 1;
+	dev->io_queues[HCTX_TYPE_READ] = 0;
 
-		result = pci_alloc_irq_vectors_affinity(pdev, irq_queues,
-				irq_queues,
-				PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
-
-		/*
-		 * Need to reduce our vec counts. If we get ENOSPC, the
-		 * platform should support mulitple vecs, we just need
-		 * to decrease our ask. If we get EINVAL, the platform
-		 * likely does not. Back down to ask for just one vector.
-		 */
-		if (result == -ENOSPC) {
-			irq_queues--;
-			if (!irq_queues)
-				return result;
-			continue;
-		} else if (result == -EINVAL) {
-			irq_queues = 1;
-			continue;
-		} else if (result <= 0)
-			return -EIO;
-		break;
-	} while (1);
-
-	return result;
+	return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues,
+			      PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
 }
 
 static void nvme_disable_io_queues(struct nvme_dev *dev)
@@ -3021,6 +2982,7 @@ static struct pci_driver nvme_driver = {
 
 static int __init nvme_init(void)
 {
+	BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2);
 	return pci_register_driver(&nvme_driver);
 }
 



^ permalink raw reply	[flat|nested] 27+ messages in thread

* [patch v6 5/7] genirq/affinity: Remove the leftovers of the original set support
  2019-02-16 17:13 [patch v6 0/7] genirq/affinity: Overhaul the multiple interrupt sets support Thomas Gleixner
                   ` (3 preceding siblings ...)
  2019-02-16 17:13 ` [patch v6 4/7] nvme-pci: Simplify interrupt allocation Thomas Gleixner
@ 2019-02-16 17:13 ` Thomas Gleixner
  2019-02-17 13:39   ` Ming Lei
  2019-02-18 10:28   ` [tip:irq/core] " tip-bot for Thomas Gleixner
  2019-02-16 17:13 ` [patch v6 6/7] PCI/MSI: Remove obsolete sanity checks for multiple interrupt sets Thomas Gleixner
                   ` (2 subsequent siblings)
  7 siblings, 2 replies; 27+ messages in thread
From: Thomas Gleixner @ 2019-02-16 17:13 UTC (permalink / raw)
  To: LKML
  Cc: Ming Lei, Christoph Hellwig, Bjorn Helgaas, Jens Axboe,
	linux-block, Sagi Grimberg, linux-nvme, linux-pci, Keith Busch,
	Marc Zyngier, Sumit Saxena, Kashyap Desai,
	Shivasharan Srikanteshwara

Now that the NVME driver is converted over to the calc_set() callback, the
workarounds of the original set support can be removed.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/affinity.c |   20 ++++----------------
 1 file changed, 4 insertions(+), 16 deletions(-)

Index: b/kernel/irq/affinity.c
===================================================================
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -264,20 +264,13 @@ irq_create_affinity_masks(unsigned int n
 
 	/*
 	 * Simple invocations do not provide a calc_sets() callback. Install
-	 * the generic one. The check for affd->nr_sets is a temporary
-	 * workaround and will be removed after the NVME driver is converted
-	 * over.
+	 * the generic one.
 	 */
-	if (!affd->nr_sets && !affd->calc_sets)
+	if (!affd->calc_sets)
 		affd->calc_sets = default_calc_sets;
 
-	/*
-	 * If the device driver provided a calc_sets() callback let it
-	 * recalculate the number of sets and their size. The check will go
-	 * away once the NVME driver is converted over.
-	 */
-	if (affd->calc_sets)
-		affd->calc_sets(affd, affvecs);
+	/* Recalculate the sets */
+	affd->calc_sets(affd, affvecs);
 
 	if (WARN_ON_ONCE(affd->nr_sets > IRQ_AFFINITY_MAX_SETS))
 		return NULL;
@@ -344,11 +337,6 @@ unsigned int irq_calc_affinity_vectors(u
 
 	if (affd->calc_sets) {
 		set_vecs = maxvec - resv;
-	} else if (affd->nr_sets) {
-		unsigned int i;
-
-		for (i = 0, set_vecs = 0;  i < affd->nr_sets; i++)
-			set_vecs += affd->set_size[i];
 	} else {
 		get_online_cpus();
 		set_vecs = cpumask_weight(cpu_possible_mask);



^ permalink raw reply	[flat|nested] 27+ messages in thread

* [patch v6 6/7] PCI/MSI: Remove obsolete sanity checks for multiple interrupt sets
  2019-02-16 17:13 [patch v6 0/7] genirq/affinity: Overhaul the multiple interrupt sets support Thomas Gleixner
                   ` (4 preceding siblings ...)
  2019-02-16 17:13 ` [patch v6 5/7] genirq/affinity: Remove the leftovers of the original set support Thomas Gleixner
@ 2019-02-16 17:13 ` Thomas Gleixner
  2019-02-17 13:39   ` Ming Lei
  2019-02-18 10:29   ` [tip:irq/core] " tip-bot for Thomas Gleixner
  2019-02-16 17:13 ` [patch v6 7/7] genirq/affinity: Add support for non-managed affinity sets Thomas Gleixner
  2019-02-18  8:43 ` [patch v6 0/7] genirq/affinity: Overhaul the multiple interrupt sets support Marc Zyngier
  7 siblings, 2 replies; 27+ messages in thread
From: Thomas Gleixner @ 2019-02-16 17:13 UTC (permalink / raw)
  To: LKML
  Cc: Ming Lei, Christoph Hellwig, Bjorn Helgaas, Jens Axboe,
	linux-block, Sagi Grimberg, linux-nvme, linux-pci, Keith Busch,
	Marc Zyngier, Sumit Saxena, Kashyap Desai,
	Shivasharan Srikanteshwara

Multiple interrupt sets for affinity spreading are now handled in the core
code and the number of sets and their size is recalculated via a driver
supplied callback.

That avoids the requirement to invoke pci_alloc_irq_vectors_affinity() with
the arguments minvecs and maxvecs set to the same value and the callsite
handling the ENOSPC situation.

Remove the now obsolete sanity checks and the related comments.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

---
 drivers/pci/msi.c |   14 --------------
 1 file changed, 14 deletions(-)

--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -1035,13 +1035,6 @@ static int __pci_enable_msi_range(struct
 	if (maxvec < minvec)
 		return -ERANGE;
 
-	/*
-	 * If the caller is passing in sets, we can't support a range of
-	 * vectors. The caller needs to handle that.
-	 */
-	if (affd && affd->nr_sets && minvec != maxvec)
-		return -EINVAL;
-
 	if (WARN_ON_ONCE(dev->msi_enabled))
 		return -EINVAL;
 
@@ -1093,13 +1086,6 @@ static int __pci_enable_msix_range(struc
 	if (maxvec < minvec)
 		return -ERANGE;
 
-	/*
-	 * If the caller is passing in sets, we can't support a range of
-	 * supported vectors. The caller needs to handle that.
-	 */
-	if (affd && affd->nr_sets && minvec != maxvec)
-		return -EINVAL;
-
 	if (WARN_ON_ONCE(dev->msix_enabled))
 		return -EINVAL;
 



^ permalink raw reply	[flat|nested] 27+ messages in thread

* [patch v6 7/7] genirq/affinity: Add support for non-managed affinity sets
  2019-02-16 17:13 [patch v6 0/7] genirq/affinity: Overhaul the multiple interrupt sets support Thomas Gleixner
                   ` (5 preceding siblings ...)
  2019-02-16 17:13 ` [patch v6 6/7] PCI/MSI: Remove obsolete sanity checks for multiple interrupt sets Thomas Gleixner
@ 2019-02-16 17:13 ` Thomas Gleixner
  2019-02-17 13:45   ` Ming Lei
  2019-02-18  8:43 ` [patch v6 0/7] genirq/affinity: Overhaul the multiple interrupt sets support Marc Zyngier
  7 siblings, 1 reply; 27+ messages in thread
From: Thomas Gleixner @ 2019-02-16 17:13 UTC (permalink / raw)
  To: LKML
  Cc: Ming Lei, Christoph Hellwig, Bjorn Helgaas, Jens Axboe,
	linux-block, Sagi Grimberg, linux-nvme, linux-pci, Keith Busch,
	Marc Zyngier, Sumit Saxena, Kashyap Desai,
	Shivasharan Srikanteshwara

Some drivers need an extra set of interrupts which should not be marked
managed, but should get initial interrupt spreading.

Add a bitmap to struct irq_affinity which allows the driver to mark a
particular set of interrupts as non managed. Check the bitmap during
spreading and use the result to mark the interrupts in the sets
accordingly.

The unmanaged interrupts get initial spreading, but user space can change
their affinity later on. For the managed sets, i.e. the corresponding bit
in the mask is not set, there is no change in behaviour.

Usage example:

	struct irq_affinity affd = {
		.pre_vectors	= 2,
		.unmanaged_sets	= 0x02,
		.calc_sets	= drv_calc_sets,
	};
	....

For both interrupt sets the interrupts are properly spread out, but the
second set is not marked managed.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/interrupt.h |    2 ++
 kernel/irq/affinity.c     |   16 +++++++++++-----
 2 files changed, 13 insertions(+), 5 deletions(-)

Index: b/include/linux/interrupt.h
===================================================================
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -251,6 +251,7 @@ struct irq_affinity_notify {
  *			the MSI(-X) vector space
  * @nr_sets:		The number of interrupt sets for which affinity
  *			spreading is required
+ * @unmanaged_sets:	Bitmap to mark entries in the @set_size array unmanaged
  * @set_size:		Array holding the size of each interrupt set
  * @calc_sets:		Callback for calculating the number and size
  *			of interrupt sets
@@ -261,6 +262,7 @@ struct irq_affinity {
 	unsigned int	pre_vectors;
 	unsigned int	post_vectors;
 	unsigned int	nr_sets;
+	unsigned int	unmanaged_sets;
 	unsigned int	set_size[IRQ_AFFINITY_MAX_SETS];
 	void		(*calc_sets)(struct irq_affinity *, unsigned int nvecs);
 	void		*priv;
Index: b/kernel/irq/affinity.c
===================================================================
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -249,6 +249,8 @@ irq_create_affinity_masks(unsigned int n
 	unsigned int affvecs, curvec, usedvecs, i;
 	struct irq_affinity_desc *masks = NULL;
 
+	BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS > sizeof(affd->unmanaged_sets) * 8);
+
 	/*
 	 * Determine the number of vectors which need interrupt affinities
 	 * assigned. If the pre/post request exhausts the available vectors
@@ -292,7 +294,8 @@ irq_create_affinity_masks(unsigned int n
 	 * have multiple sets, build each sets affinity mask separately.
 	 */
 	for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) {
-		unsigned int this_vecs = affd->set_size[i];
+		bool managed = affd->unmanaged_sets & (1U << i) ? true : false;
+		unsigned int idx, this_vecs = affd->set_size[i];
 		int ret;
 
 		ret = irq_build_affinity_masks(affd, curvec, this_vecs,
@@ -301,8 +304,15 @@ irq_create_affinity_masks(unsigned int n
 			kfree(masks);
 			return NULL;
 		}
+
+		idx = curvec;
 		curvec += this_vecs;
 		usedvecs += this_vecs;
+		if (managed) {
+			/* Mark the managed interrupts */
+			for (; idx < curvec; idx++)
+				masks[idx].is_managed = 1;
+		}
 	}
 
 	/* Fill out vectors at the end that don't need affinity */
@@ -313,10 +323,6 @@ irq_create_affinity_masks(unsigned int n
 	for (; curvec < nvecs; curvec++)
 		cpumask_copy(&masks[curvec].mask, irq_default_affinity);
 
-	/* Mark the managed interrupts */
-	for (i = affd->pre_vectors; i < nvecs - affd->post_vectors; i++)
-		masks[i].is_managed = 1;
-
 	return masks;
 }
 



^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [patch v6 1/7] genirq/affinity: Code consolidation
  2019-02-16 17:13 ` [patch v6 1/7] genirq/affinity: Code consolidation Thomas Gleixner
@ 2019-02-17 13:36   ` Ming Lei
  2019-02-18 10:25   ` [tip:irq/core] " tip-bot for Thomas Gleixner
  1 sibling, 0 replies; 27+ messages in thread
From: Ming Lei @ 2019-02-17 13:36 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: LKML, Christoph Hellwig, Bjorn Helgaas, Jens Axboe, linux-block,
	Sagi Grimberg, linux-nvme, linux-pci, Keith Busch, Marc Zyngier,
	Sumit Saxena, Kashyap Desai, Shivasharan Srikanteshwara

On Sat, Feb 16, 2019 at 06:13:07PM +0100, Thomas Gleixner wrote:
> All information and calculations in the interrupt affinity spreading code
> is strictly unsigned int. Though the code uses int all over the place.
> 
> Convert it over to unsigned int.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> ---
>  include/linux/interrupt.h |   20 +++++++++-------
>  kernel/irq/affinity.c     |   56 ++++++++++++++++++++++------------------------
>  2 files changed, 38 insertions(+), 38 deletions(-)
> 
> --- a/include/linux/interrupt.h
> +++ b/include/linux/interrupt.h
> @@ -251,10 +251,10 @@ struct irq_affinity_notify {
>   * @sets:		Number of affinitized sets
>   */
>  struct irq_affinity {
> -	int	pre_vectors;
> -	int	post_vectors;
> -	int	nr_sets;
> -	int	*sets;
> +	unsigned int	pre_vectors;
> +	unsigned int	post_vectors;
> +	unsigned int	nr_sets;
> +	unsigned int	*sets;
>  };
>  
>  /**
> @@ -314,9 +314,10 @@ extern int
>  irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify);
>  
>  struct irq_affinity_desc *
> -irq_create_affinity_masks(int nvec, const struct irq_affinity *affd);
> +irq_create_affinity_masks(unsigned int nvec, const struct irq_affinity *affd);
>  
> -int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity *affd);
> +unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec,
> +				       const struct irq_affinity *affd);
>  
>  #else /* CONFIG_SMP */
>  
> @@ -350,13 +351,14 @@ irq_set_affinity_notifier(unsigned int i
>  }
>  
>  static inline struct irq_affinity_desc *
> -irq_create_affinity_masks(int nvec, const struct irq_affinity *affd)
> +irq_create_affinity_masks(unsigned int nvec, const struct irq_affinity *affd)
>  {
>  	return NULL;
>  }
>  
> -static inline int
> -irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity *affd)
> +static inline unsigned int
> +irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec,
> +			  const struct irq_affinity *affd)
>  {
>  	return maxvec;
>  }
> --- a/kernel/irq/affinity.c
> +++ b/kernel/irq/affinity.c
> @@ -9,7 +9,7 @@
>  #include <linux/cpu.h>
>  
>  static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
> -				int cpus_per_vec)
> +				unsigned int cpus_per_vec)
>  {
>  	const struct cpumask *siblmsk;
>  	int cpu, sibl;
> @@ -95,15 +95,17 @@ static int get_nodes_in_cpumask(cpumask_
>  }
>  
>  static int __irq_build_affinity_masks(const struct irq_affinity *affd,
> -				      int startvec, int numvecs, int firstvec,
> +				      unsigned int startvec,
> +				      unsigned int numvecs,
> +				      unsigned int firstvec,
>  				      cpumask_var_t *node_to_cpumask,
>  				      const struct cpumask *cpu_mask,
>  				      struct cpumask *nmsk,
>  				      struct irq_affinity_desc *masks)
>  {
> -	int n, nodes, cpus_per_vec, extra_vecs, done = 0;
> -	int last_affv = firstvec + numvecs;
> -	int curvec = startvec;
> +	unsigned int n, nodes, cpus_per_vec, extra_vecs, done = 0;
> +	unsigned int last_affv = firstvec + numvecs;
> +	unsigned int curvec = startvec;
>  	nodemask_t nodemsk = NODE_MASK_NONE;
>  
>  	if (!cpumask_weight(cpu_mask))
> @@ -117,18 +119,16 @@ static int __irq_build_affinity_masks(co
>  	 */
>  	if (numvecs <= nodes) {
>  		for_each_node_mask(n, nodemsk) {
> -			cpumask_or(&masks[curvec].mask,
> -					&masks[curvec].mask,
> -					node_to_cpumask[n]);
> +			cpumask_or(&masks[curvec].mask, &masks[curvec].mask,
> +				   node_to_cpumask[n]);
>  			if (++curvec == last_affv)
>  				curvec = firstvec;
>  		}
> -		done = numvecs;
> -		goto out;
> +		return numvecs;
>  	}
>  
>  	for_each_node_mask(n, nodemsk) {
> -		int ncpus, v, vecs_to_assign, vecs_per_node;
> +		unsigned int ncpus, v, vecs_to_assign, vecs_per_node;
>  
>  		/* Spread the vectors per node */
>  		vecs_per_node = (numvecs - (curvec - firstvec)) / nodes;
> @@ -163,8 +163,6 @@ static int __irq_build_affinity_masks(co
>  			curvec = firstvec;
>  		--nodes;
>  	}
> -
> -out:
>  	return done;
>  }
>  
> @@ -174,13 +172,14 @@ static int __irq_build_affinity_masks(co
>   *	2) spread other possible CPUs on these vectors
>   */
>  static int irq_build_affinity_masks(const struct irq_affinity *affd,
> -				    int startvec, int numvecs, int firstvec,
> +				    unsigned int startvec, unsigned int numvecs,
> +				    unsigned int firstvec,
>  				    struct irq_affinity_desc *masks)
>  {
> -	int curvec = startvec, nr_present, nr_others;
> -	int ret = -ENOMEM;
> -	cpumask_var_t nmsk, npresmsk;
> +	unsigned int curvec = startvec, nr_present, nr_others;
>  	cpumask_var_t *node_to_cpumask;
> +	cpumask_var_t nmsk, npresmsk;
> +	int ret = -ENOMEM;
>  
>  	if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
>  		return ret;
> @@ -239,12 +238,10 @@ static int irq_build_affinity_masks(cons
>   * Returns the irq_affinity_desc pointer or NULL if allocation failed.
>   */
>  struct irq_affinity_desc *
> -irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
> +irq_create_affinity_masks(unsigned int nvecs, const struct irq_affinity *affd)
>  {
> -	int affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
> -	int curvec, usedvecs;
> +	unsigned int affvecs, curvec, usedvecs, nr_sets, i;
>  	struct irq_affinity_desc *masks = NULL;
> -	int i, nr_sets;
>  
>  	/*
>  	 * If there aren't any vectors left after applying the pre/post
> @@ -264,16 +261,17 @@ irq_create_affinity_masks(int nvecs, con
>  	 * Spread on present CPUs starting from affd->pre_vectors. If we
>  	 * have multiple sets, build each sets affinity mask separately.
>  	 */
> +	affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
>  	nr_sets = affd->nr_sets;
>  	if (!nr_sets)
>  		nr_sets = 1;
>  
>  	for (i = 0, usedvecs = 0; i < nr_sets; i++) {
> -		int this_vecs = affd->sets ? affd->sets[i] : affvecs;
> +		unsigned int this_vecs = affd->sets ? affd->sets[i] : affvecs;
>  		int ret;
>  
>  		ret = irq_build_affinity_masks(affd, curvec, this_vecs,
> -						curvec, masks);
> +					       curvec, masks);
>  		if (ret) {
>  			kfree(masks);
>  			return NULL;
> @@ -303,17 +301,17 @@ irq_create_affinity_masks(int nvecs, con
>   * @maxvec:	The maximum number of vectors available
>   * @affd:	Description of the affinity requirements
>   */
> -int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity *affd)
> +unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec,
> +				       const struct irq_affinity *affd)
>  {
> -	int resv = affd->pre_vectors + affd->post_vectors;
> -	int vecs = maxvec - resv;
> -	int set_vecs;
> +	unsigned int resv = affd->pre_vectors + affd->post_vectors;
> +	unsigned int set_vecs;
>  
>  	if (resv > minvec)
>  		return 0;
>  
>  	if (affd->nr_sets) {
> -		int i;
> +		unsigned int i;
>  
>  		for (i = 0, set_vecs = 0;  i < affd->nr_sets; i++)
>  			set_vecs += affd->sets[i];
> @@ -323,5 +321,5 @@ int irq_calc_affinity_vectors(int minvec
>  		put_online_cpus();
>  	}
>  
> -	return resv + min(set_vecs, vecs);
> +	return resv + min(set_vecs, maxvec - resv);
>  }

Reviewed-by: Ming Lei <ming.lei@redhat.com>


Thanks,
Ming

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [patch v6 5/7] genirq/affinity: Remove the leftovers of the original set support
  2019-02-16 17:13 ` [patch v6 5/7] genirq/affinity: Remove the leftovers of the original set support Thomas Gleixner
@ 2019-02-17 13:39   ` Ming Lei
  2019-02-18 10:28   ` [tip:irq/core] " tip-bot for Thomas Gleixner
  1 sibling, 0 replies; 27+ messages in thread
From: Ming Lei @ 2019-02-17 13:39 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: LKML, Christoph Hellwig, Bjorn Helgaas, Jens Axboe, linux-block,
	Sagi Grimberg, linux-nvme, linux-pci, Keith Busch, Marc Zyngier,
	Sumit Saxena, Kashyap Desai, Shivasharan Srikanteshwara

On Sat, Feb 16, 2019 at 06:13:11PM +0100, Thomas Gleixner wrote:
> Now that the NVME driver is converted over to the calc_set() callback, the
> workarounds of the original set support can be removed.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> ---
>  kernel/irq/affinity.c |   20 ++++----------------
>  1 file changed, 4 insertions(+), 16 deletions(-)
> 
> Index: b/kernel/irq/affinity.c
> ===================================================================
> --- a/kernel/irq/affinity.c
> +++ b/kernel/irq/affinity.c
> @@ -264,20 +264,13 @@ irq_create_affinity_masks(unsigned int n
>  
>  	/*
>  	 * Simple invocations do not provide a calc_sets() callback. Install
> -	 * the generic one. The check for affd->nr_sets is a temporary
> -	 * workaround and will be removed after the NVME driver is converted
> -	 * over.
> +	 * the generic one.
>  	 */
> -	if (!affd->nr_sets && !affd->calc_sets)
> +	if (!affd->calc_sets)
>  		affd->calc_sets = default_calc_sets;
>  
> -	/*
> -	 * If the device driver provided a calc_sets() callback let it
> -	 * recalculate the number of sets and their size. The check will go
> -	 * away once the NVME driver is converted over.
> -	 */
> -	if (affd->calc_sets)
> -		affd->calc_sets(affd, affvecs);
> +	/* Recalculate the sets */
> +	affd->calc_sets(affd, affvecs);
>  
>  	if (WARN_ON_ONCE(affd->nr_sets > IRQ_AFFINITY_MAX_SETS))
>  		return NULL;
> @@ -344,11 +337,6 @@ unsigned int irq_calc_affinity_vectors(u
>  
>  	if (affd->calc_sets) {
>  		set_vecs = maxvec - resv;
> -	} else if (affd->nr_sets) {
> -		unsigned int i;
> -
> -		for (i = 0, set_vecs = 0;  i < affd->nr_sets; i++)
> -			set_vecs += affd->set_size[i];
>  	} else {
>  		get_online_cpus();
>  		set_vecs = cpumask_weight(cpu_possible_mask);
> 
> 

Reviewed-by: Ming Lei <ming.lei@redhat.com>

Thanks,
Ming

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [patch v6 6/7] PCI/MSI: Remove obsolete sanity checks for multiple interrupt sets
  2019-02-16 17:13 ` [patch v6 6/7] PCI/MSI: Remove obsolete sanity checks for multiple interrupt sets Thomas Gleixner
@ 2019-02-17 13:39   ` Ming Lei
  2019-02-18 10:29   ` [tip:irq/core] " tip-bot for Thomas Gleixner
  1 sibling, 0 replies; 27+ messages in thread
From: Ming Lei @ 2019-02-17 13:39 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: LKML, Christoph Hellwig, Bjorn Helgaas, Jens Axboe, linux-block,
	Sagi Grimberg, linux-nvme, linux-pci, Keith Busch, Marc Zyngier,
	Sumit Saxena, Kashyap Desai, Shivasharan Srikanteshwara

On Sat, Feb 16, 2019 at 06:13:12PM +0100, Thomas Gleixner wrote:
> Multiple interrupt sets for affinity spreading are now handled in the core
> code and the number of sets and their size is recalculated via a driver
> supplied callback.
> 
> That avoids the requirement to invoke pci_alloc_irq_vectors_affinity() with
> the arguments minvecs and maxvecs set to the same value and the callsite
> handling the ENOSPC situation.
> 
> Remove the now obsolete sanity checks and the related comments.
> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> 
> ---
>  drivers/pci/msi.c |   14 --------------
>  1 file changed, 14 deletions(-)
> 
> --- a/drivers/pci/msi.c
> +++ b/drivers/pci/msi.c
> @@ -1035,13 +1035,6 @@ static int __pci_enable_msi_range(struct
>  	if (maxvec < minvec)
>  		return -ERANGE;
>  
> -	/*
> -	 * If the caller is passing in sets, we can't support a range of
> -	 * vectors. The caller needs to handle that.
> -	 */
> -	if (affd && affd->nr_sets && minvec != maxvec)
> -		return -EINVAL;
> -
>  	if (WARN_ON_ONCE(dev->msi_enabled))
>  		return -EINVAL;
>  
> @@ -1093,13 +1086,6 @@ static int __pci_enable_msix_range(struc
>  	if (maxvec < minvec)
>  		return -ERANGE;
>  
> -	/*
> -	 * If the caller is passing in sets, we can't support a range of
> -	 * supported vectors. The caller needs to handle that.
> -	 */
> -	if (affd && affd->nr_sets && minvec != maxvec)
> -		return -EINVAL;
> -
>  	if (WARN_ON_ONCE(dev->msix_enabled))
>  		return -EINVAL;
>  
> 
> 

Reviewed-by: Ming Lei <ming.lei@redhat.com>

Thanks,
Ming

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [patch v6 7/7] genirq/affinity: Add support for non-managed affinity sets
  2019-02-16 17:13 ` [patch v6 7/7] genirq/affinity: Add support for non-managed affinity sets Thomas Gleixner
@ 2019-02-17 13:45   ` Ming Lei
  2019-02-17 19:17     ` Thomas Gleixner
  0 siblings, 1 reply; 27+ messages in thread
From: Ming Lei @ 2019-02-17 13:45 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: LKML, Christoph Hellwig, Bjorn Helgaas, Jens Axboe, linux-block,
	Sagi Grimberg, linux-nvme, linux-pci, Keith Busch, Marc Zyngier,
	Sumit Saxena, Kashyap Desai, Shivasharan Srikanteshwara

Hi Thomas,

On Sat, Feb 16, 2019 at 06:13:13PM +0100, Thomas Gleixner wrote:
> Some drivers need an extra set of interrupts which should not be marked
> managed, but should get initial interrupt spreading.

Could you share the drivers and their use case?

> 
> Add a bitmap to struct irq_affinity which allows the driver to mark a
> particular set of interrupts as non managed. Check the bitmap during
> spreading and use the result to mark the interrupts in the sets
> accordingly.
> 
> The unmanaged interrupts get initial spreading, but user space can change
> their affinity later on. For the managed sets, i.e. the corresponding bit
> in the mask is not set, there is no change in behaviour.
> 
> Usage example:
> 
> 	struct irq_affinity affd = {
> 		.pre_vectors	= 2,
> 		.unmanaged_sets	= 0x02,
> 		.calc_sets	= drv_calc_sets,
> 	};
> 	....
> 
> For both interrupt sets the interrupts are properly spread out, but the
> second set is not marked managed.

Given drivers only care the managed vs non-managed interrupt numbers,
just wondering why this case can't be covered by .pre_vectors &
.post_vectors?

Also this kind of usage may break blk-mq easily, in which the following
rule needs to be respected:

1) all CPUs are required to spread among each interrupt set

2) no any CPU is shared between two IRQs in same set.

> 
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> ---
>  include/linux/interrupt.h |    2 ++
>  kernel/irq/affinity.c     |   16 +++++++++++-----
>  2 files changed, 13 insertions(+), 5 deletions(-)
> 
> Index: b/include/linux/interrupt.h
> ===================================================================
> --- a/include/linux/interrupt.h
> +++ b/include/linux/interrupt.h
> @@ -251,6 +251,7 @@ struct irq_affinity_notify {
>   *			the MSI(-X) vector space
>   * @nr_sets:		The number of interrupt sets for which affinity
>   *			spreading is required
> + * @unmanaged_sets:	Bitmap to mark entries in the @set_size array unmanaged
>   * @set_size:		Array holding the size of each interrupt set
>   * @calc_sets:		Callback for calculating the number and size
>   *			of interrupt sets
> @@ -261,6 +262,7 @@ struct irq_affinity {
>  	unsigned int	pre_vectors;
>  	unsigned int	post_vectors;
>  	unsigned int	nr_sets;
> +	unsigned int	unmanaged_sets;
>  	unsigned int	set_size[IRQ_AFFINITY_MAX_SETS];
>  	void		(*calc_sets)(struct irq_affinity *, unsigned int nvecs);
>  	void		*priv;
> Index: b/kernel/irq/affinity.c
> ===================================================================
> --- a/kernel/irq/affinity.c
> +++ b/kernel/irq/affinity.c
> @@ -249,6 +249,8 @@ irq_create_affinity_masks(unsigned int n
>  	unsigned int affvecs, curvec, usedvecs, i;
>  	struct irq_affinity_desc *masks = NULL;
>  
> +	BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS > sizeof(affd->unmanaged_sets) * 8);
> +
>  	/*
>  	 * Determine the number of vectors which need interrupt affinities
>  	 * assigned. If the pre/post request exhausts the available vectors
> @@ -292,7 +294,8 @@ irq_create_affinity_masks(unsigned int n
>  	 * have multiple sets, build each sets affinity mask separately.
>  	 */
>  	for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) {
> -		unsigned int this_vecs = affd->set_size[i];
> +		bool managed = affd->unmanaged_sets & (1U << i) ? true : false;

The above check is inverted.

Thanks,
Ming

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [patch v6 7/7] genirq/affinity: Add support for non-managed affinity sets
  2019-02-17 13:45   ` Ming Lei
@ 2019-02-17 19:17     ` Thomas Gleixner
  2019-02-18  2:49       ` Ming Lei
  0 siblings, 1 reply; 27+ messages in thread
From: Thomas Gleixner @ 2019-02-17 19:17 UTC (permalink / raw)
  To: Ming Lei
  Cc: LKML, Christoph Hellwig, Bjorn Helgaas, Jens Axboe, linux-block,
	Sagi Grimberg, linux-nvme, linux-pci, Keith Busch, Marc Zyngier,
	Sumit Saxena, Kashyap Desai, Shivasharan Srikanteshwara

On Sun, 17 Feb 2019, Ming Lei wrote:
> On Sat, Feb 16, 2019 at 06:13:13PM +0100, Thomas Gleixner wrote:
> > Some drivers need an extra set of interrupts which should not be marked
> > managed, but should get initial interrupt spreading.
> 
> Could you share the drivers and their use case?

You were Cc'ed on that old discussion:

 https://lkml.kernel.org/r/300d6fef733ca76ced581f8c6304bac6@mail.gmail.com

> > For both interrupt sets the interrupts are properly spread out, but the
> > second set is not marked managed.
> 
> Given drivers only care the managed vs non-managed interrupt numbers,
> just wondering why this case can't be covered by .pre_vectors &
> .post_vectors?

Well, yes, but post/pre are not subject to spreading and I really don't
want to go there.

> Also this kind of usage may break blk-mq easily, in which the following
> rule needs to be respected:
> 
> 1) all CPUs are required to spread among each interrupt set
> 
> 2) no any CPU is shared between two IRQs in same set.

I don't see how that would break blk-mq. The unmanaged set is not used by
the blk-mq stuff, that's some driver internal voodoo. So blk-mq still gets
a perfectly spread and managed interrupt set for the queues.

> >  	for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) {
> > -		unsigned int this_vecs = affd->set_size[i];
> > +		bool managed = affd->unmanaged_sets & (1U << i) ? true : false;
> 
> The above check is inverted.

Doh. Stupid me.

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [patch v6 7/7] genirq/affinity: Add support for non-managed affinity sets
  2019-02-17 19:17     ` Thomas Gleixner
@ 2019-02-18  2:49       ` Ming Lei
  2019-02-18  7:25         ` Thomas Gleixner
  0 siblings, 1 reply; 27+ messages in thread
From: Ming Lei @ 2019-02-18  2:49 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: LKML, Christoph Hellwig, Bjorn Helgaas, Jens Axboe, linux-block,
	Sagi Grimberg, linux-nvme, linux-pci, Keith Busch, Marc Zyngier,
	Sumit Saxena, Kashyap Desai, Shivasharan Srikanteshwara

Hi Thomas,

On Sun, Feb 17, 2019 at 08:17:05PM +0100, Thomas Gleixner wrote:
> On Sun, 17 Feb 2019, Ming Lei wrote:
> > On Sat, Feb 16, 2019 at 06:13:13PM +0100, Thomas Gleixner wrote:
> > > Some drivers need an extra set of interrupts which should not be marked
> > > managed, but should get initial interrupt spreading.
> > 
> > Could you share the drivers and their use case?
> 
> You were Cc'ed on that old discussion:
> 
>  https://lkml.kernel.org/r/300d6fef733ca76ced581f8c6304bac6@mail.gmail.com

Thanks for providing the link.

> 
> > > For both interrupt sets the interrupts are properly spread out, but the
> > > second set is not marked managed.
> > 
> > Given drivers only care the managed vs non-managed interrupt numbers,
> > just wondering why this case can't be covered by .pre_vectors &
> > .post_vectors?
> 
> Well, yes, but post/pre are not subject to spreading and I really don't
> want to go there.
> 
> > Also this kind of usage may break blk-mq easily, in which the following
> > rule needs to be respected:
> > 
> > 1) all CPUs are required to spread among each interrupt set
> > 
> > 2) no any CPU is shared between two IRQs in same set.
> 
> I don't see how that would break blk-mq. The unmanaged set is not used by
> the blk-mq stuff, that's some driver internal voodoo. So blk-mq still gets
> a perfectly spread and managed interrupt set for the queues.

From the discussion above, the use case is for megaraid_sas. And one of the
two interrupt sets(managed and non-managed) will be chosen according to
workloads runtime.

Each interrupt set actually defines one blk-mq queue mapping, and the
queue mapping needs to respect the rule I mentioned now. However,
non-managed affinity can be changed to any way anytime by user-space.

Recently HPSA tried to add one module parameter to use non-managed
IRQ[1].

Also NVMe RDMA uses non-managed interrupts, and at least one CPU hotplug
issue is never fixed yet[2]. 

[1] https://marc.info/?t=154387665200001&r=1&w=2
[2] https://www.spinics.net/lists/linux-block/msg24140.html


thanks,
Ming

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [patch v6 7/7] genirq/affinity: Add support for non-managed affinity sets
  2019-02-18  2:49       ` Ming Lei
@ 2019-02-18  7:25         ` Thomas Gleixner
  0 siblings, 0 replies; 27+ messages in thread
From: Thomas Gleixner @ 2019-02-18  7:25 UTC (permalink / raw)
  To: Ming Lei
  Cc: LKML, Christoph Hellwig, Bjorn Helgaas, Jens Axboe, linux-block,
	Sagi Grimberg, linux-nvme, linux-pci, Keith Busch, Marc Zyngier,
	Sumit Saxena, Kashyap Desai, Shivasharan Srikanteshwara

Ming,

On Mon, 18 Feb 2019, Ming Lei wrote:
> On Sun, Feb 17, 2019 at 08:17:05PM +0100, Thomas Gleixner wrote:
> > I don't see how that would break blk-mq. The unmanaged set is not used by
> > the blk-mq stuff, that's some driver internal voodoo. So blk-mq still gets
> > a perfectly spread and managed interrupt set for the queues.
> 
> >From the discussion above, the use case is for megaraid_sas. And one of the
> two interrupt sets(managed and non-managed) will be chosen according to
> workloads runtime.
> 
> Each interrupt set actually defines one blk-mq queue mapping, and the
> queue mapping needs to respect the rule I mentioned now. However,
> non-managed affinity can be changed to any way anytime by user-space.
> 
> Recently HPSA tried to add one module parameter to use non-managed
> IRQ[1].
> 
> Also NVMe RDMA uses non-managed interrupts, and at least one CPU hotplug
> issue is never fixed yet[2]. 
> 
> [1] https://marc.info/?t=154387665200001&r=1&w=2
> [2] https://www.spinics.net/lists/linux-block/msg24140.html

Interesting. I misread the description which megasas folks provided
then. I'll drop that patch and get the other lot merged. Thanks for your
work and help with that!

Thanks,

	tglx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [patch v6 0/7] genirq/affinity: Overhaul the multiple interrupt sets support
  2019-02-16 17:13 [patch v6 0/7] genirq/affinity: Overhaul the multiple interrupt sets support Thomas Gleixner
                   ` (6 preceding siblings ...)
  2019-02-16 17:13 ` [patch v6 7/7] genirq/affinity: Add support for non-managed affinity sets Thomas Gleixner
@ 2019-02-18  8:43 ` Marc Zyngier
  7 siblings, 0 replies; 27+ messages in thread
From: Marc Zyngier @ 2019-02-18  8:43 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: LKML, Ming Lei, Christoph Hellwig, Bjorn Helgaas, Jens Axboe,
	linux-block, Sagi Grimberg, linux-nvme, linux-pci, Keith Busch,
	Sumit Saxena, Kashyap Desai, Shivasharan Srikanteshwara

On Sat, 16 Feb 2019 18:13:06 +0100
Thomas Gleixner <tglx@linutronix.de> wrote:

> This is the final update to the series with a few corner cases fixes
> vs. V5 which can be found here:
> 
>    https://lkml.kernel.org/r/20190214204755.819014197@linutronix.de
> 
> The series applies against:
> 
>    git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git master
> 
> and is also available from:
> 
>    git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git WIP.irq
> 
> Changes vs. V5:
> 
>   - Change the invocation for the driver callback so it is invoked even
>     when there are no interrupts left to spread out. That ensures that the
>     driver can adjust to the situation (in case of NVME a single interrupt)
> 
>   - Make sure the callback is invoked in the legacy irq fallback case so
>     the driver is not in a stale state from a failed MSI[X} allocation
>     attempt.
> 
>   - Fix the adjustment logic in the NVME driver as pointed out by Ming and
>     Marc, plus another corner case I found during testing.
> 
>   - Simplify the unmanaged set support
> 
> Thanks,
> 
> 	tglx
> 
> 8<-------------
>  drivers/nvme/host/pci.c         |  117 +++++++++++++---------------------------
>  drivers/pci/msi.c               |   39 +++++--------
>  drivers/scsi/be2iscsi/be_main.c |    2 
>  include/linux/interrupt.h       |   35 ++++++++---
>  include/linux/pci.h             |    4 -
>  kernel/irq/affinity.c           |  116 ++++++++++++++++++++++++---------------
>  6 files changed, 153 insertions(+), 160 deletions(-)
> 
> 

For the series (with or without patch 7), feel free to add my

Acked-by: Marc Zyngier <marc.zyngier@arm.com>

Thanks,

	M.
-- 
Without deviation from the norm, progress is not possible.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [tip:irq/core] genirq/affinity: Code consolidation
  2019-02-16 17:13 ` [patch v6 1/7] genirq/affinity: Code consolidation Thomas Gleixner
  2019-02-17 13:36   ` Ming Lei
@ 2019-02-18 10:25   ` tip-bot for Thomas Gleixner
  1 sibling, 0 replies; 27+ messages in thread
From: tip-bot for Thomas Gleixner @ 2019-02-18 10:25 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: shivasharan.srikanteshwara, axboe, sagi, hch, linux-kernel,
	mingo, marc.zyngier, keith.busch, sumit.saxena, hpa,
	kashyap.desai, tglx, helgaas, ming.lei

Commit-ID:  0145c30e896d26e638d27c957d9eed72893c1c92
Gitweb:     https://git.kernel.org/tip/0145c30e896d26e638d27c957d9eed72893c1c92
Author:     Thomas Gleixner <tglx@linutronix.de>
AuthorDate: Sat, 16 Feb 2019 18:13:07 +0100
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Mon, 18 Feb 2019 11:21:27 +0100

genirq/affinity: Code consolidation

All information and calculations in the interrupt affinity spreading code
is strictly unsigned int. Though the code uses int all over the place.

Convert it over to unsigned int.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Bjorn Helgaas <helgaas@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: linux-block@vger.kernel.org
Cc: Sagi Grimberg <sagi@grimberg.me>
Cc: linux-nvme@lists.infradead.org
Cc: linux-pci@vger.kernel.org
Cc: Keith Busch <keith.busch@intel.com>
Cc: Sumit Saxena <sumit.saxena@broadcom.com>
Cc: Kashyap Desai <kashyap.desai@broadcom.com>
Cc: Shivasharan Srikanteshwara <shivasharan.srikanteshwara@broadcom.com>
Link: https://lkml.kernel.org/r/20190216172228.336424556@linutronix.de

---
 include/linux/interrupt.h | 20 +++++++++--------
 kernel/irq/affinity.c     | 56 +++++++++++++++++++++++------------------------
 2 files changed, 38 insertions(+), 38 deletions(-)

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 4a728dba02e2..35e7389c2011 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -251,10 +251,10 @@ struct irq_affinity_notify {
  * @sets:		Number of affinitized sets
  */
 struct irq_affinity {
-	int	pre_vectors;
-	int	post_vectors;
-	int	nr_sets;
-	int	*sets;
+	unsigned int	pre_vectors;
+	unsigned int	post_vectors;
+	unsigned int	nr_sets;
+	unsigned int	*sets;
 };
 
 /**
@@ -314,9 +314,10 @@ extern int
 irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify);
 
 struct irq_affinity_desc *
-irq_create_affinity_masks(int nvec, const struct irq_affinity *affd);
+irq_create_affinity_masks(unsigned int nvec, const struct irq_affinity *affd);
 
-int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity *affd);
+unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec,
+				       const struct irq_affinity *affd);
 
 #else /* CONFIG_SMP */
 
@@ -350,13 +351,14 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
 }
 
 static inline struct irq_affinity_desc *
-irq_create_affinity_masks(int nvec, const struct irq_affinity *affd)
+irq_create_affinity_masks(unsigned int nvec, const struct irq_affinity *affd)
 {
 	return NULL;
 }
 
-static inline int
-irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity *affd)
+static inline unsigned int
+irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec,
+			  const struct irq_affinity *affd)
 {
 	return maxvec;
 }
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 118b66d64a53..82e8799374e9 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -9,7 +9,7 @@
 #include <linux/cpu.h>
 
 static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
-				int cpus_per_vec)
+				unsigned int cpus_per_vec)
 {
 	const struct cpumask *siblmsk;
 	int cpu, sibl;
@@ -95,15 +95,17 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
 }
 
 static int __irq_build_affinity_masks(const struct irq_affinity *affd,
-				      int startvec, int numvecs, int firstvec,
+				      unsigned int startvec,
+				      unsigned int numvecs,
+				      unsigned int firstvec,
 				      cpumask_var_t *node_to_cpumask,
 				      const struct cpumask *cpu_mask,
 				      struct cpumask *nmsk,
 				      struct irq_affinity_desc *masks)
 {
-	int n, nodes, cpus_per_vec, extra_vecs, done = 0;
-	int last_affv = firstvec + numvecs;
-	int curvec = startvec;
+	unsigned int n, nodes, cpus_per_vec, extra_vecs, done = 0;
+	unsigned int last_affv = firstvec + numvecs;
+	unsigned int curvec = startvec;
 	nodemask_t nodemsk = NODE_MASK_NONE;
 
 	if (!cpumask_weight(cpu_mask))
@@ -117,18 +119,16 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd,
 	 */
 	if (numvecs <= nodes) {
 		for_each_node_mask(n, nodemsk) {
-			cpumask_or(&masks[curvec].mask,
-					&masks[curvec].mask,
-					node_to_cpumask[n]);
+			cpumask_or(&masks[curvec].mask, &masks[curvec].mask,
+				   node_to_cpumask[n]);
 			if (++curvec == last_affv)
 				curvec = firstvec;
 		}
-		done = numvecs;
-		goto out;
+		return numvecs;
 	}
 
 	for_each_node_mask(n, nodemsk) {
-		int ncpus, v, vecs_to_assign, vecs_per_node;
+		unsigned int ncpus, v, vecs_to_assign, vecs_per_node;
 
 		/* Spread the vectors per node */
 		vecs_per_node = (numvecs - (curvec - firstvec)) / nodes;
@@ -163,8 +163,6 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd,
 			curvec = firstvec;
 		--nodes;
 	}
-
-out:
 	return done;
 }
 
@@ -174,13 +172,14 @@ out:
  *	2) spread other possible CPUs on these vectors
  */
 static int irq_build_affinity_masks(const struct irq_affinity *affd,
-				    int startvec, int numvecs, int firstvec,
+				    unsigned int startvec, unsigned int numvecs,
+				    unsigned int firstvec,
 				    struct irq_affinity_desc *masks)
 {
-	int curvec = startvec, nr_present, nr_others;
-	int ret = -ENOMEM;
-	cpumask_var_t nmsk, npresmsk;
+	unsigned int curvec = startvec, nr_present, nr_others;
 	cpumask_var_t *node_to_cpumask;
+	cpumask_var_t nmsk, npresmsk;
+	int ret = -ENOMEM;
 
 	if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
 		return ret;
@@ -239,12 +238,10 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd,
  * Returns the irq_affinity_desc pointer or NULL if allocation failed.
  */
 struct irq_affinity_desc *
-irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
+irq_create_affinity_masks(unsigned int nvecs, const struct irq_affinity *affd)
 {
-	int affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
-	int curvec, usedvecs;
+	unsigned int affvecs, curvec, usedvecs, nr_sets, i;
 	struct irq_affinity_desc *masks = NULL;
-	int i, nr_sets;
 
 	/*
 	 * If there aren't any vectors left after applying the pre/post
@@ -264,16 +261,17 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 	 * Spread on present CPUs starting from affd->pre_vectors. If we
 	 * have multiple sets, build each sets affinity mask separately.
 	 */
+	affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
 	nr_sets = affd->nr_sets;
 	if (!nr_sets)
 		nr_sets = 1;
 
 	for (i = 0, usedvecs = 0; i < nr_sets; i++) {
-		int this_vecs = affd->sets ? affd->sets[i] : affvecs;
+		unsigned int this_vecs = affd->sets ? affd->sets[i] : affvecs;
 		int ret;
 
 		ret = irq_build_affinity_masks(affd, curvec, this_vecs,
-						curvec, masks);
+					       curvec, masks);
 		if (ret) {
 			kfree(masks);
 			return NULL;
@@ -303,17 +301,17 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
  * @maxvec:	The maximum number of vectors available
  * @affd:	Description of the affinity requirements
  */
-int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity *affd)
+unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec,
+				       const struct irq_affinity *affd)
 {
-	int resv = affd->pre_vectors + affd->post_vectors;
-	int vecs = maxvec - resv;
-	int set_vecs;
+	unsigned int resv = affd->pre_vectors + affd->post_vectors;
+	unsigned int set_vecs;
 
 	if (resv > minvec)
 		return 0;
 
 	if (affd->nr_sets) {
-		int i;
+		unsigned int i;
 
 		for (i = 0, set_vecs = 0;  i < affd->nr_sets; i++)
 			set_vecs += affd->sets[i];
@@ -323,5 +321,5 @@ int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity
 		put_online_cpus();
 	}
 
-	return resv + min(set_vecs, vecs);
+	return resv + min(set_vecs, maxvec - resv);
 }

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [tip:irq/core] genirq/affinity: Store interrupt sets size in struct irq_affinity
  2019-02-16 17:13 ` [patch v6 2/7] genirq/affinity: Store interrupt sets size in struct irq_affinity Thomas Gleixner
@ 2019-02-18 10:26   ` tip-bot for Ming Lei
  0 siblings, 0 replies; 27+ messages in thread
From: tip-bot for Ming Lei @ 2019-02-18 10:26 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, marc.zyngier, tglx, shivasharan.srikanteshwara,
	sumit.saxena, keith.busch, hch, ming.lei, hpa, axboe,
	kashyap.desai, mingo, sagi, helgaas

Commit-ID:  9cfef55bb57e7620c63087be18a76351628f8d0f
Gitweb:     https://git.kernel.org/tip/9cfef55bb57e7620c63087be18a76351628f8d0f
Author:     Ming Lei <ming.lei@redhat.com>
AuthorDate: Sat, 16 Feb 2019 18:13:08 +0100
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Mon, 18 Feb 2019 11:21:27 +0100

genirq/affinity: Store interrupt sets size in struct irq_affinity

The interrupt affinity spreading mechanism supports to spread out
affinities for one or more interrupt sets. A interrupt set contains one
or more interrupts. Each set is mapped to a specific functionality of a
device, e.g. general I/O queues and read I/O queus of multiqueue block
devices.

The number of interrupts per set is defined by the driver. It depends on
the total number of available interrupts for the device, which is
determined by the PCI capabilites and the availability of underlying CPU
resources, and the number of queues which the device provides and the
driver wants to instantiate.

The driver passes initial configuration for the interrupt allocation via
a pointer to struct irq_affinity.

Right now the allocation mechanism is complex as it requires to have a
loop in the driver to determine the maximum number of interrupts which
are provided by the PCI capabilities and the underlying CPU resources.
This loop would have to be replicated in every driver which wants to
utilize this mechanism. That's unwanted code duplication and error
prone.

In order to move this into generic facilities it is required to have a
mechanism, which allows the recalculation of the interrupt sets and
their size, in the core code. As the core code does not have any
knowledge about the underlying device, a driver specific callback will
be added to struct affinity_desc, which will be invoked by the core
code. The callback will get the number of available interupts as an
argument, so the driver can calculate the corresponding number and size
of interrupt sets.

To support this, two modifications for the handling of struct irq_affinity
are required:

1) The (optional) interrupt sets size information is contained in a
   separate array of integers and struct irq_affinity contains a
   pointer to it.

   This is cumbersome and as the maximum number of interrupt sets is small,
   there is no reason to have separate storage. Moving the size array into
   struct affinity_desc avoids indirections and makes the code simpler.

2) At the moment the struct irq_affinity pointer which is handed in from
   the driver and passed through to several core functions is marked
   'const'.

   With the upcoming callback to recalculate the number and size of
   interrupt sets, it's necessary to remove the 'const'
   qualifier. Otherwise the callback would not be able to update the data.

Implement #1 and store the interrupt sets size in 'struct irq_affinity'.

No functional change.

[ tglx: Fixed the memcpy() size so it won't copy beyond the size of the
  	source. Fixed the kernel doc comments for struct irq_affinity and
  	de-'This patch'-ed the changelog ]

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Bjorn Helgaas <helgaas@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: linux-block@vger.kernel.org
Cc: Sagi Grimberg <sagi@grimberg.me>
Cc: linux-nvme@lists.infradead.org
Cc: linux-pci@vger.kernel.org
Cc: Keith Busch <keith.busch@intel.com>
Cc: Sumit Saxena <sumit.saxena@broadcom.com>
Cc: Kashyap Desai <kashyap.desai@broadcom.com>
Cc: Shivasharan Srikanteshwara <shivasharan.srikanteshwara@broadcom.com>
Link: https://lkml.kernel.org/r/20190216172228.423723127@linutronix.de

---
 drivers/nvme/host/pci.c   |  7 +++----
 include/linux/interrupt.h |  9 ++++++---
 kernel/irq/affinity.c     | 16 ++++++++++++----
 3 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 9bc585415d9b..21ffd671b6ed 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2081,12 +2081,11 @@ static void nvme_calc_io_queues(struct nvme_dev *dev, unsigned int irq_queues)
 static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
 {
 	struct pci_dev *pdev = to_pci_dev(dev->dev);
-	int irq_sets[2];
 	struct irq_affinity affd = {
-		.pre_vectors = 1,
-		.nr_sets = ARRAY_SIZE(irq_sets),
-		.sets = irq_sets,
+		.pre_vectors	= 1,
+		.nr_sets	= 2,
 	};
+	unsigned int *irq_sets = affd.set_size;
 	int result = 0;
 	unsigned int irq_queues, this_p_queues;

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 35e7389c2011..5afdfd5dc39b 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -241,20 +241,23 @@ struct irq_affinity_notify {
 	void (*release)(struct kref *ref);
 };

+#define	IRQ_AFFINITY_MAX_SETS  4
+
 /**
  * struct irq_affinity - Description for automatic irq affinity assignements
  * @pre_vectors:	Don't apply affinity to @pre_vectors at beginning of
  *			the MSI(-X) vector space
  * @post_vectors:	Don't apply affinity to @post_vectors at end of
  *			the MSI(-X) vector space
- * @nr_sets:		Length of passed in *sets array
- * @sets:		Number of affinitized sets
+ * @nr_sets:		The number of interrupt sets for which affinity
+ *			spreading is required
+ * @set_size:		Array holding the size of each interrupt set
  */
 struct irq_affinity {
 	unsigned int	pre_vectors;
 	unsigned int	post_vectors;
 	unsigned int	nr_sets;
-	unsigned int	*sets;
+	unsigned int	set_size[IRQ_AFFINITY_MAX_SETS];
 };

 /**
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 82e8799374e9..278289c091bb 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -238,9 +238,10 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd,
  * Returns the irq_affinity_desc pointer or NULL if allocation failed.
  */
 struct irq_affinity_desc *
-irq_create_affinity_masks(unsigned int nvecs, const struct irq_affinity *affd)
+irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
 {
 	unsigned int affvecs, curvec, usedvecs, nr_sets, i;
+	unsigned int set_size[IRQ_AFFINITY_MAX_SETS];
 	struct irq_affinity_desc *masks = NULL;

 	/*
@@ -250,6 +251,9 @@ irq_create_affinity_masks(unsigned int nvecs, const struct irq_affinity *affd)
 	if (nvecs == affd->pre_vectors + affd->post_vectors)
 		return NULL;

+	if (WARN_ON_ONCE(affd->nr_sets > IRQ_AFFINITY_MAX_SETS))
+		return NULL;
+
 	masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL);
 	if (!masks)
 		return NULL;
@@ -263,11 +267,15 @@ irq_create_affinity_masks(unsigned int nvecs, const struct irq_affinity *affd)
 	 */
 	affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
 	nr_sets = affd->nr_sets;
-	if (!nr_sets)
+	if (!nr_sets) {
 		nr_sets = 1;
+		set_size[0] = affvecs;
+	} else {
+		memcpy(set_size, affd->set_size, nr_sets * sizeof(unsigned int));
+	}

 	for (i = 0, usedvecs = 0; i < nr_sets; i++) {
-		unsigned int this_vecs = affd->sets ? affd->sets[i] : affvecs;
+		unsigned int this_vecs = set_size[i];
 		int ret;

 		ret = irq_build_affinity_masks(affd, curvec, this_vecs,
@@ -314,7 +322,7 @@ unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec,
 		unsigned int i;

 		for (i = 0, set_vecs = 0;  i < affd->nr_sets; i++)
-			set_vecs += affd->sets[i];
+			set_vecs += affd->set_size[i];
 	} else {
 		get_online_cpus();
 		set_vecs = cpumask_weight(cpu_possible_mask);

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [tip:irq/core] genirq/affinity: Add new callback for (re)calculating interrupt sets
  2019-02-16 17:13 ` [patch v6 3/7] genirq/affinity: Add new callback for (re)calculating interrupt sets Thomas Gleixner
@ 2019-02-18 10:27   ` tip-bot for Ming Lei
  2021-06-15 19:57   ` [patch v6 3/7] " Bjorn Helgaas
  1 sibling, 0 replies; 27+ messages in thread
From: tip-bot for Ming Lei @ 2019-02-18 10:27 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: axboe, marc.zyngier, sagi, linux-kernel, keith.busch,
	sumit.saxena, shivasharan.srikanteshwara, hpa, mingo, tglx,
	kashyap.desai, hch, helgaas, ming.lei

Commit-ID:  c66d4bd110a1f8a68c1a88bfbf866eb50c6464b7
Gitweb:     https://git.kernel.org/tip/c66d4bd110a1f8a68c1a88bfbf866eb50c6464b7
Author:     Ming Lei <ming.lei@redhat.com>
AuthorDate: Sat, 16 Feb 2019 18:13:09 +0100
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Mon, 18 Feb 2019 11:21:28 +0100

genirq/affinity: Add new callback for (re)calculating interrupt sets

The interrupt affinity spreading mechanism supports to spread out
affinities for one or more interrupt sets. A interrupt set contains one or
more interrupts. Each set is mapped to a specific functionality of a
device, e.g. general I/O queues and read I/O queus of multiqueue block
devices.

The number of interrupts per set is defined by the driver. It depends on
the total number of available interrupts for the device, which is
determined by the PCI capabilites and the availability of underlying CPU
resources, and the number of queues which the device provides and the
driver wants to instantiate.

The driver passes initial configuration for the interrupt allocation via a
pointer to struct irq_affinity.

Right now the allocation mechanism is complex as it requires to have a loop
in the driver to determine the maximum number of interrupts which are
provided by the PCI capabilities and the underlying CPU resources.  This
loop would have to be replicated in every driver which wants to utilize
this mechanism. That's unwanted code duplication and error prone.

In order to move this into generic facilities it is required to have a
mechanism, which allows the recalculation of the interrupt sets and their
size, in the core code. As the core code does not have any knowledge about the
underlying device, a driver specific callback is required in struct
irq_affinity, which can be invoked by the core code. The callback gets the
number of available interupts as an argument, so the driver can calculate the
corresponding number and size of interrupt sets.

At the moment the struct irq_affinity pointer which is handed in from the
driver and passed through to several core functions is marked 'const', but for
the callback to be able to modify the data in the struct it's required to
remove the 'const' qualifier.

Add the optional callback to struct irq_affinity, which allows drivers to
recalculate the number and size of interrupt sets and remove the 'const'
qualifier.

For simple invocations, which do not supply a callback, a default callback
is installed, which just sets nr_sets to 1 and transfers the number of
spreadable vectors to the set_size array at index 0.

This is for now guarded by a check for nr_sets != 0 to keep the NVME driver
working until it is converted to the callback mechanism.

To make sure that the driver configuration is correct under all circumstances
the callback is invoked even when there are no interrupts for queues left,
i.e. the pre/post requirements already exhaust the numner of available
interrupts.

At the PCI layer irq_create_affinity_masks() has to be invoked even for the
case where the legacy interrupt is used. That ensures that the callback is
invoked and the device driver can adjust to that situation.

[ tglx: Fixed the simple case (no sets required). Moved the sanity check
  	for nr_sets after the invocation of the callback so it catches
  	broken drivers. Fixed the kernel doc comments for struct
  	irq_affinity and de-'This patch'-ed the changelog ]

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Bjorn Helgaas <helgaas@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: linux-block@vger.kernel.org
Cc: Sagi Grimberg <sagi@grimberg.me>
Cc: linux-nvme@lists.infradead.org
Cc: linux-pci@vger.kernel.org
Cc: Keith Busch <keith.busch@intel.com>
Cc: Sumit Saxena <sumit.saxena@broadcom.com>
Cc: Kashyap Desai <kashyap.desai@broadcom.com>
Cc: Shivasharan Srikanteshwara <shivasharan.srikanteshwara@broadcom.com>
Link: https://lkml.kernel.org/r/20190216172228.512444498@linutronix.de


---
 drivers/pci/msi.c               | 25 +++++++++++------
 drivers/scsi/be2iscsi/be_main.c |  2 +-
 include/linux/interrupt.h       | 10 +++++--
 include/linux/pci.h             |  4 +--
 kernel/irq/affinity.c           | 62 +++++++++++++++++++++++++++++------------
 5 files changed, 71 insertions(+), 32 deletions(-)

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 4c0b47867258..7149d6315726 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -532,7 +532,7 @@ error_attrs:
 }
 
 static struct msi_desc *
-msi_setup_entry(struct pci_dev *dev, int nvec, const struct irq_affinity *affd)
+msi_setup_entry(struct pci_dev *dev, int nvec, struct irq_affinity *affd)
 {
 	struct irq_affinity_desc *masks = NULL;
 	struct msi_desc *entry;
@@ -597,7 +597,7 @@ static int msi_verify_entries(struct pci_dev *dev)
  * which could have been allocated.
  */
 static int msi_capability_init(struct pci_dev *dev, int nvec,
-			       const struct irq_affinity *affd)
+			       struct irq_affinity *affd)
 {
 	struct msi_desc *entry;
 	int ret;
@@ -669,7 +669,7 @@ static void __iomem *msix_map_region(struct pci_dev *dev, unsigned nr_entries)
 
 static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
 			      struct msix_entry *entries, int nvec,
-			      const struct irq_affinity *affd)
+			      struct irq_affinity *affd)
 {
 	struct irq_affinity_desc *curmsk, *masks = NULL;
 	struct msi_desc *entry;
@@ -736,7 +736,7 @@ static void msix_program_entries(struct pci_dev *dev,
  * requested MSI-X entries with allocated irqs or non-zero for otherwise.
  **/
 static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries,
-				int nvec, const struct irq_affinity *affd)
+				int nvec, struct irq_affinity *affd)
 {
 	int ret;
 	u16 control;
@@ -932,7 +932,7 @@ int pci_msix_vec_count(struct pci_dev *dev)
 EXPORT_SYMBOL(pci_msix_vec_count);
 
 static int __pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries,
-			     int nvec, const struct irq_affinity *affd)
+			     int nvec, struct irq_affinity *affd)
 {
 	int nr_entries;
 	int i, j;
@@ -1018,7 +1018,7 @@ int pci_msi_enabled(void)
 EXPORT_SYMBOL(pci_msi_enabled);
 
 static int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec,
-				  const struct irq_affinity *affd)
+				  struct irq_affinity *affd)
 {
 	int nvec;
 	int rc;
@@ -1086,7 +1086,7 @@ EXPORT_SYMBOL(pci_enable_msi);
 
 static int __pci_enable_msix_range(struct pci_dev *dev,
 				   struct msix_entry *entries, int minvec,
-				   int maxvec, const struct irq_affinity *affd)
+				   int maxvec, struct irq_affinity *affd)
 {
 	int rc, nvec = maxvec;
 
@@ -1165,9 +1165,9 @@ EXPORT_SYMBOL(pci_enable_msix_range);
  */
 int pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs,
 				   unsigned int max_vecs, unsigned int flags,
-				   const struct irq_affinity *affd)
+				   struct irq_affinity *affd)
 {
-	static const struct irq_affinity msi_default_affd;
+	struct irq_affinity msi_default_affd = {0};
 	int msix_vecs = -ENOSPC;
 	int msi_vecs = -ENOSPC;
 
@@ -1196,6 +1196,13 @@ int pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs,
 	/* use legacy irq if allowed */
 	if (flags & PCI_IRQ_LEGACY) {
 		if (min_vecs == 1 && dev->irq) {
+			/*
+			 * Invoke the affinity spreading logic to ensure that
+			 * the device driver can adjust queue configuration
+			 * for the single interrupt case.
+			 */
+			if (affd)
+				irq_create_affinity_masks(1, affd);
 			pci_intx(dev, 1);
 			return 1;
 		}
diff --git a/drivers/scsi/be2iscsi/be_main.c b/drivers/scsi/be2iscsi/be_main.c
index 74e260027c7d..76e49d902609 100644
--- a/drivers/scsi/be2iscsi/be_main.c
+++ b/drivers/scsi/be2iscsi/be_main.c
@@ -3566,7 +3566,7 @@ static void be2iscsi_enable_msix(struct beiscsi_hba *phba)
 
 	/* if eqid_count == 1 fall back to INTX */
 	if (enable_msix && nvec > 1) {
-		const struct irq_affinity desc = { .post_vectors = 1 };
+		struct irq_affinity desc = { .post_vectors = 1 };
 
 		if (pci_alloc_irq_vectors_affinity(phba->pcidev, 2, nvec,
 				PCI_IRQ_MSIX | PCI_IRQ_AFFINITY, &desc) < 0) {
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 5afdfd5dc39b..dcdddf4fa76b 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -252,12 +252,18 @@ struct irq_affinity_notify {
  * @nr_sets:		The number of interrupt sets for which affinity
  *			spreading is required
  * @set_size:		Array holding the size of each interrupt set
+ * @calc_sets:		Callback for calculating the number and size
+ *			of interrupt sets
+ * @priv:		Private data for usage by @calc_sets, usually a
+ *			pointer to driver/device specific data.
  */
 struct irq_affinity {
 	unsigned int	pre_vectors;
 	unsigned int	post_vectors;
 	unsigned int	nr_sets;
 	unsigned int	set_size[IRQ_AFFINITY_MAX_SETS];
+	void		(*calc_sets)(struct irq_affinity *, unsigned int nvecs);
+	void		*priv;
 };
 
 /**
@@ -317,7 +323,7 @@ extern int
 irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify);
 
 struct irq_affinity_desc *
-irq_create_affinity_masks(unsigned int nvec, const struct irq_affinity *affd);
+irq_create_affinity_masks(unsigned int nvec, struct irq_affinity *affd);
 
 unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec,
 				       const struct irq_affinity *affd);
@@ -354,7 +360,7 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
 }
 
 static inline struct irq_affinity_desc *
-irq_create_affinity_masks(unsigned int nvec, const struct irq_affinity *affd)
+irq_create_affinity_masks(unsigned int nvec, struct irq_affinity *affd)
 {
 	return NULL;
 }
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 65f1d8c2f082..e7c51b00cdfe 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1393,7 +1393,7 @@ static inline int pci_enable_msix_exact(struct pci_dev *dev,
 }
 int pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs,
 				   unsigned int max_vecs, unsigned int flags,
-				   const struct irq_affinity *affd);
+				   struct irq_affinity *affd);
 
 void pci_free_irq_vectors(struct pci_dev *dev);
 int pci_irq_vector(struct pci_dev *dev, unsigned int nr);
@@ -1419,7 +1419,7 @@ static inline int pci_enable_msix_exact(struct pci_dev *dev,
 static inline int
 pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs,
 			       unsigned int max_vecs, unsigned int flags,
-			       const struct irq_affinity *aff_desc)
+			       struct irq_affinity *aff_desc)
 {
 	if ((flags & PCI_IRQ_LEGACY) && min_vecs == 1 && dev->irq)
 		return 1;
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 278289c091bb..d737dc60ab52 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -230,6 +230,12 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd,
 	return ret;
 }
 
+static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs)
+{
+	affd->nr_sets = 1;
+	affd->set_size[0] = affvecs;
+}
+
 /**
  * irq_create_affinity_masks - Create affinity masks for multiqueue spreading
  * @nvecs:	The total number of vectors
@@ -240,20 +246,46 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd,
 struct irq_affinity_desc *
 irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
 {
-	unsigned int affvecs, curvec, usedvecs, nr_sets, i;
-	unsigned int set_size[IRQ_AFFINITY_MAX_SETS];
+	unsigned int affvecs, curvec, usedvecs, i;
 	struct irq_affinity_desc *masks = NULL;
 
 	/*
-	 * If there aren't any vectors left after applying the pre/post
-	 * vectors don't bother with assigning affinity.
+	 * Determine the number of vectors which need interrupt affinities
+	 * assigned. If the pre/post request exhausts the available vectors
+	 * then nothing to do here except for invoking the calc_sets()
+	 * callback so the device driver can adjust to the situation. If there
+	 * is only a single vector, then managing the queue is pointless as
+	 * well.
 	 */
-	if (nvecs == affd->pre_vectors + affd->post_vectors)
-		return NULL;
+	if (nvecs > 1 && nvecs > affd->pre_vectors + affd->post_vectors)
+		affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
+	else
+		affvecs = 0;
+
+	/*
+	 * Simple invocations do not provide a calc_sets() callback. Install
+	 * the generic one. The check for affd->nr_sets is a temporary
+	 * workaround and will be removed after the NVME driver is converted
+	 * over.
+	 */
+	if (!affd->nr_sets && !affd->calc_sets)
+		affd->calc_sets = default_calc_sets;
+
+	/*
+	 * If the device driver provided a calc_sets() callback let it
+	 * recalculate the number of sets and their size. The check will go
+	 * away once the NVME driver is converted over.
+	 */
+	if (affd->calc_sets)
+		affd->calc_sets(affd, affvecs);
 
 	if (WARN_ON_ONCE(affd->nr_sets > IRQ_AFFINITY_MAX_SETS))
 		return NULL;
 
+	/* Nothing to assign? */
+	if (!affvecs)
+		return NULL;
+
 	masks = kcalloc(nvecs, sizeof(*masks), GFP_KERNEL);
 	if (!masks)
 		return NULL;
@@ -261,21 +293,13 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
 	/* Fill out vectors at the beginning that don't need affinity */
 	for (curvec = 0; curvec < affd->pre_vectors; curvec++)
 		cpumask_copy(&masks[curvec].mask, irq_default_affinity);
+
 	/*
 	 * Spread on present CPUs starting from affd->pre_vectors. If we
 	 * have multiple sets, build each sets affinity mask separately.
 	 */
-	affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
-	nr_sets = affd->nr_sets;
-	if (!nr_sets) {
-		nr_sets = 1;
-		set_size[0] = affvecs;
-	} else {
-		memcpy(set_size, affd->set_size, nr_sets * sizeof(unsigned int));
-	}
-
-	for (i = 0, usedvecs = 0; i < nr_sets; i++) {
-		unsigned int this_vecs = set_size[i];
+	for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) {
+		unsigned int this_vecs = affd->set_size[i];
 		int ret;
 
 		ret = irq_build_affinity_masks(affd, curvec, this_vecs,
@@ -318,7 +342,9 @@ unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec,
 	if (resv > minvec)
 		return 0;
 
-	if (affd->nr_sets) {
+	if (affd->calc_sets) {
+		set_vecs = maxvec - resv;
+	} else if (affd->nr_sets) {
 		unsigned int i;
 
 		for (i = 0, set_vecs = 0;  i < affd->nr_sets; i++)

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [tip:irq/core] nvme-pci: Simplify interrupt allocation
  2019-02-16 17:13 ` [patch v6 4/7] nvme-pci: Simplify interrupt allocation Thomas Gleixner
@ 2019-02-18 10:27   ` tip-bot for Ming Lei
  0 siblings, 0 replies; 27+ messages in thread
From: tip-bot for Ming Lei @ 2019-02-18 10:27 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: marc.zyngier, tglx, hch, mingo, ming.lei, sumit.saxena, hpa,
	shivasharan.srikanteshwara, keith.busch, axboe, linux-kernel,
	sagi, kashyap.desai, helgaas

Commit-ID:  612b72862b4dd7f3f5e42651522daac6733b8ea6
Gitweb:     https://git.kernel.org/tip/612b72862b4dd7f3f5e42651522daac6733b8ea6
Author:     Ming Lei <ming.lei@redhat.com>
AuthorDate: Sat, 16 Feb 2019 18:13:10 +0100
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Mon, 18 Feb 2019 11:21:28 +0100

nvme-pci: Simplify interrupt allocation

The NVME PCI driver contains a tedious mechanism for interrupt
allocation, which is necessary to adjust the number and size of interrupt
sets to the maximum available number of interrupts which depends on the
underlying PCI capabilities and the available CPU resources.

It works around the former short comings of the PCI and core interrupt
allocation mechanims in combination with interrupt sets.

The PCI interrupt allocation function allows to provide a maximum and a
minimum number of interrupts to be allocated and tries to allocate as
many as possible. This worked without driver interaction as long as there
was only a single set of interrupts to handle.

With the addition of support for multiple interrupt sets in the generic
affinity spreading logic, which is invoked from the PCI interrupt
allocation, the adaptive loop in the PCI interrupt allocation did not
work for multiple interrupt sets. The reason is that depending on the
total number of interrupts which the PCI allocation adaptive loop tries
to allocate in each step, the number and the size of the interrupt sets
need to be adapted as well. Due to the way the interrupt sets support was
implemented there was no way for the PCI interrupt allocation code or the
core affinity spreading mechanism to invoke a driver specific function
for adapting the interrupt sets configuration.

As a consequence the driver had to implement another adaptive loop around
the PCI interrupt allocation function and calling that with maximum and
minimum interrupts set to the same value. This ensured that the
allocation either succeeded or immediately failed without any attempt to
adjust the number of interrupts in the PCI code.

The core code now allows drivers to provide a callback to recalculate the
number and the size of interrupt sets during PCI interrupt allocation,
which in turn allows the PCI interrupt allocation function to be called
in the same way as with a single set of interrupts. The PCI code handles
the adaptive loop and the interrupt affinity spreading mechanism invokes
the driver callback to adapt the interrupt set configuration to the
current loop value. This replaces the adaptive loop in the driver
completely.

Implement the NVME specific callback which adjusts the interrupt sets
configuration and remove the adaptive allocation loop.

[ tglx: Simplify the callback further and restore the dropped adjustment of
  	number of sets ]

Signed-off-by: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Bjorn Helgaas <helgaas@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: linux-block@vger.kernel.org
Cc: Sagi Grimberg <sagi@grimberg.me>
Cc: linux-nvme@lists.infradead.org
Cc: linux-pci@vger.kernel.org
Cc: Keith Busch <keith.busch@intel.com>
Cc: Sumit Saxena <sumit.saxena@broadcom.com>
Cc: Kashyap Desai <kashyap.desai@broadcom.com>
Cc: Shivasharan Srikanteshwara <shivasharan.srikanteshwara@broadcom.com>
Link: https://lkml.kernel.org/r/20190216172228.602546658@linutronix.de


---
 drivers/nvme/host/pci.c | 114 ++++++++++++++++--------------------------------
 1 file changed, 38 insertions(+), 76 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 21ffd671b6ed..a0fdd5fb4e7a 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2041,41 +2041,42 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
 	return ret;
 }
 
-/* irq_queues covers admin queue */
-static void nvme_calc_io_queues(struct nvme_dev *dev, unsigned int irq_queues)
+/*
+ * nirqs is the number of interrupts available for write and read
+ * queues. The core already reserved an interrupt for the admin queue.
+ */
+static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs)
 {
-	unsigned int this_w_queues = write_queues;
-
-	WARN_ON(!irq_queues);
-
-	/*
-	 * Setup read/write queue split, assign admin queue one independent
-	 * irq vector if irq_queues is > 1.
-	 */
-	if (irq_queues <= 2) {
-		dev->io_queues[HCTX_TYPE_DEFAULT] = 1;
-		dev->io_queues[HCTX_TYPE_READ] = 0;
-		return;
-	}
-
-	/*
-	 * If 'write_queues' is set, ensure it leaves room for at least
-	 * one read queue and one admin queue
-	 */
-	if (this_w_queues >= irq_queues)
-		this_w_queues = irq_queues - 2;
+	struct nvme_dev *dev = affd->priv;
+	unsigned int nr_read_queues;
 
 	/*
-	 * If 'write_queues' is set to zero, reads and writes will share
-	 * a queue set.
+	 * If there is no interupt available for queues, ensure that
+	 * the default queue is set to 1. The affinity set size is
+	 * also set to one, but the irq core ignores it for this case.
+	 *
+	 * If only one interrupt is available or 'write_queue' == 0, combine
+	 * write and read queues.
+	 *
+	 * If 'write_queues' > 0, ensure it leaves room for at least one read
+	 * queue.
 	 */
-	if (!this_w_queues) {
-		dev->io_queues[HCTX_TYPE_DEFAULT] = irq_queues - 1;
-		dev->io_queues[HCTX_TYPE_READ] = 0;
+	if (!nrirqs) {
+		nrirqs = 1;
+		nr_read_queues = 0;
+	} else if (nrirqs == 1 || !write_queues) {
+		nr_read_queues = 0;
+	} else if (write_queues >= nrirqs) {
+		nr_read_queues = 1;
 	} else {
-		dev->io_queues[HCTX_TYPE_DEFAULT] = this_w_queues;
-		dev->io_queues[HCTX_TYPE_READ] = irq_queues - this_w_queues - 1;
+		nr_read_queues = nrirqs - write_queues;
 	}
+
+	dev->io_queues[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
+	affd->set_size[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
+	dev->io_queues[HCTX_TYPE_READ] = nr_read_queues;
+	affd->set_size[HCTX_TYPE_READ] = nr_read_queues;
+	affd->nr_sets = nr_read_queues ? 2 : 1;
 }
 
 static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
@@ -2083,10 +2084,9 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
 	struct pci_dev *pdev = to_pci_dev(dev->dev);
 	struct irq_affinity affd = {
 		.pre_vectors	= 1,
-		.nr_sets	= 2,
+		.calc_sets	= nvme_calc_irq_sets,
+		.priv		= dev,
 	};
-	unsigned int *irq_sets = affd.set_size;
-	int result = 0;
 	unsigned int irq_queues, this_p_queues;
 
 	/*
@@ -2102,51 +2102,12 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
 	}
 	dev->io_queues[HCTX_TYPE_POLL] = this_p_queues;
 
-	/*
-	 * For irq sets, we have to ask for minvec == maxvec. This passes
-	 * any reduction back to us, so we can adjust our queue counts and
-	 * IRQ vector needs.
-	 */
-	do {
-		nvme_calc_io_queues(dev, irq_queues);
-		irq_sets[0] = dev->io_queues[HCTX_TYPE_DEFAULT];
-		irq_sets[1] = dev->io_queues[HCTX_TYPE_READ];
-		if (!irq_sets[1])
-			affd.nr_sets = 1;
-
-		/*
-		 * If we got a failure and we're down to asking for just
-		 * 1 + 1 queues, just ask for a single vector. We'll share
-		 * that between the single IO queue and the admin queue.
-		 * Otherwise, we assign one independent vector to admin queue.
-		 */
-		if (irq_queues > 1)
-			irq_queues = irq_sets[0] + irq_sets[1] + 1;
+	/* Initialize for the single interrupt case */
+	dev->io_queues[HCTX_TYPE_DEFAULT] = 1;
+	dev->io_queues[HCTX_TYPE_READ] = 0;
 
-		result = pci_alloc_irq_vectors_affinity(pdev, irq_queues,
-				irq_queues,
-				PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
-
-		/*
-		 * Need to reduce our vec counts. If we get ENOSPC, the
-		 * platform should support mulitple vecs, we just need
-		 * to decrease our ask. If we get EINVAL, the platform
-		 * likely does not. Back down to ask for just one vector.
-		 */
-		if (result == -ENOSPC) {
-			irq_queues--;
-			if (!irq_queues)
-				return result;
-			continue;
-		} else if (result == -EINVAL) {
-			irq_queues = 1;
-			continue;
-		} else if (result <= 0)
-			return -EIO;
-		break;
-	} while (1);
-
-	return result;
+	return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues,
+			      PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
 }
 
 static void nvme_disable_io_queues(struct nvme_dev *dev)
@@ -3019,6 +2980,7 @@ static struct pci_driver nvme_driver = {
 
 static int __init nvme_init(void)
 {
+	BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2);
 	return pci_register_driver(&nvme_driver);
 }
 

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [tip:irq/core] genirq/affinity: Remove the leftovers of the original set support
  2019-02-16 17:13 ` [patch v6 5/7] genirq/affinity: Remove the leftovers of the original set support Thomas Gleixner
  2019-02-17 13:39   ` Ming Lei
@ 2019-02-18 10:28   ` tip-bot for Thomas Gleixner
  1 sibling, 0 replies; 27+ messages in thread
From: tip-bot for Thomas Gleixner @ 2019-02-18 10:28 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: tglx, keith.busch, mingo, helgaas, sumit.saxena,
	shivasharan.srikanteshwara, linux-kernel, kashyap.desai, hch,
	sagi, marc.zyngier, axboe, hpa, ming.lei

Commit-ID:  a6a309edba13866b31dc4d8aebad3864a6d56ade
Gitweb:     https://git.kernel.org/tip/a6a309edba13866b31dc4d8aebad3864a6d56ade
Author:     Thomas Gleixner <tglx@linutronix.de>
AuthorDate: Sat, 16 Feb 2019 18:13:11 +0100
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Mon, 18 Feb 2019 11:21:29 +0100

genirq/affinity: Remove the leftovers of the original set support

Now that the NVME driver is converted over to the calc_set() callback, the
workarounds of the original set support can be removed.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Bjorn Helgaas <helgaas@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: linux-block@vger.kernel.org
Cc: Sagi Grimberg <sagi@grimberg.me>
Cc: linux-nvme@lists.infradead.org
Cc: linux-pci@vger.kernel.org
Cc: Keith Busch <keith.busch@intel.com>
Cc: Sumit Saxena <sumit.saxena@broadcom.com>
Cc: Kashyap Desai <kashyap.desai@broadcom.com>
Cc: Shivasharan Srikanteshwara <shivasharan.srikanteshwara@broadcom.com>
Link: https://lkml.kernel.org/r/20190216172228.689834224@linutronix.de

---
 kernel/irq/affinity.c | 20 ++++----------------
 1 file changed, 4 insertions(+), 16 deletions(-)

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index d737dc60ab52..f18cd5aa33e8 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -264,20 +264,13 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
 
 	/*
 	 * Simple invocations do not provide a calc_sets() callback. Install
-	 * the generic one. The check for affd->nr_sets is a temporary
-	 * workaround and will be removed after the NVME driver is converted
-	 * over.
+	 * the generic one.
 	 */
-	if (!affd->nr_sets && !affd->calc_sets)
+	if (!affd->calc_sets)
 		affd->calc_sets = default_calc_sets;
 
-	/*
-	 * If the device driver provided a calc_sets() callback let it
-	 * recalculate the number of sets and their size. The check will go
-	 * away once the NVME driver is converted over.
-	 */
-	if (affd->calc_sets)
-		affd->calc_sets(affd, affvecs);
+	/* Recalculate the sets */
+	affd->calc_sets(affd, affvecs);
 
 	if (WARN_ON_ONCE(affd->nr_sets > IRQ_AFFINITY_MAX_SETS))
 		return NULL;
@@ -344,11 +337,6 @@ unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec,
 
 	if (affd->calc_sets) {
 		set_vecs = maxvec - resv;
-	} else if (affd->nr_sets) {
-		unsigned int i;
-
-		for (i = 0, set_vecs = 0;  i < affd->nr_sets; i++)
-			set_vecs += affd->set_size[i];
 	} else {
 		get_online_cpus();
 		set_vecs = cpumask_weight(cpu_possible_mask);

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [tip:irq/core] PCI/MSI: Remove obsolete sanity checks for multiple interrupt sets
  2019-02-16 17:13 ` [patch v6 6/7] PCI/MSI: Remove obsolete sanity checks for multiple interrupt sets Thomas Gleixner
  2019-02-17 13:39   ` Ming Lei
@ 2019-02-18 10:29   ` tip-bot for Thomas Gleixner
  1 sibling, 0 replies; 27+ messages in thread
From: tip-bot for Thomas Gleixner @ 2019-02-18 10:29 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: marc.zyngier, axboe, kashyap.desai, hpa, keith.busch, hch, tglx,
	sumit.saxena, sagi, ming.lei, helgaas, linux-kernel,
	shivasharan.srikanteshwara, mingo

Commit-ID:  4e6b26d23dc1faee318796d5c7f91b5692b1e6be
Gitweb:     https://git.kernel.org/tip/4e6b26d23dc1faee318796d5c7f91b5692b1e6be
Author:     Thomas Gleixner <tglx@linutronix.de>
AuthorDate: Sat, 16 Feb 2019 18:13:12 +0100
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Mon, 18 Feb 2019 11:21:29 +0100

PCI/MSI: Remove obsolete sanity checks for multiple interrupt sets

Multiple interrupt sets for affinity spreading are now handled in the core
code and the number of sets and their size is recalculated via a driver
supplied callback.

That avoids the requirement to invoke pci_alloc_irq_vectors_affinity() with
the arguments minvecs and maxvecs set to the same value and the callsite
handling the ENOSPC situation.

Remove the now obsolete sanity checks and the related comments.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Ming Lei <ming.lei@redhat.com>
Acked-by: Marc Zyngier <marc.zyngier@arm.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Bjorn Helgaas <helgaas@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: linux-block@vger.kernel.org
Cc: Sagi Grimberg <sagi@grimberg.me>
Cc: linux-nvme@lists.infradead.org
Cc: linux-pci@vger.kernel.org
Cc: Keith Busch <keith.busch@intel.com>
Cc: Sumit Saxena <sumit.saxena@broadcom.com>
Cc: Kashyap Desai <kashyap.desai@broadcom.com>
Cc: Shivasharan Srikanteshwara <shivasharan.srikanteshwara@broadcom.com>
Link: https://lkml.kernel.org/r/20190216172228.778630549@linutronix.de


---
 drivers/pci/msi.c | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 7149d6315726..73986825d221 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -1035,13 +1035,6 @@ static int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec,
 	if (maxvec < minvec)
 		return -ERANGE;
 
-	/*
-	 * If the caller is passing in sets, we can't support a range of
-	 * vectors. The caller needs to handle that.
-	 */
-	if (affd && affd->nr_sets && minvec != maxvec)
-		return -EINVAL;
-
 	if (WARN_ON_ONCE(dev->msi_enabled))
 		return -EINVAL;
 
@@ -1093,13 +1086,6 @@ static int __pci_enable_msix_range(struct pci_dev *dev,
 	if (maxvec < minvec)
 		return -ERANGE;
 
-	/*
-	 * If the caller is passing in sets, we can't support a range of
-	 * supported vectors. The caller needs to handle that.
-	 */
-	if (affd && affd->nr_sets && minvec != maxvec)
-		return -EINVAL;
-
 	if (WARN_ON_ONCE(dev->msix_enabled))
 		return -EINVAL;
 

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [patch v6 3/7] genirq/affinity: Add new callback for (re)calculating interrupt sets
  2019-02-16 17:13 ` [patch v6 3/7] genirq/affinity: Add new callback for (re)calculating interrupt sets Thomas Gleixner
  2019-02-18 10:27   ` [tip:irq/core] " tip-bot for Ming Lei
@ 2021-06-15 19:57   ` Bjorn Helgaas
  2021-06-15 20:04     ` Christoph Hellwig
                       ` (2 more replies)
  1 sibling, 3 replies; 27+ messages in thread
From: Bjorn Helgaas @ 2021-06-15 19:57 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: LKML, Ming Lei, Christoph Hellwig, Jens Axboe, linux-block,
	Sagi Grimberg, linux-nvme, linux-pci, Keith Busch, Marc Zyngier,
	Sumit Saxena, Kashyap Desai, Shivasharan Srikanteshwara

On Sat, Feb 16, 2019 at 06:13:09PM +0100, Thomas Gleixner wrote:
> From: Ming Lei <ming.lei@redhat.com>
> 
> The interrupt affinity spreading mechanism supports to spread out
> affinities for one or more interrupt sets. A interrupt set contains one or
> more interrupts. Each set is mapped to a specific functionality of a
> device, e.g. general I/O queues and read I/O queus of multiqueue block
> devices.
> 
> The number of interrupts per set is defined by the driver. It depends on
> the total number of available interrupts for the device, which is
> determined by the PCI capabilites and the availability of underlying CPU
> resources, and the number of queues which the device provides and the
> driver wants to instantiate.
> 
> The driver passes initial configuration for the interrupt allocation via a
> pointer to struct irq_affinity.
> 
> Right now the allocation mechanism is complex as it requires to have a loop
> in the driver to determine the maximum number of interrupts which are
> provided by the PCI capabilities and the underlying CPU resources.  This
> loop would have to be replicated in every driver which wants to utilize
> this mechanism. That's unwanted code duplication and error prone.
> 
> In order to move this into generic facilities it is required to have a
> mechanism, which allows the recalculation of the interrupt sets and their
> size, in the core code. As the core code does not have any knowledge about the
> underlying device, a driver specific callback is required in struct
> irq_affinity, which can be invoked by the core code. The callback gets the
> number of available interupts as an argument, so the driver can calculate the
> corresponding number and size of interrupt sets.
> 
> At the moment the struct irq_affinity pointer which is handed in from the
> driver and passed through to several core functions is marked 'const', but for
> the callback to be able to modify the data in the struct it's required to
> remove the 'const' qualifier.
> 
> Add the optional callback to struct irq_affinity, which allows drivers to
> recalculate the number and size of interrupt sets and remove the 'const'
> qualifier.
> 
> For simple invocations, which do not supply a callback, a default callback
> is installed, which just sets nr_sets to 1 and transfers the number of
> spreadable vectors to the set_size array at index 0.
> 
> This is for now guarded by a check for nr_sets != 0 to keep the NVME driver
> working until it is converted to the callback mechanism.
> 
> To make sure that the driver configuration is correct under all circumstances
> the callback is invoked even when there are no interrupts for queues left,
> i.e. the pre/post requirements already exhaust the numner of available
> interrupts.
> 
> At the PCI layer irq_create_affinity_masks() has to be invoked even for the
> case where the legacy interrupt is used. That ensures that the callback is
> invoked and the device driver can adjust to that situation.
> 
> [ tglx: Fixed the simple case (no sets required). Moved the sanity check
>   	for nr_sets after the invocation of the callback so it catches
>   	broken drivers. Fixed the kernel doc comments for struct
>   	irq_affinity and de-'This patch'-ed the changelog ]
> 
> Signed-off-by: Ming Lei <ming.lei@redhat.com>
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

> @@ -1196,6 +1196,13 @@ int pci_alloc_irq_vectors_affinity(struc
>  	/* use legacy irq if allowed */
>  	if (flags & PCI_IRQ_LEGACY) {
>  		if (min_vecs == 1 && dev->irq) {
> +			/*
> +			 * Invoke the affinity spreading logic to ensure that
> +			 * the device driver can adjust queue configuration
> +			 * for the single interrupt case.
> +			 */
> +			if (affd)
> +				irq_create_affinity_masks(1, affd);

This looks like a leak because irq_create_affinity_masks() returns a
pointer to kcalloc()ed space, but we throw away the pointer.

Or is there something very subtle going on here, like this special
case doesn't allocate anything?  I do see the "Nothing to assign?"
case that returns NULL with no alloc, but it's not completely trivial
to verify that we take that case here.

>  			pci_intx(dev, 1);
>  			return 1;
>  		}

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [patch v6 3/7] genirq/affinity: Add new callback for (re)calculating interrupt sets
  2021-06-15 19:57   ` [patch v6 3/7] " Bjorn Helgaas
@ 2021-06-15 20:04     ` Christoph Hellwig
  2021-06-16  0:40     ` Ming Lei
  2021-06-18 19:19     ` Thomas Gleixner
  2 siblings, 0 replies; 27+ messages in thread
From: Christoph Hellwig @ 2021-06-15 20:04 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: Thomas Gleixner, LKML, Ming Lei, Christoph Hellwig, Jens Axboe,
	linux-block, Sagi Grimberg, linux-nvme, linux-pci, Keith Busch,
	Marc Zyngier, Sumit Saxena, Kashyap Desai,
	Shivasharan Srikanteshwara

On Tue, Jun 15, 2021 at 02:57:07PM -0500, Bjorn Helgaas wrote:
> On Sat, Feb 16, 2019 at 06:13:09PM +0100, Thomas Gleixner wrote:
> > From: Ming Lei <ming.lei@redhat.com>
> > 
> > The interrupt affinity spreading mechanism supports to spread out
> > affinities for one or more interrupt sets. A interrupt set contains one or
> > more interrupts. Each set is mapped to a specific functionality of a
> > device, e.g. general I/O queues and read I/O queus of multiqueue block
> > devices.
> > 
> > The number of interrupts per set is defined by the driver. It depends on
> > the total number of available interrupts for the device, which is
> > determined by the PCI capabilites and the availability of underlying CPU
> > resources, and the number of queues which the device provides and the
> > driver wants to instantiate.
> > 
> > The driver passes initial configuration for the interrupt allocation via a
> > pointer to struct irq_affinity.
> > 
> > Right now the allocation mechanism is complex as it requires to have a loop
> > in the driver to determine the maximum number of interrupts which are
> > provided by the PCI capabilities and the underlying CPU resources.  This
> > loop would have to be replicated in every driver which wants to utilize
> > this mechanism. That's unwanted code duplication and error prone.
> > 
> > In order to move this into generic facilities it is required to have a
> > mechanism, which allows the recalculation of the interrupt sets and their
> > size, in the core code. As the core code does not have any knowledge about the
> > underlying device, a driver specific callback is required in struct
> > irq_affinity, which can be invoked by the core code. The callback gets the
> > number of available interupts as an argument, so the driver can calculate the
> > corresponding number and size of interrupt sets.
> > 
> > At the moment the struct irq_affinity pointer which is handed in from the
> > driver and passed through to several core functions is marked 'const', but for
> > the callback to be able to modify the data in the struct it's required to
> > remove the 'const' qualifier.
> > 
> > Add the optional callback to struct irq_affinity, which allows drivers to
> > recalculate the number and size of interrupt sets and remove the 'const'
> > qualifier.
> > 
> > For simple invocations, which do not supply a callback, a default callback
> > is installed, which just sets nr_sets to 1 and transfers the number of
> > spreadable vectors to the set_size array at index 0.
> > 
> > This is for now guarded by a check for nr_sets != 0 to keep the NVME driver
> > working until it is converted to the callback mechanism.
> > 
> > To make sure that the driver configuration is correct under all circumstances
> > the callback is invoked even when there are no interrupts for queues left,
> > i.e. the pre/post requirements already exhaust the numner of available
> > interrupts.
> > 
> > At the PCI layer irq_create_affinity_masks() has to be invoked even for the
> > case where the legacy interrupt is used. That ensures that the callback is
> > invoked and the device driver can adjust to that situation.
> > 
> > [ tglx: Fixed the simple case (no sets required). Moved the sanity check
> >   	for nr_sets after the invocation of the callback so it catches
> >   	broken drivers. Fixed the kernel doc comments for struct
> >   	irq_affinity and de-'This patch'-ed the changelog ]
> > 
> > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> > Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> 
> > @@ -1196,6 +1196,13 @@ int pci_alloc_irq_vectors_affinity(struc
> >  	/* use legacy irq if allowed */
> >  	if (flags & PCI_IRQ_LEGACY) {
> >  		if (min_vecs == 1 && dev->irq) {
> > +			/*
> > +			 * Invoke the affinity spreading logic to ensure that
> > +			 * the device driver can adjust queue configuration
> > +			 * for the single interrupt case.
> > +			 */
> > +			if (affd)
> > +				irq_create_affinity_masks(1, affd);
> 
> This looks like a leak because irq_create_affinity_masks() returns a
> pointer to kcalloc()ed space, but we throw away the pointer.
> 
> Or is there something very subtle going on here, like this special
> case doesn't allocate anything?  I do see the "Nothing to assign?"
> case that returns NULL with no alloc, but it's not completely trivial
> to verify that we take that case here.
> 
> >  			pci_intx(dev, 1);
> >  			return 1;
> >  		}
---end quoted text---

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [patch v6 3/7] genirq/affinity: Add new callback for (re)calculating interrupt sets
  2021-06-15 19:57   ` [patch v6 3/7] " Bjorn Helgaas
  2021-06-15 20:04     ` Christoph Hellwig
@ 2021-06-16  0:40     ` Ming Lei
  2021-06-18 19:32       ` Thomas Gleixner
  2021-06-18 19:19     ` Thomas Gleixner
  2 siblings, 1 reply; 27+ messages in thread
From: Ming Lei @ 2021-06-16  0:40 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: Thomas Gleixner, LKML, Christoph Hellwig, Jens Axboe,
	linux-block, Sagi Grimberg, linux-nvme, linux-pci, Keith Busch,
	Marc Zyngier, Sumit Saxena, Kashyap Desai,
	Shivasharan Srikanteshwara

On Tue, Jun 15, 2021 at 02:57:07PM -0500, Bjorn Helgaas wrote:
> On Sat, Feb 16, 2019 at 06:13:09PM +0100, Thomas Gleixner wrote:
> > From: Ming Lei <ming.lei@redhat.com>
> > 
> > The interrupt affinity spreading mechanism supports to spread out
> > affinities for one or more interrupt sets. A interrupt set contains one or
> > more interrupts. Each set is mapped to a specific functionality of a
> > device, e.g. general I/O queues and read I/O queus of multiqueue block
> > devices.
> > 
> > The number of interrupts per set is defined by the driver. It depends on
> > the total number of available interrupts for the device, which is
> > determined by the PCI capabilites and the availability of underlying CPU
> > resources, and the number of queues which the device provides and the
> > driver wants to instantiate.
> > 
> > The driver passes initial configuration for the interrupt allocation via a
> > pointer to struct irq_affinity.
> > 
> > Right now the allocation mechanism is complex as it requires to have a loop
> > in the driver to determine the maximum number of interrupts which are
> > provided by the PCI capabilities and the underlying CPU resources.  This
> > loop would have to be replicated in every driver which wants to utilize
> > this mechanism. That's unwanted code duplication and error prone.
> > 
> > In order to move this into generic facilities it is required to have a
> > mechanism, which allows the recalculation of the interrupt sets and their
> > size, in the core code. As the core code does not have any knowledge about the
> > underlying device, a driver specific callback is required in struct
> > irq_affinity, which can be invoked by the core code. The callback gets the
> > number of available interupts as an argument, so the driver can calculate the
> > corresponding number and size of interrupt sets.
> > 
> > At the moment the struct irq_affinity pointer which is handed in from the
> > driver and passed through to several core functions is marked 'const', but for
> > the callback to be able to modify the data in the struct it's required to
> > remove the 'const' qualifier.
> > 
> > Add the optional callback to struct irq_affinity, which allows drivers to
> > recalculate the number and size of interrupt sets and remove the 'const'
> > qualifier.
> > 
> > For simple invocations, which do not supply a callback, a default callback
> > is installed, which just sets nr_sets to 1 and transfers the number of
> > spreadable vectors to the set_size array at index 0.
> > 
> > This is for now guarded by a check for nr_sets != 0 to keep the NVME driver
> > working until it is converted to the callback mechanism.
> > 
> > To make sure that the driver configuration is correct under all circumstances
> > the callback is invoked even when there are no interrupts for queues left,
> > i.e. the pre/post requirements already exhaust the numner of available
> > interrupts.
> > 
> > At the PCI layer irq_create_affinity_masks() has to be invoked even for the
> > case where the legacy interrupt is used. That ensures that the callback is
> > invoked and the device driver can adjust to that situation.
> > 
> > [ tglx: Fixed the simple case (no sets required). Moved the sanity check
> >   	for nr_sets after the invocation of the callback so it catches
> >   	broken drivers. Fixed the kernel doc comments for struct
> >   	irq_affinity and de-'This patch'-ed the changelog ]
> > 
> > Signed-off-by: Ming Lei <ming.lei@redhat.com>
> > Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> 
> > @@ -1196,6 +1196,13 @@ int pci_alloc_irq_vectors_affinity(struc
> >  	/* use legacy irq if allowed */
> >  	if (flags & PCI_IRQ_LEGACY) {
> >  		if (min_vecs == 1 && dev->irq) {
> > +			/*
> > +			 * Invoke the affinity spreading logic to ensure that
> > +			 * the device driver can adjust queue configuration
> > +			 * for the single interrupt case.
> > +			 */
> > +			if (affd)
> > +				irq_create_affinity_masks(1, affd);
> 
> This looks like a leak because irq_create_affinity_masks() returns a
> pointer to kcalloc()ed space, but we throw away the pointer.
> 
> Or is there something very subtle going on here, like this special
> case doesn't allocate anything?  I do see the "Nothing to assign?"
> case that returns NULL with no alloc, but it's not completely trivial
> to verify that we take that case here.

The purpose is to provide chance to call ->calc_sets() for single
interrupt, maybe it can be improved by the following change:

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 217dc9f0231f..025c647279f5 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -1223,8 +1223,7 @@ int pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs,
 			 * the device driver can adjust queue configuration
 			 * for the single interrupt case.
 			 */
-			if (affd)
-				irq_create_affinity_masks(1, affd);
+			irq_affinity_calc_sets_legacy(affd);
 			pci_intx(dev, 1);
 			return 1;
 		}
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 4777850a6dc7..f21f93ce460b 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -368,6 +368,7 @@ irq_create_affinity_masks(unsigned int nvec, struct irq_affinity *affd);
 
 unsigned int irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec,
 				       const struct irq_affinity *affd);
+void irq_affinity_calc_sets_legacy(struct irq_affinity *affd);
 
 #else /* CONFIG_SMP */
 
@@ -419,6 +420,10 @@ irq_calc_affinity_vectors(unsigned int minvec, unsigned int maxvec,
 	return maxvec;
 }
 
+static inline void irq_affinity_calc_sets_legacy(struct irq_affinity *affd)
+{
+}
+
 #endif /* CONFIG_SMP */
 
 /*
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 4d89ad4fae3b..d01f7dfa5712 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -405,6 +405,30 @@ static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs)
 	affd->set_size[0] = affvecs;
 }
 
+static void irq_affinity_calc_sets(unsigned int affvecs,
+		struct irq_affinity *affd)
+{
+	/*
+	 * Simple invocations do not provide a calc_sets() callback. Install
+	 * the generic one.
+	 */
+	if (!affd->calc_sets)
+		affd->calc_sets = default_calc_sets;
+
+	/* Recalculate the sets */
+	affd->calc_sets(affd, affvecs);
+
+	WARN_ON_ONCE(affd->nr_sets > IRQ_AFFINITY_MAX_SETS);
+}
+
+/* Provide a chance to call ->calc_sets for legacy */
+void irq_affinity_calc_sets_legacy(struct irq_affinity *affd)
+{
+	if (!affd)
+		return;
+	irq_affinity_calc_sets(0, affd);
+}
+
 /**
  * irq_create_affinity_masks - Create affinity masks for multiqueue spreading
  * @nvecs:	The total number of vectors
@@ -429,17 +453,8 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
 	else
 		affvecs = 0;
 
-	/*
-	 * Simple invocations do not provide a calc_sets() callback. Install
-	 * the generic one.
-	 */
-	if (!affd->calc_sets)
-		affd->calc_sets = default_calc_sets;
-
-	/* Recalculate the sets */
-	affd->calc_sets(affd, affvecs);
-
-	if (WARN_ON_ONCE(affd->nr_sets > IRQ_AFFINITY_MAX_SETS))
+	irq_affinity_calc_sets(affvecs, affd);
+	if (affd->nr_sets > IRQ_AFFINITY_MAX_SETS)
 		return NULL;
 
 	/* Nothing to assign? */


Thanks, 
Ming


^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [patch v6 3/7] genirq/affinity: Add new callback for (re)calculating interrupt sets
  2021-06-15 19:57   ` [patch v6 3/7] " Bjorn Helgaas
  2021-06-15 20:04     ` Christoph Hellwig
  2021-06-16  0:40     ` Ming Lei
@ 2021-06-18 19:19     ` Thomas Gleixner
  2 siblings, 0 replies; 27+ messages in thread
From: Thomas Gleixner @ 2021-06-18 19:19 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: LKML, Ming Lei, Christoph Hellwig, Jens Axboe, linux-block,
	Sagi Grimberg, linux-nvme, linux-pci, Keith Busch, Marc Zyngier,
	Sumit Saxena, Kashyap Desai, Shivasharan Srikanteshwara

On Tue, Jun 15 2021 at 14:57, Bjorn Helgaas wrote:
>
>> @@ -1196,6 +1196,13 @@ int pci_alloc_irq_vectors_affinity(struc
>>  	/* use legacy irq if allowed */
>>  	if (flags & PCI_IRQ_LEGACY) {
>>  		if (min_vecs == 1 && dev->irq) {
>> +			/*
>> +			 * Invoke the affinity spreading logic to ensure that
>> +			 * the device driver can adjust queue configuration
>> +			 * for the single interrupt case.
>> +			 */
>> +			if (affd)
>> +				irq_create_affinity_masks(1, affd);
>
> This looks like a leak because irq_create_affinity_masks() returns a
> pointer to kcalloc()ed space, but we throw away the pointer.
>
> Or is there something very subtle going on here, like this special
> case doesn't allocate anything?  I do see the "Nothing to assign?"
> case that returns NULL with no alloc, but it's not completely trivial
> to verify that we take that case here.

Yes, it's subtle and it's subtle crap. Sorry that I did not catch that.

Thanks,

        tglx

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [patch v6 3/7] genirq/affinity: Add new callback for (re)calculating interrupt sets
  2021-06-16  0:40     ` Ming Lei
@ 2021-06-18 19:32       ` Thomas Gleixner
  0 siblings, 0 replies; 27+ messages in thread
From: Thomas Gleixner @ 2021-06-18 19:32 UTC (permalink / raw)
  To: Ming Lei, Bjorn Helgaas
  Cc: LKML, Christoph Hellwig, Jens Axboe, linux-block, Sagi Grimberg,
	linux-nvme, linux-pci, Keith Busch, Marc Zyngier, Sumit Saxena,
	Kashyap Desai, Shivasharan Srikanteshwara

On Wed, Jun 16 2021 at 08:40, Ming Lei wrote:
> On Tue, Jun 15, 2021 at 02:57:07PM -0500, Bjorn Helgaas wrote:

> +static inline void irq_affinity_calc_sets_legacy(struct irq_affinity *affd)

This function name sucks because the function is really a wrapper around
irq_affinity_calc_sets(). What's so legacy about this? The fact that
it's called from the legacy PCI single interrupt code path?

> @@ -405,6 +405,30 @@ static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs)
>  	affd->set_size[0] = affvecs;
>  }
>  
> +static void irq_affinity_calc_sets(unsigned int affvecs,
> +		struct irq_affinity *affd)

Please align the arguments when you need a line break.

> +{
> +	/*
> +	 * Simple invocations do not provide a calc_sets() callback. Install
> +	 * the generic one.
> +	 */
> +	if (!affd->calc_sets)
> +		affd->calc_sets = default_calc_sets;
> +
> +	/* Recalculate the sets */
> +	affd->calc_sets(affd, affvecs);
> +
> +	WARN_ON_ONCE(affd->nr_sets > IRQ_AFFINITY_MAX_SETS);

Hrm. That function really should return an error code to tell the caller
that something went wrong.

> +}
> +
> +/* Provide a chance to call ->calc_sets for legacy */

What does this comment tell? Close to zero. 

> +void irq_affinity_calc_sets_legacy(struct irq_affinity *affd)
> +{
> +	if (!affd)
> +		return;
> +	irq_affinity_calc_sets(0, affd);
> +}

What's wrong with just exposing irq_affinity_calc_sets() have that
NULL pointer check in the function and add proper function documentation
which explains what this is about?

Thanks,

        tglx

^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2021-06-18 19:32 UTC | newest]

Thread overview: 27+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-02-16 17:13 [patch v6 0/7] genirq/affinity: Overhaul the multiple interrupt sets support Thomas Gleixner
2019-02-16 17:13 ` [patch v6 1/7] genirq/affinity: Code consolidation Thomas Gleixner
2019-02-17 13:36   ` Ming Lei
2019-02-18 10:25   ` [tip:irq/core] " tip-bot for Thomas Gleixner
2019-02-16 17:13 ` [patch v6 2/7] genirq/affinity: Store interrupt sets size in struct irq_affinity Thomas Gleixner
2019-02-18 10:26   ` [tip:irq/core] " tip-bot for Ming Lei
2019-02-16 17:13 ` [patch v6 3/7] genirq/affinity: Add new callback for (re)calculating interrupt sets Thomas Gleixner
2019-02-18 10:27   ` [tip:irq/core] " tip-bot for Ming Lei
2021-06-15 19:57   ` [patch v6 3/7] " Bjorn Helgaas
2021-06-15 20:04     ` Christoph Hellwig
2021-06-16  0:40     ` Ming Lei
2021-06-18 19:32       ` Thomas Gleixner
2021-06-18 19:19     ` Thomas Gleixner
2019-02-16 17:13 ` [patch v6 4/7] nvme-pci: Simplify interrupt allocation Thomas Gleixner
2019-02-18 10:27   ` [tip:irq/core] " tip-bot for Ming Lei
2019-02-16 17:13 ` [patch v6 5/7] genirq/affinity: Remove the leftovers of the original set support Thomas Gleixner
2019-02-17 13:39   ` Ming Lei
2019-02-18 10:28   ` [tip:irq/core] " tip-bot for Thomas Gleixner
2019-02-16 17:13 ` [patch v6 6/7] PCI/MSI: Remove obsolete sanity checks for multiple interrupt sets Thomas Gleixner
2019-02-17 13:39   ` Ming Lei
2019-02-18 10:29   ` [tip:irq/core] " tip-bot for Thomas Gleixner
2019-02-16 17:13 ` [patch v6 7/7] genirq/affinity: Add support for non-managed affinity sets Thomas Gleixner
2019-02-17 13:45   ` Ming Lei
2019-02-17 19:17     ` Thomas Gleixner
2019-02-18  2:49       ` Ming Lei
2019-02-18  7:25         ` Thomas Gleixner
2019-02-18  8:43 ` [patch v6 0/7] genirq/affinity: Overhaul the multiple interrupt sets support Marc Zyngier

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).