* [PATCH 0/3] irq/core: Fix and expand the irq affinity descriptor
@ 2018-12-04 15:51 Dou Liyang
  2018-12-04 15:51 ` [PATCH 1/3] genirq/core: Add a new interrupt " Dou Liyang
                   ` (3 more replies)
  0 siblings, 4 replies; 12+ messages in thread
From: Dou Liyang @ 2018-12-04 15:51 UTC (permalink / raw)
  To: linux-kernel, linux-pci
  Cc: tglx, kashyap.desai, shivasharan.srikanteshwara, sumit.saxena,
	ming.lei, hch, bhelgaas, douliyang1, Dou Liyang

Now, spreading the interrupt affinity info by a bare cpumask pointer is
not enough: it runs into a problem[1] and is hard to extend in the future.

Fix it by:

     +-----------------------------------+
     |                                   |
     |     struct cpumask *affinity      |
     |                                   |
     +-----------------------------------+
                       |
    +------------------v-------------------+
    |                                      |
    | struct irq_affinity_desc {           |
    |     struct cpumask   mask;           |
    |     unsigned int     is_managed : 1; |
    | };                                   |
    |                                      |
    +--------------------------------------+

[1]:https://marc.info/?l=linux-kernel&m=153543887027997&w=2
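
For illustration (taken from the patches below), a call site that
previously operated on the bare cpumask array:

	cpumask_copy(masks + curvec, irq_default_affinity);

now reaches through the descriptor instead:

	cpumask_copy(&masks[curvec].mask, irq_default_affinity);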

Dou Liyang (3):
  genirq/affinity: Add a new interrupt affinity descriptor
  irq/affinity: Add is_managed into struct irq_affinity_desc
  irq/affinity: Fix a possible breakage

 drivers/pci/msi.c         |  9 ++++-----
 include/linux/interrupt.h | 15 +++++++++++++--
 include/linux/irq.h       |  6 ++++--
 include/linux/irqdomain.h |  6 ++++--
 include/linux/msi.h       |  4 ++--
 kernel/irq/affinity.c     | 38 +++++++++++++++++++++++++-------------
 kernel/irq/devres.c       |  4 ++--
 kernel/irq/irqdesc.c      | 25 +++++++++++++++++--------
 kernel/irq/irqdomain.c    |  4 ++--
 kernel/irq/msi.c          |  7 ++++---
 10 files changed, 77 insertions(+), 41 deletions(-)

-- 
2.17.2



* [PATCH 1/3] genirq/core: Add a new interrupt affinity descriptor
  2018-12-04 15:51 [PATCH 0/3] irq/core: Fix and expand the irq affinity descriptor Dou Liyang
@ 2018-12-04 15:51 ` Dou Liyang
  2018-12-19 10:37   ` [tip:irq/core] genirq/core: Introduce struct irq_affinity_desc tip-bot for Dou Liyang
  2018-12-04 15:51 ` [PATCH 2/3] irq/affinity: Add is_managed into " Dou Liyang
                   ` (2 subsequent siblings)
  3 siblings, 1 reply; 12+ messages in thread
From: Dou Liyang @ 2018-12-04 15:51 UTC (permalink / raw)
  To: linux-kernel, linux-pci
  Cc: tglx, kashyap.desai, shivasharan.srikanteshwara, sumit.saxena,
	ming.lei, hch, bhelgaas, douliyang1, Dou Liyang

Now, Linux spreads the interrupt affinity info by a bare cpumask pointer
and marks an interrupt as managed if its cpumask is not NULL.

If some other info should be passed along as well, this design is hard
to extend: adding new arguments is the most straightforward method, but
it would break many functions.

So, add a new interrupt affinity descriptor and replace the cpumask
pointer with a pointer to it. This allows expanding the conveyed
information in the future without touching all these functions ever
again; only the irq_affinity_desc structure needs to be modified.

No functional change, just prepares for support of spreading managed
flags.
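
As a sketch of where this is headed (patch 2 of this series does exactly
this), a new piece of information only touches the structure and the
code that reads it, never the function signatures again:

	struct irq_affinity_desc {
		struct cpumask	mask;
		unsigned int	is_managed : 1;	/* added by patch 2 */
	};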

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Suggested-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Dou Liyang <douliyangs@gmail.com>
---
 drivers/pci/msi.c         |  9 ++++-----
 include/linux/interrupt.h | 14 ++++++++++++--
 include/linux/irq.h       |  6 ++++--
 include/linux/irqdomain.h |  6 ++++--
 include/linux/msi.h       |  4 ++--
 kernel/irq/affinity.c     | 22 ++++++++++++----------
 kernel/irq/devres.c       |  4 ++--
 kernel/irq/irqdesc.c      | 16 ++++++++++------
 kernel/irq/irqdomain.c    |  4 ++--
 kernel/irq/msi.c          |  7 ++++---
 10 files changed, 56 insertions(+), 36 deletions(-)

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 265ed3e4c920..7a1c8a09efa5 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -534,14 +534,13 @@ static int populate_msi_sysfs(struct pci_dev *pdev)
 static struct msi_desc *
 msi_setup_entry(struct pci_dev *dev, int nvec, const struct irq_affinity *affd)
 {
-	struct cpumask *masks = NULL;
+	struct irq_affinity_desc *masks = NULL;
 	struct msi_desc *entry;
 	u16 control;
 
 	if (affd)
 		masks = irq_create_affinity_masks(nvec, affd);
 
-
 	/* MSI Entry Initialization */
 	entry = alloc_msi_entry(&dev->dev, nvec, masks);
 	if (!entry)
@@ -672,7 +671,7 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
 			      struct msix_entry *entries, int nvec,
 			      const struct irq_affinity *affd)
 {
-	struct cpumask *curmsk, *masks = NULL;
+	struct irq_affinity_desc *curmsk, *masks = NULL;
 	struct msi_desc *entry;
 	int ret, i;
 
@@ -1264,7 +1263,7 @@ const struct cpumask *pci_irq_get_affinity(struct pci_dev *dev, int nr)
 
 		for_each_pci_msi_entry(entry, dev) {
 			if (i == nr)
-				return entry->affinity;
+				return &entry->affinity->mask;
 			i++;
 		}
 		WARN_ON_ONCE(1);
@@ -1276,7 +1275,7 @@ const struct cpumask *pci_irq_get_affinity(struct pci_dev *dev, int nr)
 				 nr >= entry->nvec_used))
 			return NULL;
 
-		return &entry->affinity[nr];
+		return &entry->affinity[nr].mask;
 	} else {
 		return cpu_possible_mask;
 	}
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index ca397ff40836..71be303231e9 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -257,6 +257,14 @@ struct irq_affinity {
 	int	*sets;
 };
 
+/**
+ * struct irq_affinity_desc - Interrupt affinity descriptor
+ * @mask:	cpumask to hold the affinity assignment
+ */
+struct irq_affinity_desc {
+	struct cpumask	mask;
+};
+
 #if defined(CONFIG_SMP)
 
 extern cpumask_var_t irq_default_affinity;
@@ -303,7 +311,9 @@ extern int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m);
 extern int
 irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify);
 
-struct cpumask *irq_create_affinity_masks(int nvec, const struct irq_affinity *affd);
+struct irq_affinity_desc *
+irq_create_affinity_masks(int nvec, const struct irq_affinity *affd);
+
 int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity *affd);
 
 #else /* CONFIG_SMP */
@@ -337,7 +347,7 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
 	return 0;
 }
 
-static inline struct cpumask *
+static inline struct irq_affinity_desc *
 irq_create_affinity_masks(int nvec, const struct irq_affinity *affd)
 {
 	return NULL;
diff --git a/include/linux/irq.h b/include/linux/irq.h
index c9bffda04a45..def2b2aac8b1 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -27,6 +27,7 @@
 struct seq_file;
 struct module;
 struct msi_msg;
+struct irq_affinity_desc;
 enum irqchip_irq_state;
 
 /*
@@ -834,11 +835,12 @@ struct cpumask *irq_data_get_effective_affinity_mask(struct irq_data *d)
 unsigned int arch_dynirq_lower_bound(unsigned int from);
 
 int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
-		      struct module *owner, const struct cpumask *affinity);
+		      struct module *owner,
+		      const struct irq_affinity_desc *affinity);
 
 int __devm_irq_alloc_descs(struct device *dev, int irq, unsigned int from,
 			   unsigned int cnt, int node, struct module *owner,
-			   const struct cpumask *affinity);
+			   const struct irq_affinity_desc *affinity);
 
 /* use macros to avoid needing export.h for THIS_MODULE */
 #define irq_alloc_descs(irq, from, cnt, node)	\
diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 068aa46f0d55..35965f41d7be 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -43,6 +43,7 @@ struct irq_chip;
 struct irq_data;
 struct cpumask;
 struct seq_file;
+struct irq_affinity_desc;
 
 /* Number of irqs reserved for a legacy isa controller */
 #define NUM_ISA_INTERRUPTS	16
@@ -266,7 +267,7 @@ extern bool irq_domain_check_msi_remap(void);
 extern void irq_set_default_host(struct irq_domain *host);
 extern int irq_domain_alloc_descs(int virq, unsigned int nr_irqs,
 				  irq_hw_number_t hwirq, int node,
-				  const struct cpumask *affinity);
+				  const struct irq_affinity_desc *affinity);
 
 static inline struct fwnode_handle *of_node_to_fwnode(struct device_node *node)
 {
@@ -449,7 +450,8 @@ static inline struct irq_domain *irq_domain_add_hierarchy(struct irq_domain *par
 
 extern int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
 				   unsigned int nr_irqs, int node, void *arg,
-				   bool realloc, const struct cpumask *affinity);
+				   bool realloc,
+				   const struct irq_affinity_desc *affinity);
 extern void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs);
 extern int irq_domain_activate_irq(struct irq_data *irq_data, bool early);
 extern void irq_domain_deactivate_irq(struct irq_data *irq_data);
diff --git a/include/linux/msi.h b/include/linux/msi.h
index 0e9c50052ff3..7ba4c230181c 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -76,7 +76,7 @@ struct msi_desc {
 	unsigned int			nvec_used;
 	struct device			*dev;
 	struct msi_msg			msg;
-	struct cpumask			*affinity;
+	struct irq_affinity_desc	*affinity;
 
 	union {
 		/* PCI MSI/X specific data */
@@ -136,7 +136,7 @@ static inline void pci_write_msi_msg(unsigned int irq, struct msi_msg *msg)
 #endif /* CONFIG_PCI_MSI */
 
 struct msi_desc *alloc_msi_entry(struct device *dev, int nvec,
-				 const struct cpumask *affinity);
+				 const struct irq_affinity_desc *affinity);
 void free_msi_entry(struct msi_desc *entry);
 void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
 void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 08c904eb7279..1562a36e7c0f 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -99,7 +99,7 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd,
 				    cpumask_var_t *node_to_cpumask,
 				    const struct cpumask *cpu_mask,
 				    struct cpumask *nmsk,
-				    struct cpumask *masks)
+				    struct irq_affinity_desc *masks)
 {
 	int n, nodes, cpus_per_vec, extra_vecs, done = 0;
 	int last_affv = firstvec + numvecs;
@@ -117,7 +117,9 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd,
 	 */
 	if (numvecs <= nodes) {
 		for_each_node_mask(n, nodemsk) {
-			cpumask_or(masks + curvec, masks + curvec, node_to_cpumask[n]);
+			cpumask_or(&masks[curvec].mask,
+					&masks[curvec].mask,
+					node_to_cpumask[n]);
 			if (++curvec == last_affv)
 				curvec = firstvec;
 		}
@@ -150,7 +152,8 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd,
 				cpus_per_vec++;
 				--extra_vecs;
 			}
-			irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec);
+			irq_spread_init_one(&masks[curvec].mask, nmsk,
+						cpus_per_vec);
 		}
 
 		done += v;
@@ -173,7 +176,7 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd,
 static int irq_build_affinity_masks(const struct irq_affinity *affd,
 				    int startvec, int numvecs, int firstvec,
 				    cpumask_var_t *node_to_cpumask,
-				    struct cpumask *masks)
+				    struct irq_affinity_desc *masks)
 {
 	int curvec = startvec, nr_present, nr_others;
 	int ret = -ENOMEM;
@@ -226,15 +229,15 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd,
  * @nvecs:	The total number of vectors
  * @affd:	Description of the affinity requirements
  *
- * Returns the masks pointer or NULL if allocation failed.
+ * Returns the irq_affinity_desc pointer or NULL if allocation failed.
  */
-struct cpumask *
+struct irq_affinity_desc *
 irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 {
 	int affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
 	int curvec, usedvecs;
 	cpumask_var_t *node_to_cpumask;
-	struct cpumask *masks = NULL;
+	struct irq_affinity_desc *masks = NULL;
 	int i, nr_sets;
 
 	/*
@@ -254,8 +257,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 
 	/* Fill out vectors at the beginning that don't need affinity */
 	for (curvec = 0; curvec < affd->pre_vectors; curvec++)
-		cpumask_copy(masks + curvec, irq_default_affinity);
-
+		cpumask_copy(&masks[curvec].mask, irq_default_affinity);
 	/*
 	 * Spread on present CPUs starting from affd->pre_vectors. If we
 	 * have multiple sets, build each sets affinity mask separately.
@@ -285,7 +287,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 	else
 		curvec = affd->pre_vectors + usedvecs;
 	for (; curvec < nvecs; curvec++)
-		cpumask_copy(masks + curvec, irq_default_affinity);
+		cpumask_copy(&masks[curvec].mask, irq_default_affinity);
 
 outnodemsk:
 	free_node_to_cpumask(node_to_cpumask);
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 6a682c229e10..5d5378ea0afe 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -169,7 +169,7 @@ static void devm_irq_desc_release(struct device *dev, void *res)
  * @cnt:	Number of consecutive irqs to allocate
  * @node:	Preferred node on which the irq descriptor should be allocated
  * @owner:	Owning module (can be NULL)
- * @affinity:	Optional pointer to an affinity mask array of size @cnt
+ * @affinity:	Optional pointer to an irq_affinity_desc array of size @cnt
  *		which hints where the irq descriptors should be allocated
  *		and which default affinities to use
  *
@@ -179,7 +179,7 @@ static void devm_irq_desc_release(struct device *dev, void *res)
  */
 int __devm_irq_alloc_descs(struct device *dev, int irq, unsigned int from,
 			   unsigned int cnt, int node, struct module *owner,
-			   const struct cpumask *affinity)
+			   const struct irq_affinity_desc *affinity)
 {
 	struct irq_desc_devres *dr;
 	int base;
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 578d0e5f1b5b..f87fa2b9935a 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -449,8 +449,10 @@ static void free_desc(unsigned int irq)
 }
 
 static int alloc_descs(unsigned int start, unsigned int cnt, int node,
-		       const struct cpumask *affinity, struct module *owner)
+		       const struct irq_affinity_desc *affinity,
+		       struct module *owner)
 {
+	const struct irq_affinity_desc *cur_affinity = affinity;
 	const struct cpumask *mask = NULL;
 	struct irq_desc *desc;
 	unsigned int flags;
@@ -458,9 +460,11 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node,
 
 	/* Validate affinity mask(s) */
 	if (affinity) {
-		for (i = 0, mask = affinity; i < cnt; i++, mask++) {
+		for (i = 0; i < cnt; i++) {
+			mask = &cur_affinity->mask;
 			if (cpumask_empty(mask))
 				return -EINVAL;
+			cur_affinity++;
 		}
 	}
 
@@ -469,8 +473,8 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node,
 
 	for (i = 0; i < cnt; i++) {
 		if (affinity) {
-			node = cpu_to_node(cpumask_first(affinity));
-			mask = affinity;
+			mask = &affinity->mask;
+			node = cpu_to_node(cpumask_first(mask));
 			affinity++;
 		}
 		desc = alloc_desc(start + i, node, flags, mask, owner);
@@ -575,7 +579,7 @@ static void free_desc(unsigned int irq)
 }
 
 static inline int alloc_descs(unsigned int start, unsigned int cnt, int node,
-			      const struct cpumask *affinity,
+			      const struct irq_affinity_desc *affinity,
 			      struct module *owner)
 {
 	u32 i;
@@ -705,7 +709,7 @@ EXPORT_SYMBOL_GPL(irq_free_descs);
  */
 int __ref
 __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
-		  struct module *owner, const struct cpumask *affinity)
+		  struct module *owner, const struct irq_affinity_desc *affinity)
 {
 	int start, ret;
 
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 3366d11c3e02..8b0be4bd6565 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -969,7 +969,7 @@ const struct irq_domain_ops irq_domain_simple_ops = {
 EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
 
 int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq,
-			   int node, const struct cpumask *affinity)
+			   int node, const struct irq_affinity_desc *affinity)
 {
 	unsigned int hint;
 
@@ -1281,7 +1281,7 @@ int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain,
  */
 int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
 			    unsigned int nr_irqs, int node, void *arg,
-			    bool realloc, const struct cpumask *affinity)
+			    bool realloc, const struct irq_affinity_desc *affinity)
 {
 	int i, ret, virq;
 
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 4ca2fd46645d..36b7f92fcff0 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -23,11 +23,12 @@
  * @nvec:	The number of vectors used in this entry
  * @affinity:	Optional pointer to an affinity mask array size of @nvec
  *
- * If @affinity is not NULL then a an affinity array[@nvec] is allocated
- * and the affinity masks from @affinity are copied.
+ * If @affinity is not NULL then an affinity array[@nvec] is allocated
+ * and the affinity masks and flags from @affinity are copied.
  */
 struct msi_desc *
-alloc_msi_entry(struct device *dev, int nvec, const struct cpumask *affinity)
+alloc_msi_entry(struct device *dev, int nvec,
+		const struct irq_affinity_desc *affinity)
 {
 	struct msi_desc *desc;
 
-- 
2.17.2



* [PATCH 2/3] irq/affinity: Add is_managed into struct irq_affinity_desc
  2018-12-04 15:51 [PATCH 0/3] irq/core: Fix and expand the irq affinity descriptor Dou Liyang
  2018-12-04 15:51 ` [PATCH 1/3] genirq/core: Add a new interrupt " Dou Liyang
@ 2018-12-04 15:51 ` Dou Liyang
  2018-12-18 15:26   ` Thomas Gleixner
  2018-12-19 10:38   ` [tip:irq/core] genirq/affinity: Add is_managed to " tip-bot for Dou Liyang
  2018-12-04 15:51 ` [PATCH 3/3] irq/affinity: Fix a possible breakage Dou Liyang
  2018-12-19 10:53 ` [PATCH 0/3] irq/core: Fix and expand the irq affinity descriptor Thomas Gleixner
  3 siblings, 2 replies; 12+ messages in thread
From: Dou Liyang @ 2018-12-04 15:51 UTC (permalink / raw)
  To: linux-kernel, linux-pci
  Cc: tglx, kashyap.desai, shivasharan.srikanteshwara, sumit.saxena,
	ming.lei, hch, bhelgaas, douliyang1, Dou Liyang

Now, Linux uses the irq_affinity_desc to convey the affinity information.

As Kashyap and Sumit reported, in the MSI(-X) subsystem the pre/post
vectors may be used for some extra reply queues for performance:

  https://marc.info/?l=linux-kernel&m=153543887027997&w=2

Their affinities are not NULL, but they should be mapped as unmanaged
interrupts. So, only transferring the irq affinity assignments is not
enough.

Add a new bit "is_managed" to convey this info in irq_affinity_desc and
use it in alloc_descs().
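
For example, with nvecs = 8, affd->pre_vectors = 2 and
affd->post_vectors = 1, vectors 2-6 get is_managed = 1, while the
pre/post vectors 0, 1 and 7 stay unmanaged and keep a changeable
affinity.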

Reported-by: Kashyap Desai <kashyap.desai@broadcom.com>
Reported-by: Sumit Saxena <sumit.saxena@broadcom.com>
Signed-off-by: Dou Liyang <douliyangs@gmail.com>
---
 include/linux/interrupt.h | 1 +
 kernel/irq/affinity.c     | 7 +++++++
 kernel/irq/irqdesc.c      | 9 +++++++--
 3 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 71be303231e9..a12b3dbbc45e 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -263,6 +263,7 @@ struct irq_affinity {
  */
 struct irq_affinity_desc {
 	struct cpumask	mask;
+	unsigned int	is_managed : 1;
 };
 
 #if defined(CONFIG_SMP)
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 1562a36e7c0f..d122575ba1b4 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -289,6 +289,13 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 	for (; curvec < nvecs; curvec++)
 		cpumask_copy(&masks[curvec].mask, irq_default_affinity);
 
+	/*  Setup complementary information */
+	for (i = 0; i < nvecs; i++) {
+		if (i >= affd->pre_vectors && i < nvecs - affd->post_vectors)
+			masks[i].is_managed = 1;
+		else
+			masks[i].is_managed = 0;
+	}
 outnodemsk:
 	free_node_to_cpumask(node_to_cpumask);
 	return masks;
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index f87fa2b9935a..6b0821c144c0 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -455,7 +455,7 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node,
 	const struct irq_affinity_desc *cur_affinity = affinity;
 	const struct cpumask *mask = NULL;
 	struct irq_desc *desc;
-	unsigned int flags;
+	unsigned int flags = 0;
 	int i;
 
 	/* Validate affinity mask(s) */
@@ -468,11 +468,16 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node,
 		}
 	}
 
-	flags = affinity ? IRQD_AFFINITY_MANAGED | IRQD_MANAGED_SHUTDOWN : 0;
 	mask = NULL;
 
 	for (i = 0; i < cnt; i++) {
 		if (affinity) {
+			if (affinity->is_managed) {
+				flags = IRQD_AFFINITY_MANAGED |
+					IRQD_MANAGED_SHUTDOWN;
+			} else {
+				flags = 0;
+			}
 			mask = &affinity->mask;
 			node = cpu_to_node(cpumask_first(mask));
 			affinity++;
-- 
2.17.2



* [PATCH 3/3] irq/affinity: Fix a possible breakage
  2018-12-04 15:51 [PATCH 0/3] irq/core: Fix and expand the irq affinity descriptor Dou Liyang
  2018-12-04 15:51 ` [PATCH 1/3] genirq/core: Add a new interrupt " Dou Liyang
  2018-12-04 15:51 ` [PATCH 2/3] irq/affinity: Add is_managed into " Dou Liyang
@ 2018-12-04 15:51 ` Dou Liyang
  2018-12-05  8:28   ` Thomas Gleixner
  2018-12-19 10:53 ` [PATCH 0/3] irq/core: Fix and expand the irq affinity descriptor Thomas Gleixner
  3 siblings, 1 reply; 12+ messages in thread
From: Dou Liyang @ 2018-12-04 15:51 UTC (permalink / raw)
  To: linux-kernel, linux-pci
  Cc: tglx, kashyap.desai, shivasharan.srikanteshwara, sumit.saxena,
	ming.lei, hch, bhelgaas, douliyang1, Dou Liyang

In case of irq_default_affinity != cpu_possible_mask, setting the affinity
for the pre/post vectors to irq_default_affinity is a breakage.

Just set the pre/post vectors to cpu_possible_mask and be done with it.
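
For example, with affd->pre_vectors = 2, the masks of vectors 0 and 1
are now filled with cpu_possible_mask instead of irq_default_affinity.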

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Dou Liyang <douliyangs@gmail.com>
---
 kernel/irq/affinity.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index d122575ba1b4..aaa1dd82c3df 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -257,7 +257,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 
 	/* Fill out vectors at the beginning that don't need affinity */
 	for (curvec = 0; curvec < affd->pre_vectors; curvec++)
-		cpumask_copy(&masks[curvec].mask, irq_default_affinity);
+		cpumask_copy(&masks[curvec].mask, cpu_possible_mask);
 	/*
 	 * Spread on present CPUs starting from affd->pre_vectors. If we
 	 * have multiple sets, build each sets affinity mask separately.
@@ -282,12 +282,15 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 	}
 
 	/* Fill out vectors at the end that don't need affinity */
-	if (usedvecs >= affvecs)
+	if (usedvecs >= affvecs) {
 		curvec = affd->pre_vectors + affvecs;
-	else
+	} else {
 		curvec = affd->pre_vectors + usedvecs;
+		for (; curvec < affd->pre_vectors + affvecs; curvec++)
+			cpumask_copy(&masks[curvec].mask, irq_default_affinity);
+	}
 	for (; curvec < nvecs; curvec++)
-		cpumask_copy(&masks[curvec].mask, irq_default_affinity);
+		cpumask_copy(&masks[curvec].mask, cpu_possible_mask);
 
 	/*  Setup complementary information */
 	for (i = 0; i < nvecs; i++) {
-- 
2.17.2



* Re: [PATCH 3/3] irq/affinity: Fix a possible breakage
  2018-12-04 15:51 ` [PATCH 3/3] irq/affinity: Fix a possible breakage Dou Liyang
@ 2018-12-05  8:28   ` Thomas Gleixner
  2018-12-11 16:27     ` Dou Liyang
  0 siblings, 1 reply; 12+ messages in thread
From: Thomas Gleixner @ 2018-12-05  8:28 UTC (permalink / raw)
  To: Dou Liyang
  Cc: linux-kernel, linux-pci, kashyap.desai,
	shivasharan.srikanteshwara, sumit.saxena, ming.lei, hch,
	bhelgaas, douliyang1

On Tue, 4 Dec 2018, Dou Liyang wrote:

> In case of irq_default_affinity != cpu_possible_mask, setting the affinity
> for the pre/post vectors to irq_default_affinity is a breakage.

Why so? All interrupts which are not managed get the default affinity
mask. It can be different than cpu_possible_mask, but that's what the admin
has set. The affinity of these non-managed interrupts can still be set via
/proc/... so where is the breakage?

Thanks,

	tglx


* Re: [PATCH 3/3] irq/affinity: Fix a possible breakage
  2018-12-05  8:28   ` Thomas Gleixner
@ 2018-12-11 16:27     ` Dou Liyang
  0 siblings, 0 replies; 12+ messages in thread
From: Dou Liyang @ 2018-12-11 16:27 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: linux-kernel, linux-pci, kashyap.desai,
	shivasharan.srikanteshwara, sumit.saxena, ming.lei, hch,
	bhelgaas, douliyang1

Hi tglx,
On 2018/12/5 16:28, Thomas Gleixner wrote:
> On Tue, 4 Dec 2018, Dou Liyang wrote:
> 
>> In case of irq_default_affinity != cpu_possible_mask, setting the affinity
>> for the pre/post vectors to irq_default_affinity is a breakage.
> 
> Why so? All interrupts which are not managed get the default affinity
> mask. It can be different than cpu_possible_mask, but that's what the admin
> has set. The affinity of these non-managed interrupts can still be set via
> /proc/... so where is the breakage?

I misunderstood it. Please ignore this, ;-)

Thanks,
	dou


* Re: [PATCH 2/3] irq/affinity: Add is_managed into struct irq_affinity_desc
  2018-12-04 15:51 ` [PATCH 2/3] irq/affinity: Add is_managed into " Dou Liyang
@ 2018-12-18 15:26   ` Thomas Gleixner
  2018-12-19 10:38   ` [tip:irq/core] genirq/affinity: Add is_managed to " tip-bot for Dou Liyang
  1 sibling, 0 replies; 12+ messages in thread
From: Thomas Gleixner @ 2018-12-18 15:26 UTC (permalink / raw)
  To: Dou Liyang
  Cc: linux-kernel, linux-pci, kashyap.desai,
	shivasharan.srikanteshwara, sumit.saxena, ming.lei, hch,
	bhelgaas, douliyang1

On Tue, 4 Dec 2018, Dou Liyang wrote:
> diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
> index 1562a36e7c0f..d122575ba1b4 100644
> --- a/kernel/irq/affinity.c
> +++ b/kernel/irq/affinity.c
> @@ -289,6 +289,13 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
>  	for (; curvec < nvecs; curvec++)
>  		cpumask_copy(&masks[curvec].mask, irq_default_affinity);
>  
> +	/*  Setup complementary information */
> +	for (i = 0; i < nvecs; i++) {
> +		if (i >= affd->pre_vectors && i < nvecs - affd->post_vectors)
> +			masks[i].is_managed = 1;
> +		else
> +			masks[i].is_managed = 0;

Why do you want to clear the bit? The masks are allocated with kcalloc(),
so the bit is already zero.

+	/* Mark the managed interrupts */
+	for (i = affd->pre_vectors; i < nvecs - affd->post_vectors; i++)
+		masks[i].is_managed = 1;
+

Is what I made it instead.

Thanks,

	tglx


* [tip:irq/core] genirq/core: Introduce struct irq_affinity_desc
  2018-12-04 15:51 ` [PATCH 1/3] genirq/core: Add a new interrupt " Dou Liyang
@ 2018-12-19 10:37   ` tip-bot for Dou Liyang
  0 siblings, 0 replies; 12+ messages in thread
From: tip-bot for Dou Liyang @ 2018-12-19 10:37 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: bhelgaas, tglx, douliyangs, mingo, hpa, linux-kernel

Commit-ID:  bec04037e4e484f41ee4d9409e40616874169d20
Gitweb:     https://git.kernel.org/tip/bec04037e4e484f41ee4d9409e40616874169d20
Author:     Dou Liyang <douliyangs@gmail.com>
AuthorDate: Tue, 4 Dec 2018 23:51:20 +0800
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Wed, 19 Dec 2018 11:32:08 +0100

genirq/core: Introduce struct irq_affinity_desc

The interrupt affinity management uses straight cpumask pointers to convey
the automatically assigned affinity masks for managed interrupts. The core
interrupt descriptor allocation also decides based on the pointer being non
NULL whether an interrupt is managed or not.

Devices which use managed interrupts usually have two classes of
interrupts:

  - Interrupts for multiple device queues
  - Interrupts for general device management

Currently both classes are treated the same way, i.e. as managed
interrupts. The general interrupts get the default affinity mask assigned
while the device queue interrupts are spread out over the possible CPUs.

Treating the general interrupts as managed is both a limitation and under
certain circumstances a bug. Assume the following situation:

 default_irq_affinity = 4..7

So if CPUs 4-7 are offlined, then the core code will shut down the device
management interrupts because the last CPU in their affinity mask went
offline.

It's also a limitation because it's desired to allow manual placement of
the general device interrupts for various reasons. If they are marked
managed then the interrupt affinity setting from both user and kernel space
is disabled.

To remedy that situation it's required to convey more information than the
cpumasks through various interfaces related to interrupt descriptor
allocation.

Instead of adding yet another argument, create a new data structure
'irq_affinity_desc' which for now just contains the cpumask. This struct
can be expanded to convey auxiliary information in the next step.

No functional change, just preparatory work.

[ tglx: Simplified logic and clarified changelog ]

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Suggested-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Dou Liyang <douliyangs@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-pci@vger.kernel.org
Cc: kashyap.desai@broadcom.com
Cc: shivasharan.srikanteshwara@broadcom.com
Cc: sumit.saxena@broadcom.com
Cc: ming.lei@redhat.com
Cc: hch@lst.de
Cc: douliyang1@huawei.com
Link: https://lkml.kernel.org/r/20181204155122.6327-2-douliyangs@gmail.com

---
 drivers/pci/msi.c         |  9 ++++-----
 include/linux/interrupt.h | 14 ++++++++++++--
 include/linux/irq.h       |  6 ++++--
 include/linux/irqdomain.h |  6 ++++--
 include/linux/msi.h       |  4 ++--
 kernel/irq/affinity.c     | 22 ++++++++++++----------
 kernel/irq/devres.c       |  4 ++--
 kernel/irq/irqdesc.c      | 17 +++++++++--------
 kernel/irq/irqdomain.c    |  4 ++--
 kernel/irq/msi.c          |  8 ++++----
 10 files changed, 55 insertions(+), 39 deletions(-)

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 265ed3e4c920..7a1c8a09efa5 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -534,14 +534,13 @@ error_attrs:
 static struct msi_desc *
 msi_setup_entry(struct pci_dev *dev, int nvec, const struct irq_affinity *affd)
 {
-	struct cpumask *masks = NULL;
+	struct irq_affinity_desc *masks = NULL;
 	struct msi_desc *entry;
 	u16 control;
 
 	if (affd)
 		masks = irq_create_affinity_masks(nvec, affd);
 
-
 	/* MSI Entry Initialization */
 	entry = alloc_msi_entry(&dev->dev, nvec, masks);
 	if (!entry)
@@ -672,7 +671,7 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
 			      struct msix_entry *entries, int nvec,
 			      const struct irq_affinity *affd)
 {
-	struct cpumask *curmsk, *masks = NULL;
+	struct irq_affinity_desc *curmsk, *masks = NULL;
 	struct msi_desc *entry;
 	int ret, i;
 
@@ -1264,7 +1263,7 @@ const struct cpumask *pci_irq_get_affinity(struct pci_dev *dev, int nr)
 
 		for_each_pci_msi_entry(entry, dev) {
 			if (i == nr)
-				return entry->affinity;
+				return &entry->affinity->mask;
 			i++;
 		}
 		WARN_ON_ONCE(1);
@@ -1276,7 +1275,7 @@ const struct cpumask *pci_irq_get_affinity(struct pci_dev *dev, int nr)
 				 nr >= entry->nvec_used))
 			return NULL;
 
-		return &entry->affinity[nr];
+		return &entry->affinity[nr].mask;
 	} else {
 		return cpu_possible_mask;
 	}
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index ca397ff40836..c44b7844dc83 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -257,6 +257,14 @@ struct irq_affinity {
 	int	*sets;
 };
 
+/**
+ * struct irq_affinity_desc - Interrupt affinity descriptor
+ * @mask:	cpumask to hold the affinity assignment
+ */
+struct irq_affinity_desc {
+	struct cpumask	mask;
+};
+
 #if defined(CONFIG_SMP)
 
 extern cpumask_var_t irq_default_affinity;
@@ -303,7 +311,9 @@ extern int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m);
 extern int
 irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify);
 
-struct cpumask *irq_create_affinity_masks(int nvec, const struct irq_affinity *affd);
+struct irq_affinity_desc *
+irq_create_affinity_masks(int nvec, const struct irq_affinity *affd);
+
 int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity *affd);
 
 #else /* CONFIG_SMP */
@@ -337,7 +347,7 @@ irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
 	return 0;
 }
 
-static inline struct cpumask *
+static inline struct irq_affinity_desc *
 irq_create_affinity_masks(int nvec, const struct irq_affinity *affd)
 {
 	return NULL;
diff --git a/include/linux/irq.h b/include/linux/irq.h
index c9bffda04a45..def2b2aac8b1 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -27,6 +27,7 @@
 struct seq_file;
 struct module;
 struct msi_msg;
+struct irq_affinity_desc;
 enum irqchip_irq_state;
 
 /*
@@ -834,11 +835,12 @@ struct cpumask *irq_data_get_effective_affinity_mask(struct irq_data *d)
 unsigned int arch_dynirq_lower_bound(unsigned int from);
 
 int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
-		      struct module *owner, const struct cpumask *affinity);
+		      struct module *owner,
+		      const struct irq_affinity_desc *affinity);
 
 int __devm_irq_alloc_descs(struct device *dev, int irq, unsigned int from,
 			   unsigned int cnt, int node, struct module *owner,
-			   const struct cpumask *affinity);
+			   const struct irq_affinity_desc *affinity);
 
 /* use macros to avoid needing export.h for THIS_MODULE */
 #define irq_alloc_descs(irq, from, cnt, node)	\
diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index 068aa46f0d55..35965f41d7be 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -43,6 +43,7 @@ struct irq_chip;
 struct irq_data;
 struct cpumask;
 struct seq_file;
+struct irq_affinity_desc;
 
 /* Number of irqs reserved for a legacy isa controller */
 #define NUM_ISA_INTERRUPTS	16
@@ -266,7 +267,7 @@ extern bool irq_domain_check_msi_remap(void);
 extern void irq_set_default_host(struct irq_domain *host);
 extern int irq_domain_alloc_descs(int virq, unsigned int nr_irqs,
 				  irq_hw_number_t hwirq, int node,
-				  const struct cpumask *affinity);
+				  const struct irq_affinity_desc *affinity);
 
 static inline struct fwnode_handle *of_node_to_fwnode(struct device_node *node)
 {
@@ -449,7 +450,8 @@ static inline struct irq_domain *irq_domain_add_hierarchy(struct irq_domain *par
 
 extern int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
 				   unsigned int nr_irqs, int node, void *arg,
-				   bool realloc, const struct cpumask *affinity);
+				   bool realloc,
+				   const struct irq_affinity_desc *affinity);
 extern void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs);
 extern int irq_domain_activate_irq(struct irq_data *irq_data, bool early);
 extern void irq_domain_deactivate_irq(struct irq_data *irq_data);
diff --git a/include/linux/msi.h b/include/linux/msi.h
index eb213b87617c..784fb52b9900 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -76,7 +76,7 @@ struct msi_desc {
 	unsigned int			nvec_used;
 	struct device			*dev;
 	struct msi_msg			msg;
-	struct cpumask			*affinity;
+	struct irq_affinity_desc	*affinity;
 
 	union {
 		/* PCI MSI/X specific data */
@@ -138,7 +138,7 @@ static inline void pci_write_msi_msg(unsigned int irq, struct msi_msg *msg)
 #endif /* CONFIG_PCI_MSI */
 
 struct msi_desc *alloc_msi_entry(struct device *dev, int nvec,
-				 const struct cpumask *affinity);
+				 const struct irq_affinity_desc *affinity);
 void free_msi_entry(struct msi_desc *entry);
 void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
 void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index e423bff1928c..c0fe591b0dc9 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -99,7 +99,7 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd,
 				      cpumask_var_t *node_to_cpumask,
 				      const struct cpumask *cpu_mask,
 				      struct cpumask *nmsk,
-				      struct cpumask *masks)
+				      struct irq_affinity_desc *masks)
 {
 	int n, nodes, cpus_per_vec, extra_vecs, done = 0;
 	int last_affv = firstvec + numvecs;
@@ -117,7 +117,9 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd,
 	 */
 	if (numvecs <= nodes) {
 		for_each_node_mask(n, nodemsk) {
-			cpumask_or(masks + curvec, masks + curvec, node_to_cpumask[n]);
+			cpumask_or(&masks[curvec].mask,
+					&masks[curvec].mask,
+					node_to_cpumask[n]);
 			if (++curvec == last_affv)
 				curvec = firstvec;
 		}
@@ -150,7 +152,8 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd,
 				cpus_per_vec++;
 				--extra_vecs;
 			}
-			irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec);
+			irq_spread_init_one(&masks[curvec].mask, nmsk,
+						cpus_per_vec);
 		}
 
 		done += v;
@@ -173,7 +176,7 @@ out:
 static int irq_build_affinity_masks(const struct irq_affinity *affd,
 				    int startvec, int numvecs, int firstvec,
 				    cpumask_var_t *node_to_cpumask,
-				    struct cpumask *masks)
+				    struct irq_affinity_desc *masks)
 {
 	int curvec = startvec, nr_present, nr_others;
 	int ret = -ENOMEM;
@@ -226,15 +229,15 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd,
  * @nvecs:	The total number of vectors
  * @affd:	Description of the affinity requirements
  *
- * Returns the masks pointer or NULL if allocation failed.
+ * Returns the irq_affinity_desc pointer or NULL if allocation failed.
  */
-struct cpumask *
+struct irq_affinity_desc *
 irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 {
 	int affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
 	int curvec, usedvecs;
 	cpumask_var_t *node_to_cpumask;
-	struct cpumask *masks = NULL;
+	struct irq_affinity_desc *masks = NULL;
 	int i, nr_sets;
 
 	/*
@@ -254,8 +257,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 
 	/* Fill out vectors at the beginning that don't need affinity */
 	for (curvec = 0; curvec < affd->pre_vectors; curvec++)
-		cpumask_copy(masks + curvec, irq_default_affinity);
-
+		cpumask_copy(&masks[curvec].mask, irq_default_affinity);
 	/*
 	 * Spread on present CPUs starting from affd->pre_vectors. If we
 	 * have multiple sets, build each sets affinity mask separately.
@@ -285,7 +287,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 	else
 		curvec = affd->pre_vectors + usedvecs;
 	for (; curvec < nvecs; curvec++)
-		cpumask_copy(masks + curvec, irq_default_affinity);
+		cpumask_copy(&masks[curvec].mask, irq_default_affinity);
 
 outnodemsk:
 	free_node_to_cpumask(node_to_cpumask);
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 6a682c229e10..5d5378ea0afe 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -169,7 +169,7 @@ static void devm_irq_desc_release(struct device *dev, void *res)
  * @cnt:	Number of consecutive irqs to allocate
  * @node:	Preferred node on which the irq descriptor should be allocated
  * @owner:	Owning module (can be NULL)
- * @affinity:	Optional pointer to an affinity mask array of size @cnt
+ * @affinity:	Optional pointer to an irq_affinity_desc array of size @cnt
  *		which hints where the irq descriptors should be allocated
  *		and which default affinities to use
  *
@@ -179,7 +179,7 @@ static void devm_irq_desc_release(struct device *dev, void *res)
  */
 int __devm_irq_alloc_descs(struct device *dev, int irq, unsigned int from,
 			   unsigned int cnt, int node, struct module *owner,
-			   const struct cpumask *affinity)
+			   const struct irq_affinity_desc *affinity)
 {
 	struct irq_desc_devres *dr;
 	int base;
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 578d0e5f1b5b..cb401d6c5040 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -449,28 +449,29 @@ static void free_desc(unsigned int irq)
 }
 
 static int alloc_descs(unsigned int start, unsigned int cnt, int node,
-		       const struct cpumask *affinity, struct module *owner)
+		       const struct irq_affinity_desc *affinity,
+		       struct module *owner)
 {
-	const struct cpumask *mask = NULL;
 	struct irq_desc *desc;
 	unsigned int flags;
 	int i;
 
 	/* Validate affinity mask(s) */
 	if (affinity) {
-		for (i = 0, mask = affinity; i < cnt; i++, mask++) {
-			if (cpumask_empty(mask))
+		for (i = 0; i < cnt; i++) {
+			if (cpumask_empty(&affinity[i].mask))
 				return -EINVAL;
 		}
 	}
 
 	flags = affinity ? IRQD_AFFINITY_MANAGED | IRQD_MANAGED_SHUTDOWN : 0;
-	mask = NULL;
 
 	for (i = 0; i < cnt; i++) {
+		const struct cpumask *mask = NULL;
+
 		if (affinity) {
 			node = cpu_to_node(cpumask_first(affinity));
-			mask = affinity;
+			mask = &affinity->mask;
 			affinity++;
 		}
 		desc = alloc_desc(start + i, node, flags, mask, owner);
@@ -575,7 +576,7 @@ static void free_desc(unsigned int irq)
 }
 
 static inline int alloc_descs(unsigned int start, unsigned int cnt, int node,
-			      const struct cpumask *affinity,
+			      const struct irq_affinity_desc *affinity,
 			      struct module *owner)
 {
 	u32 i;
@@ -705,7 +706,7 @@ EXPORT_SYMBOL_GPL(irq_free_descs);
  */
 int __ref
 __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
-		  struct module *owner, const struct cpumask *affinity)
+		  struct module *owner, const struct irq_affinity_desc *affinity)
 {
 	int start, ret;
 
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 3366d11c3e02..8b0be4bd6565 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -969,7 +969,7 @@ const struct irq_domain_ops irq_domain_simple_ops = {
 EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
 
 int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq,
-			   int node, const struct cpumask *affinity)
+			   int node, const struct irq_affinity_desc *affinity)
 {
 	unsigned int hint;
 
@@ -1281,7 +1281,7 @@ int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain,
  */
 int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
 			    unsigned int nr_irqs, int node, void *arg,
-			    bool realloc, const struct cpumask *affinity)
+			    bool realloc, const struct irq_affinity_desc *affinity)
 {
 	int i, ret, virq;
 
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 4ca2fd46645d..ad26fbcfbfc8 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -23,11 +23,11 @@
  * @nvec:	The number of vectors used in this entry
  * @affinity:	Optional pointer to an affinity mask array size of @nvec
  *
- * If @affinity is not NULL then a an affinity array[@nvec] is allocated
- * and the affinity masks from @affinity are copied.
+ * If @affinity is not NULL then an affinity array[@nvec] is allocated
+ * and the affinity masks and flags from @affinity are copied.
  */
-struct msi_desc *
-alloc_msi_entry(struct device *dev, int nvec, const struct cpumask *affinity)
+struct msi_desc *alloc_msi_entry(struct device *dev, int nvec,
+				 const struct irq_affinity_desc *affinity)
 {
 	struct msi_desc *desc;
 


* [tip:irq/core] genirq/affinity: Add is_managed to struct irq_affinity_desc
  2018-12-04 15:51 ` [PATCH 2/3] irq/affinity: Add is_managed into " Dou Liyang
  2018-12-18 15:26   ` Thomas Gleixner
@ 2018-12-19 10:38   ` tip-bot for Dou Liyang
  1 sibling, 0 replies; 12+ messages in thread
From: tip-bot for Dou Liyang @ 2018-12-19 10:38 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: sumit.saxena, hpa, tglx, douliyangs, mingo, linux-kernel, kashyap.desai

Commit-ID:  c410abbbacb9b378365ba17a30df08b4b9eec64f
Gitweb:     https://git.kernel.org/tip/c410abbbacb9b378365ba17a30df08b4b9eec64f
Author:     Dou Liyang <douliyangs@gmail.com>
AuthorDate: Tue, 4 Dec 2018 23:51:21 +0800
Committer:  Thomas Gleixner <tglx@linutronix.de>
CommitDate: Wed, 19 Dec 2018 11:32:08 +0100

genirq/affinity: Add is_managed to struct irq_affinity_desc

Devices which use managed interrupts usually have two classes of
interrupts:

  - Interrupts for multiple device queues
  - Interrupts for general device management

Currently both classes are treated the same way, i.e. as managed
interrupts. The general interrupts get the default affinity mask assigned
while the device queue interrupts are spread out over the possible CPUs.

Treating the general interrupts as managed is both a limitation and under
certain circumstances a bug. Assume the following situation:

 default_irq_affinity = 4..7

So if CPUs 4-7 are offlined, then the core code will shut down the device
management interrupts because the last CPU in their affinity mask went
offline.

It's also a limitation because it's desired to allow manual placement of
the general device interrupts for various reasons. If they are marked
managed then the interrupt affinity setting from both user and kernel space
is disabled. That limitation was reported by Kashyap and Sumit.

Expand struct irq_affinity_desc with a new bit 'is_managed' which is set
for truly managed interrupts (queue interrupts) and cleared for the general
device interrupts.

[ tglx: Simplify code and massage changelog ]

Reported-by: Kashyap Desai <kashyap.desai@broadcom.com>
Reported-by: Sumit Saxena <sumit.saxena@broadcom.com>
Signed-off-by: Dou Liyang <douliyangs@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-pci@vger.kernel.org
Cc: shivasharan.srikanteshwara@broadcom.com
Cc: ming.lei@redhat.com
Cc: hch@lst.de
Cc: bhelgaas@google.com
Cc: douliyang1@huawei.com
Link: https://lkml.kernel.org/r/20181204155122.6327-3-douliyangs@gmail.com

---
 include/linux/interrupt.h |  1 +
 kernel/irq/affinity.c     |  4 ++++
 kernel/irq/irqdesc.c      | 13 ++++++++-----
 3 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index c44b7844dc83..c672f34235e7 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -263,6 +263,7 @@ struct irq_affinity {
  */
 struct irq_affinity_desc {
 	struct cpumask	mask;
+	unsigned int	is_managed : 1;
 };
 
 #if defined(CONFIG_SMP)
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index c0fe591b0dc9..45b68b4ea48b 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -289,6 +289,10 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
 	for (; curvec < nvecs; curvec++)
 		cpumask_copy(&masks[curvec].mask, irq_default_affinity);
 
+	/* Mark the managed interrupts */
+	for (i = affd->pre_vectors; i < nvecs - affd->post_vectors; i++)
+		masks[i].is_managed = 1;
+
 outnodemsk:
 	free_node_to_cpumask(node_to_cpumask);
 	return masks;
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index cb401d6c5040..ee062b7939d3 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -453,27 +453,30 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node,
 		       struct module *owner)
 {
 	struct irq_desc *desc;
-	unsigned int flags;
 	int i;
 
 	/* Validate affinity mask(s) */
 	if (affinity) {
-		for (i = 0; i < cnt; i++) {
+		for (i = 0; i < cnt; i++, i++) {
 			if (cpumask_empty(&affinity[i].mask))
 				return -EINVAL;
 		}
 	}
 
-	flags = affinity ? IRQD_AFFINITY_MANAGED | IRQD_MANAGED_SHUTDOWN : 0;
-
 	for (i = 0; i < cnt; i++) {
 		const struct cpumask *mask = NULL;
+		unsigned int flags = 0;
 
 		if (affinity) {
-			node = cpu_to_node(cpumask_first(affinity));
+			if (affinity->is_managed) {
+				flags = IRQD_AFFINITY_MANAGED |
+					IRQD_MANAGED_SHUTDOWN;
+			}
 			mask = &affinity->mask;
+			node = cpu_to_node(cpumask_first(mask));
 			affinity++;
 		}
+
 		desc = alloc_desc(start + i, node, flags, mask, owner);
 		if (!desc)
 			goto err;


* Re: [PATCH 0/3] irq/core: Fix and expand the irq affinity descriptor
  2018-12-04 15:51 [PATCH 0/3] irq/core: Fix and expand the irq affinity descriptor Dou Liyang
                   ` (2 preceding siblings ...)
  2018-12-04 15:51 ` [PATCH 3/3] irq/affinity: Fix a possible breakage Dou Liyang
@ 2018-12-19 10:53 ` Thomas Gleixner
  2018-12-19 12:55   ` Sumit Saxena
  3 siblings, 1 reply; 12+ messages in thread
From: Thomas Gleixner @ 2018-12-19 10:53 UTC (permalink / raw)
  To: Dou Liyang
  Cc: linux-kernel, linux-pci, kashyap.desai,
	shivasharan.srikanteshwara, sumit.saxena, ming.lei, hch,
	bhelgaas, douliyang1

On Tue, 4 Dec 2018, Dou Liyang wrote:

> Now, spreading the interrupt affinity info by a bare cpumask pointer is
> not enough: it runs into a problem[1] and is hard to extend in the future.
> 
> Fix it by:
> 
>      +-----------------------------------+
>      |                                   |
>      |     struct cpumask *affinity      |
>      |                                   |
>      +-----------------------------------+
>                        |
>     +------------------v-------------------+
>     |                                      |
>     | struct irq_affinity_desc {           |
>     |     struct cpumask   mask;           |
>     |     unsigned int     is_managed : 1; |
>     | };                                   |
>     |                                      |
>     +--------------------------------------+
> 

So, I've applied that lot for 4.21 (or whatever number it will be). That's
only the first step for solving Kashyap's problem.

IIRC, then Kashyap wanted to get initial interrupt spreading for these extra
magic interrupts as well, but not have them marked managed.

That's trivial to do now with the two queued changes in that area:

  - The rework above
  
  - The support for interrupt sets from Jens

Just adding a small bitfield to struct irq_affinity which allows to tell
the core that a particular interrupt set is not managed does the trick.

Untested patch below.

Kashyap, is that what you were looking for and if so, does it work?

Thanks,

	tglx

8<-----------------

Subject: genirq/affinity: Add support for non-managed affinity sets
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 18 Dec 2018 16:46:47 +0100

Some drivers need an extra set of interrupts which are not marked managed,
but should get initial interrupt spreading.

Add a bitmap to struct irq_affinity which allows the driver to mark a
particular set of interrupts as non managed. Check the bitmap during
spreading and use the result to mark the interrupts in the sets
accordingly.

The unmanaged interrupts get initial spreading, but user space can change
their affinity later on.

Usage example:

      struct irq_affinity affd = { .pre_vectors	= 2 };
      int sets[2];

      /* Fill in sets[] */

      affd.nr_sets = 2;
      affd.sets = sets;
      affd.unmanaged_sets = 0x02;

      ......

So both sets are properly spread out, but the second set is not marked
managed.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/interrupt.h |   10 ++++++----
 kernel/irq/affinity.c     |   24 ++++++++++++++----------
 2 files changed, 20 insertions(+), 14 deletions(-)

--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -99,7 +99,8 @@ static int __irq_build_affinity_masks(co
 				      cpumask_var_t *node_to_cpumask,
 				      const struct cpumask *cpu_mask,
 				      struct cpumask *nmsk,
-				      struct irq_affinity_desc *masks)
+				      struct irq_affinity_desc *masks,
+				      bool managed)
 {
 	int n, nodes, cpus_per_vec, extra_vecs, done = 0;
 	int last_affv = firstvec + numvecs;
@@ -154,6 +155,7 @@ static int __irq_build_affinity_masks(co
 			}
 			irq_spread_init_one(&masks[curvec].mask, nmsk,
 						cpus_per_vec);
+			masks[curvec].is_managed = managed;
 		}
 
 		done += v;
@@ -176,7 +178,8 @@ static int __irq_build_affinity_masks(co
 static int irq_build_affinity_masks(const struct irq_affinity *affd,
 				    int startvec, int numvecs, int firstvec,
 				    cpumask_var_t *node_to_cpumask,
-				    struct irq_affinity_desc *masks)
+				    struct irq_affinity_desc *masks,
+				    bool managed)
 {
 	int curvec = startvec, nr_present, nr_others;
 	int ret = -ENOMEM;
@@ -196,7 +199,8 @@ static int irq_build_affinity_masks(cons
 	/* Spread on present CPUs starting from affd->pre_vectors */
 	nr_present = __irq_build_affinity_masks(affd, curvec, numvecs,
 						firstvec, node_to_cpumask,
-						cpu_present_mask, nmsk, masks);
+						cpu_present_mask, nmsk, masks,
+						managed);
 
 	/*
 	 * Spread on non present CPUs starting from the next vector to be
@@ -211,7 +215,7 @@ static int irq_build_affinity_masks(cons
 	cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
 	nr_others = __irq_build_affinity_masks(affd, curvec, numvecs,
 					       firstvec, node_to_cpumask,
-					       npresmsk, nmsk, masks);
+					       npresmsk, nmsk, masks, managed);
 	put_online_cpus();
 
 	if (nr_present < numvecs)
@@ -268,10 +272,11 @@ irq_create_affinity_masks(int nvecs, con
 
 	for (i = 0, usedvecs = 0; i < nr_sets; i++) {
 		int this_vecs = affd->sets ? affd->sets[i] : affvecs;
+		bool managed = !test_bit(i, &affd->unmanaged_sets);
 		int ret;
 
-		ret = irq_build_affinity_masks(affd, curvec, this_vecs,
-						curvec, node_to_cpumask, masks);
+		ret = irq_build_affinity_masks(affd, curvec, this_vecs, curvec,
+					       node_to_cpumask, masks, managed);
 		if (ret) {
 			kfree(masks);
 			masks = NULL;
@@ -289,10 +294,6 @@ irq_create_affinity_masks(int nvecs, con
 	for (; curvec < nvecs; curvec++)
 		cpumask_copy(&masks[curvec].mask, irq_default_affinity);
 
-	/* Mark the managed interrupts */
-	for (i = affd->pre_vectors; i < nvecs - affd->post_vectors; i++)
-		masks[i].is_managed = 1;
-
 outnodemsk:
 	free_node_to_cpumask(node_to_cpumask);
 	return masks;
@@ -316,6 +317,9 @@ int irq_calc_affinity_vectors(int minvec
 	if (affd->nr_sets) {
 		int i;
 
+		if (WARN_ON_ONCE(affd->nr_sets > BITS_PER_LONG))
+			return 0;
+
 		for (i = 0, set_vecs = 0;  i < affd->nr_sets; i++)
 			set_vecs += affd->sets[i];
 	} else {
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -249,12 +249,14 @@ struct irq_affinity_notify {
  *			the MSI(-X) vector space
  * @nr_sets:		Length of passed in *sets array
  * @sets:		Number of affinitized sets
+ * @unmanaged_sets:	Bitmap to mark members of @sets as unmanaged
  */
 struct irq_affinity {
-	int	pre_vectors;
-	int	post_vectors;
-	int	nr_sets;
-	int	*sets;
+	int		pre_vectors;
+	int		post_vectors;
+	int		nr_sets;
+	int		*sets;
+	unsigned long	unmanaged_sets;
 };
 
 /**


* Re: [PATCH 0/3] irq/core: Fix and expand the irq affinity descriptor
  2018-12-19 10:53 ` [PATCH 0/3] irq/core: Fix and expand the irq affinity descriptor Thomas Gleixner
@ 2018-12-19 12:55   ` Sumit Saxena
  2018-12-28  9:54     ` Sumit Saxena
  0 siblings, 1 reply; 12+ messages in thread
From: Sumit Saxena @ 2018-12-19 12:55 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: douliyangs, LKML, linux-pci, Kashyap Desai,
	Shivasharan Srikanteshwara, Ming Lei, Christoph Hellwig,
	Bjorn Helgaas, douliyang1

On Wed, Dec 19, 2018 at 4:23 PM Thomas Gleixner <tglx@linutronix.de> wrote:
>
> On Tue, 4 Dec 2018, Dou Liyang wrote:
>
> > Now, spreading the interrupt affinity info by a bare cpumask pointer is
> > not enough: it runs into a problem[1] and is hard to extend in the future.
> >
> > Fix it by:
> >
> >      +-----------------------------------+
> >      |                                   |
> >      |     struct cpumask *affinity      |
> >      |                                   |
> >      +-----------------------------------+
> >                        |
> >     +------------------v-------------------+
> >     |                                      |
> >     | struct irq_affinity_desc {           |
> >     |     struct cpumask   mask;           |
> >     |     unsigned int     is_managed : 1; |
> >     | };                                   |
> >     |                                      |
> >     +--------------------------------------+
> >
>
> So, I've applied that lot for 4.21 (or whatever number it will be). That's
> only the first step for solving Kashyap's problem.
>
> IIRC, then Kashyap wanted to get initial interrupt spreading for these extra
> magic interrupts as well, but not have them marked managed.
>
> That's trivial to do now with the two queued changes in that area:
>
>   - The rework above
>
>   - The support for interrupt sets from Jens
>
> Just adding a small bitfield to struct irq_affinity which allows to tell
> the core that a particular interrupt set is not managed does the trick.
>
> Untested patch below.
>
> Kashyap, is that what you were looking for and if so, does it work?
Thomas,
We could not test these patches as they did not apply cleanly to the
latest linux-block tree.

Our requirements are: 1. the extra interrupts should be unmanaged, and
2. they should be spread across the CPUs of the local NUMA node.
If the interrupts are unmanaged but not spread as we require, the
driver or user-space applications can still manage them, spreading them
as needed by calling the irq_set_affinity_hint() API.
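
A minimal sketch of that driver-side fallback, assuming the extra
vectors occupy the first vector indices (the function name and the
"nr_extra_vecs" parameter are hypothetical):

#include <linux/interrupt.h>
#include <linux/pci.h>
#include <linux/topology.h>

static void example_affine_extra_vecs(struct pci_dev *pdev,
				      int nr_extra_vecs)
{
	/* The device's local NUMA node, as reported by the PCI core. */
	const struct cpumask *node_mask =
		cpumask_of_node(dev_to_node(&pdev->dev));
	int i;

	/*
	 * Hint each unmanaged extra vector onto the local node's CPUs;
	 * user space remains free to change the affinity later.
	 */
	for (i = 0; i < nr_extra_vecs; i++)
		irq_set_affinity_hint(pci_irq_vector(pdev, i), node_mask);
}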

Thanks,
Sumit
>
> Thanks,
>
>         tglx
>
> 8<-----------------
>
> Subject: genirq/affinity: Add support for non-managed affinity sets
> From: Thomas Gleixner <tglx@linutronix.de>
> Date: Tue, 18 Dec 2018 16:46:47 +0100
>
> Some drivers need an extra set of interrupts which are not marked managed,
> but should get initial interrupt spreading.
>
> Add a bitmap to struct irq_affinity which allows the driver to mark a
> particular set of interrupts as non managed. Check the bitmap during
> spreading and use the result to mark the interrupts in the sets
> accordingly.
>
> The unmanaged interrupts get initial spreading, but user space can change
> their affinity later on.
>
> Usage example:
>
>       struct irq_affinity affd = { .pre_vectors = 2 };
>       int sets[2];
>
>       /* Fill in sets[] */
>
>       affd.nr_sets = 2;
>       affd.sets = sets;
>       affd.unmanaged_sets = 0x02;
>
>       ......
>
> So both sets are properly spread out, but the second set is not marked
> managed.
>
> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
> ---
>  include/linux/interrupt.h |   10 ++++++----
>  kernel/irq/affinity.c     |   24 ++++++++++++++----------
>  2 files changed, 20 insertions(+), 14 deletions(-)
>
> --- a/kernel/irq/affinity.c
> +++ b/kernel/irq/affinity.c
> @@ -99,7 +99,8 @@ static int __irq_build_affinity_masks(co
>                                       cpumask_var_t *node_to_cpumask,
>                                       const struct cpumask *cpu_mask,
>                                       struct cpumask *nmsk,
> -                                     struct irq_affinity_desc *masks)
> +                                     struct irq_affinity_desc *masks,
> +                                     bool managed)
>  {
>         int n, nodes, cpus_per_vec, extra_vecs, done = 0;
>         int last_affv = firstvec + numvecs;
> @@ -154,6 +155,7 @@ static int __irq_build_affinity_masks(co
>                         }
>                         irq_spread_init_one(&masks[curvec].mask, nmsk,
>                                                 cpus_per_vec);
> +                       masks[curvec].is_managed = managed;
>                 }
>
>                 done += v;
> @@ -176,7 +178,8 @@ static int __irq_build_affinity_masks(co
>  static int irq_build_affinity_masks(const struct irq_affinity *affd,
>                                     int startvec, int numvecs, int firstvec,
>                                     cpumask_var_t *node_to_cpumask,
> -                                   struct irq_affinity_desc *masks)
> +                                   struct irq_affinity_desc *masks,
> +                                   bool managed)
>  {
>         int curvec = startvec, nr_present, nr_others;
>         int ret = -ENOMEM;
> @@ -196,7 +199,8 @@ static int irq_build_affinity_masks(cons
>         /* Spread on present CPUs starting from affd->pre_vectors */
>         nr_present = __irq_build_affinity_masks(affd, curvec, numvecs,
>                                                 firstvec, node_to_cpumask,
> -                                               cpu_present_mask, nmsk, masks);
> +                                               cpu_present_mask, nmsk, masks,
> +                                               managed);
>
>         /*
>          * Spread on non present CPUs starting from the next vector to be
> @@ -211,7 +215,7 @@ static int irq_build_affinity_masks(cons
>         cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
>         nr_others = __irq_build_affinity_masks(affd, curvec, numvecs,
>                                                firstvec, node_to_cpumask,
> -                                              npresmsk, nmsk, masks);
> +                                              npresmsk, nmsk, masks, managed);
>         put_online_cpus();
>
>         if (nr_present < numvecs)
> @@ -268,10 +272,11 @@ irq_create_affinity_masks(int nvecs, con
>
>         for (i = 0, usedvecs = 0; i < nr_sets; i++) {
>                 int this_vecs = affd->sets ? affd->sets[i] : affvecs;
> +               bool managed = !test_bit(i, &affd->unmanaged_sets);
>                 int ret;
>
> -               ret = irq_build_affinity_masks(affd, curvec, this_vecs,
> -                                               curvec, node_to_cpumask, masks);
> +               ret = irq_build_affinity_masks(affd, curvec, this_vecs, curvec,
> +                                              node_to_cpumask, masks, managed);
>                 if (ret) {
>                         kfree(masks);
>                         masks = NULL;
> @@ -289,10 +294,6 @@ irq_create_affinity_masks(int nvecs, con
>         for (; curvec < nvecs; curvec++)
>                 cpumask_copy(&masks[curvec].mask, irq_default_affinity);
>
> -       /* Mark the managed interrupts */
> -       for (i = affd->pre_vectors; i < nvecs - affd->post_vectors; i++)
> -               masks[i].is_managed = 1;
> -
>  outnodemsk:
>         free_node_to_cpumask(node_to_cpumask);
>         return masks;
> @@ -316,6 +317,9 @@ int irq_calc_affinity_vectors(int minvec
>         if (affd->nr_sets) {
>                 int i;
>
> +               if (WARN_ON_ONCE(affd->nr_sets > BITS_PER_LONG))
> +                       return 0;
> +
>                 for (i = 0, set_vecs = 0;  i < affd->nr_sets; i++)
>                         set_vecs += affd->sets[i];
>         } else {
> --- a/include/linux/interrupt.h
> +++ b/include/linux/interrupt.h
> @@ -249,12 +249,14 @@ struct irq_affinity_notify {
>   *                     the MSI(-X) vector space
>   * @nr_sets:           Length of passed in *sets array
>   * @sets:              Number of affinitized sets
> + * @unmanaged_sets:    Bitmap to mark members of @sets as unmanaged
>   */
>  struct irq_affinity {
> -       int     pre_vectors;
> -       int     post_vectors;
> -       int     nr_sets;
> -       int     *sets;
> +       int             pre_vectors;
> +       int             post_vectors;
> +       int             nr_sets;
> +       int             *sets;
> +       unsigned long   unmanaged_sets;
>  };
>
>  /**

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 0/3] irq/core: Fix and expand the irq affinity descriptor
  2018-12-19 12:55   ` Sumit Saxena
@ 2018-12-28  9:54     ` Sumit Saxena
  0 siblings, 0 replies; 12+ messages in thread
From: Sumit Saxena @ 2018-12-28  9:54 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Liyang Dou, LKML, linux-pci, Kashyap Desai,
	Shivasharan Srikanteshwara, Ming Lei, Christoph Hellwig,
	Bjorn Helgaas, douliyang1

On Wed, Dec 19, 2018 at 6:25 PM Sumit Saxena <sumit.saxena@broadcom.com> wrote:
>
> On Wed, Dec 19, 2018 at 4:23 PM Thomas Gleixner <tglx@linutronix.de> wrote:
> >
> > On Tue, 4 Dec 2018, Dou Liyang wrote:
> >
> > > Now,  Spreading the interrupt affinity info by a cpumask pointer is not
> > > enough, meets a problem[1] and hard to expand in the future.
> > >
> > > Fix it by:
> > >
> > >      +-----------------------------------+
> > >      |                                   |
> > >      |     struct cpumask *affinity      |
> > >      |                                   |
> > >      +-----------------------------------+
> > >                        |
> > >     +------------------v-------------------+
> > >     |                                      |
> > >     | struct irq_affinity_desc {           |
> > >     |     struct cpumask   mask;           |
> > >     |     unsigned int     is_managed : 1; |
> > >     | };                                   |
> > >     |                                      |
> > >     +--------------------------------------+
> > >
> >
> > So, I've applied that lot for 4.21 (or whatever number it will be). That's
> > only the first step for solving Kashyap's problem.
> >
> > IIRC, then Kashyap wanted to get initial interrupt spreading for these extra
> > magic interrupts as well, but not have them marked managed.
> >
> > That's trivial to do now with the two queued changes in that area:
> >
> >   - The rework above
> >
> >   - The support for interrupt sets from Jens
> >
> > Just adding a small bitfield to struct irq_affinity which allows to tell
> > the core that a particular interrupt set is not managed does the trick.
> >
> > Untested patch below.
> >
> > Kashyap, is that what you were looking for and if so, does it work?
> Thomas,
> We could not test these patches as they did not apply cleanly to the
> latest linux-block tree.
>
> Our requirements are: 1. the extra interrupts should be unmanaged, and
> 2. they should be spread across the CPUs of the local NUMA node.
> If the interrupts are unmanaged but not spread as we require, the
> driver or user-space applications can still manage them, spreading them
> as needed by calling the irq_set_affinity_hint() API.
>
> Thanks,
> Sumit
I tested this patchset, with some minor rework to apply it on the
latest linux-block tree (4.20-rc7), and it works as we expected.
For the "pre_vectors" IRQs (the extra set of interrupts), the
"is_managed" flag is set to 0, and the driver can later affine these
"pre_vectors" interrupts to the CPUs of the local NUMA node through
the irq_set_affinity_hint() API. The regular set of interrupts (not
pre_vectors/post_vectors) is managed, with "is_managed" set to 1.

Below is some data from my test setup:

# numactl --hardware
available: 2 nodes (0-1)
node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 36 37 38 39
40 41 42 43 44 45 46 47 48 49 50 51 52 53
node 0 size: 31822 MB
node 0 free: 30241 MB
node 1 cpus: 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 54
55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
node 1 size: 32248 MB
node 1 free: 31960 MB
node distances:
node   0   1
  0:  10  21
  1:  21  10

The MegaRAID controller (PCI device 86:00.0) is attached to node 1:
# find /sys -name "*numa_node*" | grep "86:00" | xargs cat
1

IRQ-CPU affinity of the 16 extra interrupts for PCI device 86:00.0:
irq 149, cpu list 18-35,54-71
irq 150, cpu list 18-35,54-71
irq 151, cpu list 18-35,54-71
irq 152, cpu list 18-35,54-71
irq 153, cpu list 18-35,54-71
irq 154, cpu list 18-35,54-71
irq 155, cpu list 18-35,54-71
irq 156, cpu list 18-35,54-71
irq 157, cpu list 18-35,54-71
irq 158, cpu list 18-35,54-71
irq 159, cpu list 18-35,54-71
irq 160, cpu list 18-35,54-71
irq 161, cpu list 18-35,54-71
irq 162, cpu list 18-35,54-71
irq 163, cpu list 18-35,54-71
irq 164, cpu list 18-35,54-71
---
# cat /sys/kernel/debug/irq/irqs/164 | grep is_managed
   is_managed:       0

Tested-by: Sumit Saxena <sumit.saxena@broadcom.com>

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2018-12-28  9:54 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-12-04 15:51 [PATCH 0/3] irq/core: Fix and expand the irq affinity descriptor Dou Liyang
2018-12-04 15:51 ` [PATCH 1/3] genirq/core: Add a new interrupt " Dou Liyang
2018-12-19 10:37   ` [tip:irq/core] genirq/core: Introduce struct irq_affinity_desc tip-bot for Dou Liyang
2018-12-04 15:51 ` [PATCH 2/3] irq/affinity: Add is_managed into " Dou Liyang
2018-12-18 15:26   ` Thomas Gleixner
2018-12-19 10:38   ` [tip:irq/core] genirq/affinity: Add is_managed to " tip-bot for Dou Liyang
2018-12-04 15:51 ` [PATCH 3/3] irq/affinity: Fix a possible breakage Dou Liyang
2018-12-05  8:28   ` Thomas Gleixner
2018-12-11 16:27     ` Dou Liyang
2018-12-19 10:53 ` [PATCH 0/3] irq/core: Fix and expand the irq affinity descriptor Thomas Gleixner
2018-12-19 12:55   ` Sumit Saxena
2018-12-28  9:54     ` Sumit Saxena

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).