* [PATCH v2] irq: Add node_affinity CPU masks for smarter irqbalance hints
@ 2009-11-24  9:35 Peter P Waskiewicz Jr
  2009-11-24 11:07 ` Thomas Gleixner
  0 siblings, 1 reply; 7+ messages in thread
From: Peter P Waskiewicz Jr @ 2009-11-24  9:35 UTC (permalink / raw)
  To: linux-kernel, arjan; +Cc: mingo, tglx, yong.zhang0, davem, netdev

This patch adds a new CPU mask for SMP systems to the irq_desc
struct.  It also exposes an API that underlying device drivers can
use to help irqbalance make smarter decisions when balancing,
especially in a NUMA environment.  For example, an ethernet driver
with MSI-X may wish to limit the CPUs an interrupt can be balanced
across so that it stays on a single NUMA node.  Today irqbalance can
move the interrupt off that node, resulting in cross-node memory
accesses and lock contention.

The API consists of get/set calls within the kernel, along with a
read/write /proc entry (/proc/irq/<irq>/node_affinity) for each
interrupt.
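
As an illustrative sketch of the driver-side usage (the handler and
variable names here are hypothetical, not from any in-tree driver),
an MSI-X driver could hint that a vector should stay on its device's
home NUMA node:

	/*
	 * Hypothetical driver init path (sketch): request an MSI-X
	 * vector, then hint that irqbalance should keep it on the
	 * device's home NUMA node.
	 */
	static int my_setup_vector(struct pci_dev *pdev,
				   struct msix_entry *entry, void *ring)
	{
		int node = dev_to_node(&pdev->dev);
		int err;

		err = request_irq(entry->vector, my_queue_handler, 0,
				  "my-rx-queue", ring);
		if (err)
			return err;

		irq_set_node_affinity(entry->vector, cpumask_of_node(node));
		return 0;
	}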

Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
---

 include/linux/interrupt.h |    8 ++++++
 include/linux/irq.h       |    8 ++++++
 kernel/irq/manage.c       |   32 +++++++++++++++++++++++++
 kernel/irq/proc.c         |   57 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 105 insertions(+), 0 deletions(-)

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 75f3f00..9fd08aa 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -208,6 +208,8 @@ extern cpumask_var_t irq_default_affinity;
 extern int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask);
 extern int irq_can_set_affinity(unsigned int irq);
 extern int irq_select_affinity(unsigned int irq);
+extern int irq_set_node_affinity(unsigned int irq,
+                                 const struct cpumask *cpumask);
 
 #else /* CONFIG_SMP */
 
@@ -223,6 +225,12 @@ static inline int irq_can_set_affinity(unsigned int irq)
 
 static inline int irq_select_affinity(unsigned int irq)  { return 0; }
 
+static inline int irq_set_node_affinity(unsigned int irq,
+                                        const struct cpumask *m)
+{
+	return -EINVAL;
+}
+
 #endif /* CONFIG_SMP && CONFIG_GENERIC_HARDIRQS */
 
 #ifdef CONFIG_GENERIC_HARDIRQS
diff --git a/include/linux/irq.h b/include/linux/irq.h
index ae9653d..819cda0 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -166,6 +166,7 @@ struct irq_2_iommu;
  * @lock:		locking for SMP
  * @affinity:		IRQ affinity on SMP
  * @node:		node index useful for balancing
+ * @node_affinity:	preferred CPU mask hint for irqbalance
  * @pending_mask:	pending rebalanced interrupts
  * @threads_active:	number of irqaction threads currently running
  * @wait_for_threads:	wait queue for sync_irq to wait for threaded handlers
@@ -196,6 +197,7 @@ struct irq_desc {
 #ifdef CONFIG_SMP
 	cpumask_var_t		affinity;
 	unsigned int		node;
+	cpumask_var_t		node_affinity;
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	cpumask_var_t		pending_mask;
 #endif
@@ -445,9 +447,15 @@ static inline bool alloc_desc_masks(struct irq_desc *desc, int node,
 	if (!alloc_cpumask_var_node(&desc->affinity, gfp, node))
 		return false;
 
+	if (!alloc_cpumask_var_node(&desc->node_affinity, gfp, node)) {
+		free_cpumask_var(desc->affinity);
+		return false;
+	}
+
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	if (!alloc_cpumask_var_node(&desc->pending_mask, gfp, node)) {
 		free_cpumask_var(desc->affinity);
+		free_cpumask_var(desc->node_affinity);
 		return false;
 	}
 #endif
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 7305b29..9e80783 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -138,6 +138,38 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 	return 0;
 }
 
+/**
+ *	irq_set_node_affinity - Set the preferred CPU mask hint for an interrupt
+ *	@irq:		Interrupt to modify
+ *	@cpumask:	CPU mask to assign to the interrupt
+ *
+ */
+int irq_set_node_affinity(unsigned int irq, const struct cpumask *cpumask)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+	unsigned long flags;
+
+	spin_lock_irqsave(&desc->lock, flags);
+	cpumask_copy(desc->node_affinity, cpumask);
+	spin_unlock_irqrestore(&desc->lock, flags);
+
+	return 0;
+}
+EXPORT_SYMBOL(irq_set_node_affinity);
+
+/**
+ *	irq_get_node_affinity - Get the preferred CPU mask hint for an interrupt
+ *	@irq:		Interrupt to query
+ *
+ */
+struct cpumask *irq_get_node_affinity(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	return desc->node_affinity;
+}
+EXPORT_SYMBOL(irq_get_node_affinity);
+
 #ifndef CONFIG_AUTO_IRQ_AFFINITY
 /*
  * Generic version of the affinity autoselector.
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 0832145..192e3fb 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -31,6 +31,16 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
+static int irq_node_affinity_proc_show(struct seq_file *m, void *v)
+{
+	struct irq_desc *desc = irq_to_desc((long)m->private);
+	const struct cpumask *mask = desc->node_affinity;
+
+	seq_cpumask(m, mask);
+	seq_putc(m, '\n');
+	return 0;
+}
+
 #ifndef is_affinity_mask_valid
 #define is_affinity_mask_valid(val) 1
 #endif
@@ -78,11 +88,46 @@ free_cpumask:
 	return err;
 }
 
+static ssize_t irq_node_affinity_proc_write(struct file *file,
+		const char __user *buffer, size_t count, loff_t *pos)
+{
+	unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data;
+	cpumask_var_t new_value;
+	int err;
+
+	if (no_irq_affinity || irq_balancing_disabled(irq))
+		return -EIO;
+
+	if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
+		return -ENOMEM;
+
+	err = cpumask_parse_user(buffer, count, new_value);
+	if (err)
+		goto free_cpumask;
+
+	if (!is_affinity_mask_valid(new_value)) {
+		err = -EINVAL;
+		goto free_cpumask;
+	}
+
+	irq_set_node_affinity(irq, new_value);
+	err = count;
+
+free_cpumask:
+	free_cpumask_var(new_value);
+	return err;
+}
+
 static int irq_affinity_proc_open(struct inode *inode, struct file *file)
 {
 	return single_open(file, irq_affinity_proc_show, PDE(inode)->data);
 }
 
+static int irq_node_affinity_proc_open(struct inode *inode, struct file *f)
+{
+	return single_open(f, irq_node_affinity_proc_show, PDE(inode)->data);
+}
+
 static const struct file_operations irq_affinity_proc_fops = {
 	.open		= irq_affinity_proc_open,
 	.read		= seq_read,
@@ -91,6 +136,14 @@ static const struct file_operations irq_affinity_proc_fops = {
 	.write		= irq_affinity_proc_write,
 };
 
+static const struct file_operations irq_node_affinity_proc_fops = {
+	.open		= irq_node_affinity_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.write		= irq_node_affinity_proc_write,
+};
+
 static int default_affinity_show(struct seq_file *m, void *v)
 {
 	seq_cpumask(m, irq_default_affinity);
@@ -230,6 +283,10 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
 	/* create /proc/irq/<irq>/smp_affinity */
 	proc_create_data("smp_affinity", 0600, desc->dir,
 			 &irq_affinity_proc_fops, (void *)(long)irq);
+
+	/* create /proc/irq/<irq>/node_affinity */
+	proc_create_data("node_affinity", 0600, desc->dir,
+	                 &irq_node_affinity_proc_fops, (void *)(long)irq);
 #endif
 
 	proc_create_data("spurious", 0444, desc->dir,



* Re: [PATCH v2] irq: Add node_affinity CPU masks for smarter irqbalance hints
  2009-11-24  9:35 [PATCH v2] irq: Add node_affinity CPU masks for smarter irqbalance hints Peter P Waskiewicz Jr
@ 2009-11-24 11:07 ` Thomas Gleixner
  2009-11-24 17:57   ` David Miller
  0 siblings, 1 reply; 7+ messages in thread
From: Thomas Gleixner @ 2009-11-24 11:07 UTC (permalink / raw)
  To: Peter P Waskiewicz Jr
  Cc: linux-kernel, arjan, mingo, yong.zhang0, davem, netdev

On Tue, 24 Nov 2009, Peter P Waskiewicz Jr wrote:

> This patch adds a new CPU mask for SMP systems to the irq_desc
> struct.  It also exposes an API that underlying device drivers can
> use to help irqbalance make smarter decisions when balancing,
> especially in a NUMA environment.  For example, an ethernet driver
> with MSI-X may wish to limit the CPUs an interrupt can be balanced
> across so that it stays on a single NUMA node.  Today irqbalance can
> move the interrupt off that node, resulting in cross-node memory
> accesses and lock contention.
> 
> The API consists of get/set calls within the kernel, along with a
> read/write /proc entry (/proc/irq/<irq>/node_affinity) for each
> interrupt.

And what does the kernel do with this information and why are we not
using the existing device/numa_node information ?
 
> +extern int irq_set_node_affinity(unsigned int irq,
> +                                 const struct cpumask *cpumask);

A node can be described with a single integer, right ?

> +static int irq_node_affinity_proc_show(struct seq_file *m, void *v)
> +{
> +	struct irq_desc *desc = irq_to_desc((long)m->private);
> +	const struct cpumask *mask = desc->node_affinity;
> +
> +	seq_cpumask(m, mask);
> +	seq_putc(m, '\n');
> +	return 0;
> +}
> +
>  #ifndef is_affinity_mask_valid
>  #define is_affinity_mask_valid(val) 1
>  #endif
> @@ -78,11 +88,46 @@ free_cpumask:
>  	return err;
>  }
>  
> +static ssize_t irq_node_affinity_proc_write(struct file *file,
> +		const char __user *buffer, size_t count, loff_t *pos)
> +{
> +	unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data;
> +	cpumask_var_t new_value;
> +	int err;
> +
> +	if (no_irq_affinity || irq_balancing_disabled(irq))
> +		return -EIO;

Yikes. Why should user space be allowed to write to that file ? And
the whole business is what for ? Storing that value in the irq_desc
data structure for user space to read out again ?

Cool design. We provide storage space for user space applications in
the kernel now ?

See also my earlier reply in the thread. This patch is just adding
code and memory bloat while not solving anything at all. 

Again, this is going nowhere but into /dev/null.

Thanks,

	tglx



* Re: [PATCH v2] irq: Add node_affinity CPU masks for smarter irqbalance hints
  2009-11-24 11:07 ` Thomas Gleixner
@ 2009-11-24 17:57   ` David Miller
  2009-11-24 21:56     ` Thomas Gleixner
  0 siblings, 1 reply; 7+ messages in thread
From: David Miller @ 2009-11-24 17:57 UTC (permalink / raw)
  To: tglx
  Cc: peter.p.waskiewicz.jr, linux-kernel, arjan, mingo, yong.zhang0, netdev

From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 24 Nov 2009 12:07:35 +0100 (CET)

> And what does the kernel do with this information and why are we not
> using the existing device/numa_node information ?

It's a different problem space, Thomas.

If the device lives on NUMA node X, we still end up wanting to
allocate memory resources (RX ring buffers) on other NUMA nodes on a
per-queue basis.

Otherwise a network card's forwarding performance is limited by the
memory bandwidth of a single NUMA node, and on multiqueue cards we
therefore fare much better by allocating each device RX queue's memory
resources on a different NUMA node.

It is this NUMA usage that PJ is trying to export somehow to userspace
so that irqbalanced and friends can choose the IRQ cpu masks more
intelligently.
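
As a sketch of that allocation pattern (all names hypothetical, not
taken from a real driver), each RX queue's ring would live on its own
node, with the matching vector hinted toward that node:

	/*
	 * Hypothetical (sketch): spread each RX queue's ring memory
	 * across the online nodes and hint the queue's MSI-X vector
	 * toward the node that owns its ring.
	 */
	static int my_alloc_rx_rings(struct my_adapter *adapter)
	{
		int i;

		for (i = 0; i < adapter->num_rx_queues; i++) {
			int node = i % num_online_nodes();

			adapter->rx_ring[i] =
				kzalloc_node(sizeof(struct my_rx_ring),
					     GFP_KERNEL, node);
			if (!adapter->rx_ring[i])
				return -ENOMEM;

			irq_set_node_affinity(adapter->msix_entries[i].vector,
					      cpumask_of_node(node));
		}
		return 0;
	}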


* Re: [PATCH v2] irq: Add node_affinity CPU masks for smarter irqbalance hints
  2009-11-24 17:57   ` David Miller
@ 2009-11-24 21:56     ` Thomas Gleixner
  2009-11-24 22:05       ` Peter P Waskiewicz Jr
  0 siblings, 1 reply; 7+ messages in thread
From: Thomas Gleixner @ 2009-11-24 21:56 UTC (permalink / raw)
  To: David Miller
  Cc: peter.p.waskiewicz.jr, linux-kernel, arjan, mingo, yong.zhang0, netdev

On Tue, 24 Nov 2009, David Miller wrote:

> From: Thomas Gleixner <tglx@linutronix.de>
> Date: Tue, 24 Nov 2009 12:07:35 +0100 (CET)
> 
> > And what does the kernel do with this information and why are we not
> > using the existing device/numa_node information ?
> 
> It's a different problem space, Thomas.
> 
> If the device lives on NUMA node X, we still end up wanting to
> allocate memory resources (RX ring buffers) on other NUMA nodes on a
> per-queue basis.
> 
> Otherwise a network card's forwarding performance is limited by the
> memory bandwidth of a single NUMA node, and on multiqueue cards we
> therefore fare much better by allocating each device RX queue's memory
> resources on a different NUMA node.
> 
> It is this NUMA usage that PJ is trying to export somehow to userspace
> so that irqbalanced and friends can choose the IRQ cpu masks more
> intelligently.

So you need preferred irq mask information on a per IRQ basis and
that mask is not restricted to the CPUs of a single NUMA node, right ?

Thanks,

	tglx





* Re: [PATCH v2] irq: Add node_affinity CPU masks for smarter irqbalance hints
  2009-11-24 21:56     ` Thomas Gleixner
@ 2009-11-24 22:05       ` Peter P Waskiewicz Jr
  2009-11-24 22:23         ` Thomas Gleixner
  0 siblings, 1 reply; 7+ messages in thread
From: Peter P Waskiewicz Jr @ 2009-11-24 22:05 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: David Miller, linux-kernel, arjan, mingo, yong.zhang0, netdev

On Tue, 2009-11-24 at 13:56 -0800, Thomas Gleixner wrote:
> On Tue, 24 Nov 2009, David Miller wrote:
> 
> > From: Thomas Gleixner <tglx@linutronix.de>
> > Date: Tue, 24 Nov 2009 12:07:35 +0100 (CET)
> > 
> > > And what does the kernel do with this information and why are we not
> > > using the existing device/numa_node information ?
> > 
> > It's a different problem space, Thomas.
> > 
> > If the device lives on NUMA node X, we still end up wanting to
> > allocate memory resources (RX ring buffers) on other NUMA nodes on a
> > per-queue basis.
> > 
> > Otherwise a network card's forwarding performance is limited by the
> > memory bandwidth of a single NUMA node, and on multiqueue cards we
> > therefore fare much better by allocating each device RX queue's memory
> > resources on a different NUMA node.
> > 
> > It is this NUMA usage that PJ is trying to export somehow to userspace
> > so that irqbalanced and friends can choose the IRQ cpu masks more
> > intelligently.
> 
> So you need preferred irq mask information on a per IRQ basis and
> that mask is not restricted to the CPUs of a single NUMA node, right ?
> 

Just to clarify, I need a preferred CPU mask on a per IRQ basis.  And
yes, that mask may not be restricted to the CPUs of a single NUMA node.
But in the normal case, the mask will be restricted to CPUs of a single
node.
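
Concretely (a sketch only, using the API from this patch), the normal
case would just pass cpumask_of_node(), while the multi-node case
could OR node masks together:

	cpumask_var_t span;

	/* Normal case: keep the IRQ on a single node's CPUs. */
	irq_set_node_affinity(irq, cpumask_of_node(node));

	/*
	 * Special case: a hint spanning nodes 0 and 1.  The mask is
	 * copied by irq_set_node_affinity(), so the temporary can be
	 * freed right away.
	 */
	if (alloc_cpumask_var(&span, GFP_KERNEL)) {
		cpumask_or(span, cpumask_of_node(0), cpumask_of_node(1));
		irq_set_node_affinity(irq, span);
		free_cpumask_var(span);
	}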

Cheers,
-PJ



* Re: [PATCH v2] irq: Add node_affinity CPU masks for smarter irqbalance hints
  2009-11-24 22:05       ` Peter P Waskiewicz Jr
@ 2009-11-24 22:23         ` Thomas Gleixner
  2009-11-30 17:24           ` Peter P Waskiewicz Jr
  0 siblings, 1 reply; 7+ messages in thread
From: Thomas Gleixner @ 2009-11-24 22:23 UTC (permalink / raw)
  To: Peter P Waskiewicz Jr
  Cc: David Miller, linux-kernel, arjan, mingo, yong.zhang0, netdev

On Tue, 24 Nov 2009, Peter P Waskiewicz Jr wrote:
> On Tue, 2009-11-24 at 13:56 -0800, Thomas Gleixner wrote:
> > On Tue, 24 Nov 2009, David Miller wrote:
> > 
> > > From: Thomas Gleixner <tglx@linutronix.de>
> > > Date: Tue, 24 Nov 2009 12:07:35 +0100 (CET)
> > > 
> > > > And what does the kernel do with this information and why are we not
> > > > using the existing device/numa_node information ?
> > > 
> > > It's a different problem space, Thomas.
> > > 
> > > If the device lives on NUMA node X, we still end up wanting to
> > > allocate memory resources (RX ring buffers) on other NUMA nodes on a
> > > per-queue basis.
> > > 
> > > Otherwise a network card's forwarding performance is limited by the
> > > memory bandwidth of a single NUMA node, and on multiqueue cards we
> > > therefore fare much better by allocating each device RX queue's memory
> > > resources on a different NUMA node.
> > > 
> > > It is this NUMA usage that PJ is trying to export somehow to userspace
> > > so that irqbalanced and friends can choose the IRQ cpu masks more
> > > intelligently.
> > 
> > So you need preferred irq mask information on a per IRQ basis and
> > that mask is not restricted to the CPUs of a single NUMA node, right ?
> > 
> Just to clarify, I need a preferred CPU mask on a per IRQ basis.  And
> yes, that mask may not be restricted to the CPUs of a single NUMA node.
> But in the normal case, the mask will be restricted to CPUs of a single
> node.

Right, but the normal case does not help much if we need to consider
the special case where multiple nodes are affected, which requires
another cpumask in irq_desc. That's what I really want to avoid.

I at least understand the exact problem you guys want to solve. Will
think more about it.

Thanks,

	tglx


* Re: [PATCH v2] irq: Add node_affinity CPU masks for smarter irqbalance hints
  2009-11-24 22:23         ` Thomas Gleixner
@ 2009-11-30 17:24           ` Peter P Waskiewicz Jr
  0 siblings, 0 replies; 7+ messages in thread
From: Peter P Waskiewicz Jr @ 2009-11-30 17:24 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: David Miller, linux-kernel, arjan, mingo, yong.zhang0, netdev

On Tue, 2009-11-24 at 14:23 -0800, Thomas Gleixner wrote:
> On Tue, 24 Nov 2009, Peter P Waskiewicz Jr wrote:
> > On Tue, 2009-11-24 at 13:56 -0800, Thomas Gleixner wrote:
> > > On Tue, 24 Nov 2009, David Miller wrote:
> > > 
> > > > From: Thomas Gleixner <tglx@linutronix.de>
> > > > Date: Tue, 24 Nov 2009 12:07:35 +0100 (CET)
> > > > 
> > > > > And what does the kernel do with this information and why are we not
> > > > > using the existing device/numa_node information ?
> > > > 
> > > > It's a different problem space, Thomas.
> > > > 
> > > > If the device lives on NUMA node X, we still end up wanting to
> > > > allocate memory resources (RX ring buffers) on other NUMA nodes on a
> > > > per-queue basis.
> > > > 
> > > > Otherwise a network card's forwarding performance is limited by the
> > > > memory bandwidth of a single NUMA node, and on multiqueue cards we
> > > > therefore fare much better by allocating each device RX queue's memory
> > > > resources on a different NUMA node.
> > > > 
> > > > It is this NUMA usage that PJ is trying to export somehow to userspace
> > > > so that irqbalanced and friends can choose the IRQ cpu masks more
> > > > intelligently.
> > > 
> > > So you need preferred irq mask information on a per IRQ basis and
> > > that mask is not restricted to the CPUs of a single NUMA node, right ?
> > > 
> > Just to clarify, I need a preferred CPU mask on a per IRQ basis.  And
> > yes, that mask may not be restricted to the CPUs of a single NUMA node.
> > But in the normal case, the mask will be restricted to CPUs of a single
> > node.
> 
> Right, but the normal case does not help much if we need to consider
> the special case where multiple nodes are affected, which requires
> another cpumask in irq_desc. That's what I really want to avoid.
> 
> I at least understand the exact problem you guys want to solve. Will
> think more about it.
> 

Just a friendly ping, Thomas.  Any progress on your thinking about this
proposal?

Cheers,
-PJ


