* [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints
@ 2009-11-23  6:46 Peter P Waskiewicz Jr
  2009-11-23  7:32   ` Yong Zhang
  0 siblings, 1 reply; 67+ messages in thread
From: Peter P Waskiewicz Jr @ 2009-11-23  6:46 UTC (permalink / raw)
  To: linux-kernel, arjan; +Cc: davem, netdev

This patchset adds a new CPU mask for SMP systems to the irq_desc
struct.  It also exposes an API for underlying device drivers to
assist irqbalance in making smarter decisions when balancing, especially
in a NUMA environment.  For example, an ethernet driver with MSI-X may
wish to limit the CPUs that an interrupt can be balanced within to
stay on a single NUMA node.  Current irqbalance operation can move the
interrupt off the node, resulting in cross-node memory accesses and
locks.

The API is a get/set API within the kernel, along with a /proc entry
for the interrupt.

Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
---

 include/linux/interrupt.h |    8 ++++++
 include/linux/irq.h       |    2 ++
 kernel/irq/manage.c       |   32 +++++++++++++++++++++++++
 kernel/irq/proc.c         |   57 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 99 insertions(+), 0 deletions(-)

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 75f3f00..9fd08aa 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -208,6 +208,8 @@ extern cpumask_var_t irq_default_affinity;
 extern int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask);
 extern int irq_can_set_affinity(unsigned int irq);
 extern int irq_select_affinity(unsigned int irq);
+extern int irq_set_node_affinity(unsigned int irq,
+                                 const struct cpumask *cpumask);
 
 #else /* CONFIG_SMP */
 
@@ -223,6 +225,12 @@ static inline int irq_can_set_affinity(unsigned int irq)
 
 static inline int irq_select_affinity(unsigned int irq)  { return 0; }
 
+static inline int irq_set_node_affinity(unsigned int irq,
+                                        const struct cpumask *m)
+{
+	return -EINVAL;
+}
+
 #endif /* CONFIG_SMP && CONFIG_GENERIC_HARDIRQS */
 
 #ifdef CONFIG_GENERIC_HARDIRQS
diff --git a/include/linux/irq.h b/include/linux/irq.h
index ae9653d..26d7d07 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -166,6 +166,7 @@ struct irq_2_iommu;
  * @lock:		locking for SMP
  * @affinity:		IRQ affinity on SMP
  * @node:		node index useful for balancing
+ * @node_affinity:	irq mask hints for irqbalance
  * @pending_mask:	pending rebalanced interrupts
  * @threads_active:	number of irqaction threads currently running
  * @wait_for_threads:	wait queue for sync_irq to wait for threaded handlers
@@ -196,6 +197,7 @@ struct irq_desc {
 #ifdef CONFIG_SMP
 	cpumask_var_t		affinity;
 	unsigned int		node;
+	cpumask_var_t		node_affinity;
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	cpumask_var_t		pending_mask;
 #endif
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 7305b29..9e80783 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -138,6 +138,38 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 	return 0;
 }
 
+/**
+ *	irq_set_node_affinity - Set the CPU mask this interrupt can run on
+ *	@irq:		Interrupt to modify
+ *	@cpumask:	CPU mask to assign to the interrupt
+ *
+ */
+int irq_set_node_affinity(unsigned int irq, const struct cpumask *cpumask)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+	unsigned long flags;
+
+	spin_lock_irqsave(&desc->lock, flags);
+	cpumask_copy(desc->node_affinity, cpumask);
+	spin_unlock_irqrestore(&desc->lock, flags);
+
+	return 0;
+}
+EXPORT_SYMBOL(irq_set_node_affinity);
+
+/**
+ *	irq_get_node_affinity - Get the CPU mask this interrupt can run on
+ *	@irq:		Interrupt to get information
+ *
+ */
+struct cpumask *irq_get_node_affinity(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	return desc->node_affinity;
+}
+EXPORT_SYMBOL(irq_get_node_affinity);
+
 #ifndef CONFIG_AUTO_IRQ_AFFINITY
 /*
  * Generic version of the affinity autoselector.
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 0832145..192e3fb 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -31,6 +31,16 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
+static int irq_node_affinity_proc_show(struct seq_file *m, void *v)
+{
+	struct irq_desc *desc = irq_to_desc((long)m->private);
+	const struct cpumask *mask = desc->node_affinity;
+
+	seq_cpumask(m, mask);
+	seq_putc(m, '\n');
+	return 0;
+}
+
 #ifndef is_affinity_mask_valid
 #define is_affinity_mask_valid(val) 1
 #endif
@@ -78,11 +88,46 @@ free_cpumask:
 	return err;
 }
 
+static ssize_t irq_node_affinity_proc_write(struct file *file,
+		const char __user *buffer, size_t count, loff_t *pos)
+{
+	unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data;
+	cpumask_var_t new_value;
+	int err;
+
+	if (no_irq_affinity || irq_balancing_disabled(irq))
+		return -EIO;
+
+	if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
+		return -ENOMEM;
+
+	err = cpumask_parse_user(buffer, count, new_value);
+	if (err)
+		goto free_cpumask;
+
+	if (!is_affinity_mask_valid(new_value)) {
+		err = -EINVAL;
+		goto free_cpumask;
+	}
+
+	irq_set_node_affinity(irq, new_value);
+	err = count;
+
+free_cpumask:
+	free_cpumask_var(new_value);
+	return err;
+}
+
 static int irq_affinity_proc_open(struct inode *inode, struct file *file)
 {
 	return single_open(file, irq_affinity_proc_show, PDE(inode)->data);
 }
 
+static int irq_node_affinity_proc_open(struct inode *inode, struct file *f)
+{
+	return single_open(f, irq_node_affinity_proc_show, PDE(inode)->data);
+}
+
 static const struct file_operations irq_affinity_proc_fops = {
 	.open		= irq_affinity_proc_open,
 	.read		= seq_read,
@@ -91,6 +136,14 @@ static const struct file_operations irq_affinity_proc_fops = {
 	.write		= irq_affinity_proc_write,
 };
 
+static const struct file_operations irq_node_affinity_proc_fops = {
+	.open		= irq_node_affinity_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.write		= irq_node_affinity_proc_write,
+};
+
 static int default_affinity_show(struct seq_file *m, void *v)
 {
 	seq_cpumask(m, irq_default_affinity);
@@ -230,6 +283,10 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
 	/* create /proc/irq/<irq>/smp_affinity */
 	proc_create_data("smp_affinity", 0600, desc->dir,
 			 &irq_affinity_proc_fops, (void *)(long)irq);
+
+	/* create /proc/irq/<irq>/node_affinity */
+	proc_create_data("node_affinity", 0600, desc->dir,
+	                 &irq_node_affinity_proc_fops, (void *)(long)irq);
 #endif
 
 	proc_create_data("spurious", 0444, desc->dir,


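For illustration of the API described above (this sketch is not part of the patch; the function name, adapter handle, and vector array are hypothetical), an MSI-X driver might hint its local NUMA node roughly like this:

#include <linux/interrupt.h>
#include <linux/pci.h>
#include <linux/topology.h>

/*
 * Hypothetical driver init path: hint that each MSI-X vector should be
 * balanced within the CPUs of the device's local NUMA node.  Assumes the
 * node_affinity patch above is applied.
 */
static void example_hint_node_affinity(struct pci_dev *pdev,
				       const unsigned int *irqs, int nvec)
{
	const struct cpumask *mask = cpumask_of_node(dev_to_node(&pdev->dev));
	int i;

	for (i = 0; i < nvec; i++)
		irq_set_node_affinity(irqs[i], mask);
}

irqbalance (or an administrator, via /proc/irq/<irq>/node_affinity) would then balance each vector within that mask.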

* Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance  hints
  2009-11-23  6:46 [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints Peter P Waskiewicz Jr
@ 2009-11-23  7:32   ` Yong Zhang
  0 siblings, 0 replies; 67+ messages in thread
From: Yong Zhang @ 2009-11-23  7:32 UTC (permalink / raw)
  To: Peter P Waskiewicz Jr; +Cc: linux-kernel, arjan, davem, netdev

On Mon, Nov 23, 2009 at 2:46 PM, Peter P Waskiewicz Jr
<peter.p.waskiewicz.jr@intel.com> wrote:
> This patchset adds a new CPU mask for SMP systems to the irq_desc
> struct.  It also exposes an API for underlying device drivers to
> assist irqbalance in making smarter decisions when balancing, especially
> in a NUMA environment.  For example, an ethernet driver with MSI-X may
> wish to limit the CPUs that an interrupt can be balanced within to
> stay on a single NUMA node.  Current irqbalance operation can move the
> interrupt off the node, resulting in cross-node memory accesses and
> locks.
>
> The API is a get/set API within the kernel, along with a /proc entry
> for the interrupt.
>
> Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
> ---

1) I think you should consider CONFIG_CPUMASK_OFFSTACK which will affect
   node_affinity.
2) It seems like this patch can't work with SPARSE_IRQ.
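Point 1 above refers to CONFIG_CPUMASK_OFFSTACK=y, where a cpumask_var_t is a pointer rather than embedded storage; the patch cpumask_copy()s into desc->node_affinity without ever allocating it.  A minimal sketch of the missing allocation (placing it next to the existing desc->affinity allocation is an assumption here):

#include <linux/irq.h>
#include <linux/cpumask.h>
#include <linux/gfp.h>

static bool example_alloc_node_affinity(struct irq_desc *desc, int node)
{
	/* With CONFIG_CPUMASK_OFFSTACK=y this allocation is mandatory
	 * before irq_set_node_affinity() may copy into the mask. */
	if (!alloc_cpumask_var_node(&desc->node_affinity, GFP_KERNEL, node))
		return false;
	cpumask_setall(desc->node_affinity);	/* default: no restriction */
	return true;
}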

Thanks,
Yong



* Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints
  2009-11-23  7:32   ` Yong Zhang
@ 2009-11-23  9:36   ` Peter P Waskiewicz Jr
  2009-11-23 10:21     ` ixgbe question Eric Dumazet
                       ` (2 more replies)
  -1 siblings, 3 replies; 67+ messages in thread
From: Peter P Waskiewicz Jr @ 2009-11-23  9:36 UTC (permalink / raw)
  To: Yong Zhang; +Cc: linux-kernel, arjan, davem, netdev

On Sun, 2009-11-22 at 23:32 -0800, Yong Zhang wrote:
> On Mon, Nov 23, 2009 at 2:46 PM, Peter P Waskiewicz Jr
> <peter.p.waskiewicz.jr@intel.com> wrote:
> > This patchset adds a new CPU mask for SMP systems to the irq_desc
> > struct.  It also exposes an API for underlying device drivers to
> > assist irqbalance in making smarter decisions when balancing, especially
> > in a NUMA environment.  For example, an ethernet driver with MSI-X may
> > wish to limit the CPUs that an interrupt can be balanced within to
> > stay on a single NUMA node.  Current irqbalance operation can move the
> > interrupt off the node, resulting in cross-node memory accesses and
> > locks.
> >
> > The API is a get/set API within the kernel, along with a /proc entry
> > for the interrupt.
> >
> > Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
> > ---
> 
> 1) I think you should consider CONFIG_CPUMASK_OFFSTACK which will affect
>    node_affinity.
> 2) It seems like this patch can't work with SPARSE_IRQ.

This mechanism isn't going to be used by any internal kernel mechanism
for determining interrupt placement or operation.  It's purely something
that either a driver or an external script (through /proc) can modify,
and that irqbalance will make use of.  If irqbalance isn't running, or the
current version of irqbalance doesn't support reading node_affinity,
then it won't affect the system's operation.

If irqbalance does support it, it'll read whatever mask is supplied,
and will then try to balance interrupts within that mask.  It will bail
if the mask is invalid or doesn't apply to the running system, just as
a bogus mask written into smp_affinity is ignored.
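As a concrete illustration of the /proc side (the IRQ number and mask below are purely illustrative, and the file only exists with the node_affinity patch applied), a minimal userspace sketch:

#include <stdio.h>

int main(void)
{
	/* Hint that IRQ 117 should be balanced within CPUs 0-3 (mask 0xf). */
	FILE *f = fopen("/proc/irq/117/node_affinity", "w");

	if (!f) {
		perror("node_affinity");
		return 1;
	}
	fprintf(f, "0000000f\n");
	return fclose(f) ? 1 : 0;
}

irqbalance would read back the same file and restrict its placement decisions to that mask.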

If there's something I'm missing beyond this with the two suggestions
you've made (I looked into those two parameters and tried to draw
conclusions), please let me know.

Cheers,
-PJ Waskiewicz



* ixgbe question
  2009-11-23  9:36   ` Peter P Waskiewicz Jr
@ 2009-11-23 10:21     ` Eric Dumazet
  2009-11-23 10:30       ` Badalian Vyacheslav
                         ` (2 more replies)
  2009-11-23 17:05     ` [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints Peter Zijlstra
  2009-11-24  5:17       ` Yong Zhang
  2 siblings, 3 replies; 67+ messages in thread
From: Eric Dumazet @ 2009-11-23 10:21 UTC (permalink / raw)
  To: Peter P Waskiewicz Jr; +Cc: Linux Netdev List

Hi Peter

I tried a pktgen stress on 82599EB card and could not split RX load on multiple cpus.

Setup is :

One 82599 card with fiber0 looped to fiber1, 10Gb link mode.
machine is a HPDL380 G6 with dual quadcore E5530 @2.4GHz (16 logical cpus)

I use one pktgen thread sending on fiber0 to many dst IPs, and checked that fiber1
was using many RX queues:

grep fiber1 /proc/interrupts 
117:       1301      13060          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-0
118:        601       1402          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-1
119:        634        832          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-2
120:        601       1303          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-3
121:        620       1246          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-4
122:       1287      13088          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-5
123:        606       1354          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-6
124:        653        827          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-7
125:        639        825          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-8
126:        596       1199          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-9
127:       2013      24800          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-10
128:        648       1353          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-11
129:        601       1123          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-12
130:        625        834          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-13
131:        665       1409          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-14
132:       2637      31699          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-15
133:          1          0          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1:lsc



But only one CPU (CPU1) had a softirq running, 100%, and many frames were dropped

root@demodl380g6:/usr/src# ifconfig fiber0
fiber0    Link encap:Ethernet  HWaddr 00:1b:21:4a:fe:54  
          UP BROADCAST RUNNING MULTICAST  MTU:1500  Metric:1
          Packets reçus:4 erreurs:0 :0 overruns:0 frame:0
          TX packets:309291576 errors:0 dropped:0 overruns:0 carrier:0
          collisions:0 lg file transmission:1000 
          Octets reçus:1368 (1.3 KB) Octets transmis:18557495682 (18.5 GB)

root@demodl380g6:/usr/src# ifconfig fiber1
fiber1    Link encap:Ethernet  HWaddr 00:1b:21:4a:fe:55  
          UP BROADCAST RUNNING MULTICAST  MTU:1500  Metric:1
          Packets reçus:55122164 erreurs:0 :254169411 overruns:0 frame:0
          TX packets:4 errors:0 dropped:0 overruns:0 carrier:0
          collisions:0 lg file transmission:1000 
          Octets reçus:3307330968 (3.3 GB) Octets transmis:1368 (1.3 KB)


How and when can multiqueue RX really start to use several CPUs?

Thanks
Eric


pktgen script :

pgset()
{
    local result

    echo $1 > $PGDEV

    result=`cat $PGDEV | fgrep "Result: OK:"`
    if [ "$result" = "" ]; then
         cat $PGDEV | fgrep Result:
    fi
}

pg()
{
    echo inject > $PGDEV
    cat $PGDEV
}


PGDEV=/proc/net/pktgen/kpktgend_4

 echo "Adding fiber0"
 pgset "add_device fiber0@0"


CLONE_SKB="clone_skb 15"

PKT_SIZE="pkt_size 60"


COUNT="count 100000000"
DELAY="delay 0"

PGDEV=/proc/net/pktgen/fiber0@0
  echo "Configuring $PGDEV"
 pgset "$COUNT"
 pgset "$CLONE_SKB"
 pgset "$PKT_SIZE"
 pgset "$DELAY"
 pgset "queue_map_min 0"
 pgset "queue_map_max 7"
 pgset "dst_min 192.168.0.2"
 pgset "dst_max 192.168.0.250"
 pgset "src_min 192.168.0.1"
 pgset "src_max 192.168.0.1"
 pgset "dst_mac  00:1b:21:4a:fe:55"


# Time to run
PGDEV=/proc/net/pktgen/pgctrl

 echo "Running... ctrl^C to stop"
 pgset "start" 
 echo "Done"

# Result can be viewed in /proc/net/pktgen/fiber0@0

for f in fiber0@0
do
 cat /proc/net/pktgen/$f
done




* Re: ixgbe question
  2009-11-23 10:21     ` ixgbe question Eric Dumazet
@ 2009-11-23 10:30       ` Badalian Vyacheslav
  2009-11-23 10:34       ` Waskiewicz Jr, Peter P
  2009-11-23 14:10       ` Jesper Dangaard Brouer
  2 siblings, 0 replies; 67+ messages in thread
From: Badalian Vyacheslav @ 2009-11-23 10:30 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Peter P Waskiewicz Jr, Linux Netdev List

Hello Eric. I have played with this card for 3 weeks, so maybe this will help you :)

By default the Intel flow handling uses only the first CPU. It's strange.
If we set the affinity of an interrupt to a single CPU core, it will use that CPU core.
If we set the affinity to two or more CPUs, it is applied but doesn't work.
See the ixgbe driver README from intel.com; it has a parameter for the RSS flows, which I think controls this :)
Also, the driver from intel.com has a script to split RxTx queues across CPU cores, but you must replace "tx rx" in the code with "TxRx".

P.S. Please also take a look at this, if you can and want to:
On e1000 with an x86 kernel + 2x 2-core Xeon, my TC rules load in 3 minutes.
On ixgbe with an x86_64 kernel + 4x 6-core Xeon, my TC rules take more than 15 minutes to load!
Is this a 64-bit regression?

I can send you the TC rules if you ask for them! Thanks!

Slavon


> Hi Peter
> 
> I tried a pktgen stress on 82599EB card and could not split RX load on multiple cpus.
> 
> Setup is :
> 
> One 82599 card with fiber0 looped to fiber1, 10Gb link mode.
> machine is a HPDL380 G6 with dual quadcore E5530 @2.4GHz (16 logical cpus)
> 
> I use one pktgen thread sending to fiber0 one many dst IP, and checked that fiber1
> was using many RX queues :
> 
> grep fiber1 /proc/interrupts 
> 117:       1301      13060          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-0
> 118:        601       1402          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-1
> 119:        634        832          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-2
> 120:        601       1303          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-3
> 121:        620       1246          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-4
> 122:       1287      13088          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-5
> 123:        606       1354          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-6
> 124:        653        827          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-7
> 125:        639        825          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-8
> 126:        596       1199          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-9
> 127:       2013      24800          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-10
> 128:        648       1353          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-11
> 129:        601       1123          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-12
> 130:        625        834          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-13
> 131:        665       1409          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-14
> 132:       2637      31699          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-15
> 133:          1          0          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1:lsc
> 
> 
> 
> But only one CPU (CPU1) had a softirq running, 100%, and many frames were dropped
> 
> root@demodl380g6:/usr/src# ifconfig fiber0
> fiber0    Link encap:Ethernet  HWaddr 00:1b:21:4a:fe:54  
>           UP BROADCAST RUNNING MULTICAST  MTU:1500  Metric:1
>           Packets reçus:4 erreurs:0 :0 overruns:0 frame:0
>           TX packets:309291576 errors:0 dropped:0 overruns:0 carrier:0
>           collisions:0 lg file transmission:1000 
>           Octets reçus:1368 (1.3 KB) Octets transmis:18557495682 (18.5 GB)
> 
> root@demodl380g6:/usr/src# ifconfig fiber1
> fiber1    Link encap:Ethernet  HWaddr 00:1b:21:4a:fe:55  
>           UP BROADCAST RUNNING MULTICAST  MTU:1500  Metric:1
>           Packets reçus:55122164 erreurs:0 :254169411 overruns:0 frame:0
>           TX packets:4 errors:0 dropped:0 overruns:0 carrier:0
>           collisions:0 lg file transmission:1000 
>           Octets reçus:3307330968 (3.3 GB) Octets transmis:1368 (1.3 KB)
> 
> 
> How and when multi queue rx can really start to use several cpus ?
> 
> Thanks
> Eric
> 
> 
> pktgen script :
> 
> pgset()
> {
>     local result
> 
>     echo $1 > $PGDEV
> 
>     result=`cat $PGDEV | fgrep "Result: OK:"`
>     if [ "$result" = "" ]; then
>          cat $PGDEV | fgrep Result:
>     fi
> }
> 
> pg()
> {
>     echo inject > $PGDEV
>     cat $PGDEV
> }
> 
> 
> PGDEV=/proc/net/pktgen/kpktgend_4
> 
>  echo "Adding fiber0"
>  pgset "add_device fiber0@0"
> 
> 
> CLONE_SKB="clone_skb 15"
> 
> PKT_SIZE="pkt_size 60"
> 
> 
> COUNT="count 100000000"
> DELAY="delay 0"
> 
> PGDEV=/proc/net/pktgen/fiber0@0
>   echo "Configuring $PGDEV"
>  pgset "$COUNT"
>  pgset "$CLONE_SKB"
>  pgset "$PKT_SIZE"
>  pgset "$DELAY"
>  pgset "queue_map_min 0"
>  pgset "queue_map_max 7"
>  pgset "dst_min 192.168.0.2"
>  pgset "dst_max 192.168.0.250"
>  pgset "src_min 192.168.0.1"
>  pgset "src_max 192.168.0.1"
>  pgset "dst_mac  00:1b:21:4a:fe:55"
> 
> 
> # Time to run
> PGDEV=/proc/net/pktgen/pgctrl
> 
>  echo "Running... ctrl^C to stop"
>  pgset "start" 
>  echo "Done"
> 
> # Result can be vieved in /proc/net/pktgen/fiber0@0
> 
> for f in fiber0@0
> do
>  cat /proc/net/pktgen/$f
> done
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 



* Re: ixgbe question
  2009-11-23 10:21     ` ixgbe question Eric Dumazet
  2009-11-23 10:30       ` Badalian Vyacheslav
@ 2009-11-23 10:34       ` Waskiewicz Jr, Peter P
  2009-11-23 10:37         ` Eric Dumazet
  2009-11-23 14:10       ` Jesper Dangaard Brouer
  2 siblings, 1 reply; 67+ messages in thread
From: Waskiewicz Jr, Peter P @ 2009-11-23 10:34 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Waskiewicz Jr, Peter P, Linux Netdev List


On Mon, 23 Nov 2009, Eric Dumazet wrote:

> Hi Peter
> 
> I tried a pktgen stress on 82599EB card and could not split RX load on multiple cpus.
> 
> Setup is :
> 
> One 82599 card with fiber0 looped to fiber1, 10Gb link mode.
> machine is a HPDL380 G6 with dual quadcore E5530 @2.4GHz (16 logical cpus)

Can you specify kernel version and driver version?

> 
> I use one pktgen thread sending to fiber0 one many dst IP, and checked that fiber1
> was using many RX queues :
> 
> grep fiber1 /proc/interrupts 
> 117:       1301      13060          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-0
> 118:        601       1402          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-1
> 119:        634        832          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-2
> 120:        601       1303          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-3
> 121:        620       1246          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-4
> 122:       1287      13088          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-5
> 123:        606       1354          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-6
> 124:        653        827          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-7
> 125:        639        825          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-8
> 126:        596       1199          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-9
> 127:       2013      24800          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-10
> 128:        648       1353          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-11
> 129:        601       1123          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-12
> 130:        625        834          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-13
> 131:        665       1409          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-14
> 132:       2637      31699          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-15
> 133:          1          0          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1:lsc
> 
> 
> 
> But only one CPU (CPU1) had a softirq running, 100%, and many frames were dropped
> 
> root@demodl380g6:/usr/src# ifconfig fiber0
> fiber0    Link encap:Ethernet  HWaddr 00:1b:21:4a:fe:54  
>           UP BROADCAST RUNNING MULTICAST  MTU:1500  Metric:1
>           Packets reçus:4 erreurs:0 :0 overruns:0 frame:0
>           TX packets:309291576 errors:0 dropped:0 overruns:0 carrier:0
>           collisions:0 lg file transmission:1000 
>           Octets reçus:1368 (1.3 KB) Octets transmis:18557495682 (18.5 GB)
> 
> root@demodl380g6:/usr/src# ifconfig fiber1
> fiber1    Link encap:Ethernet  HWaddr 00:1b:21:4a:fe:55  
>           UP BROADCAST RUNNING MULTICAST  MTU:1500  Metric:1
>           Packets reçus:55122164 erreurs:0 :254169411 overruns:0 frame:0
>           TX packets:4 errors:0 dropped:0 overruns:0 carrier:0
>           collisions:0 lg file transmission:1000 
>           Octets reçus:3307330968 (3.3 GB) Octets transmis:1368 (1.3 KB)

I stay in the states too much.  I love seeing net stats in French.  :-)

> 
> 
> How and when multi queue rx can really start to use several cpus ?

If you're sending one flow to many consumers, it's still one flow.  Even 
using RSS won't help, since it requires differing flows to spread load  
(5-tuple matches for flow distribution).
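To illustrate why (a toy hash only, not the Toeplitz-based RSS hash the 82599 actually implements): the RX queue is derived from a hash over the 5-tuple, so every packet of a single flow maps to the same queue no matter how many queues exist.

#include <stdint.h>

struct five_tuple {
	uint32_t saddr, daddr;
	uint16_t sport, dport;
	uint8_t  proto;
};

/* Identical tuples always hash identically, hence one flow -> one queue. */
static unsigned int rx_queue_for(const struct five_tuple *ft,
				 unsigned int nqueues)
{
	uint32_t h = ft->saddr ^ ft->daddr ^
		     ((uint32_t)ft->sport << 16) ^ ft->dport ^ ft->proto;

	h ^= h >> 16;		/* simple integer mixing */
	h *= 0x45d9f3bU;
	h ^= h >> 16;

	return h % nqueues;
}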

Cheers,
-PJ


* Re: ixgbe question
  2009-11-23 10:34       ` Waskiewicz Jr, Peter P
@ 2009-11-23 10:37         ` Eric Dumazet
  2009-11-23 14:05           ` Eric Dumazet
  2009-11-23 21:26           ` David Miller
  0 siblings, 2 replies; 67+ messages in thread
From: Eric Dumazet @ 2009-11-23 10:37 UTC (permalink / raw)
  To: Waskiewicz Jr, Peter P; +Cc: Linux Netdev List

Waskiewicz Jr, Peter P a écrit :
> On Mon, 23 Nov 2009, Eric Dumazet wrote:
> 
>> Hi Peter
>>
>> I tried a pktgen stress on 82599EB card and could not split RX load on multiple cpus.
>>
>> Setup is :
>>
>> One 82599 card with fiber0 looped to fiber1, 10Gb link mode.
>> machine is a HPDL380 G6 with dual quadcore E5530 @2.4GHz (16 logical cpus)
> 
> Can you specify kernel version and driver version?


Well, I forgot to mention I am only working with net-next-2.6 tree.

Ubuntu 9.10 kernel (Fedora Core 12 installer was not able to recognize disks on this machine :( )

ixgbe: Intel(R) 10 Gigabit PCI Express Network Driver - version 2.0.44-k2


> 
>> I use one pktgen thread sending to fiber0 one many dst IP, and checked that fiber1
>> was using many RX queues :
>>
>> grep fiber1 /proc/interrupts 
>> 117:       1301      13060          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-0
>> 118:        601       1402          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-1
>> 119:        634        832          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-2
>> 120:        601       1303          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-3
>> 121:        620       1246          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-4
>> 122:       1287      13088          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-5
>> 123:        606       1354          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-6
>> 124:        653        827          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-7
>> 125:        639        825          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-8
>> 126:        596       1199          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-9
>> 127:       2013      24800          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-10
>> 128:        648       1353          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-11
>> 129:        601       1123          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-12
>> 130:        625        834          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-13
>> 131:        665       1409          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-14
>> 132:       2637      31699          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1-TxRx-15
>> 133:          1          0          0          0          0          0          0          0          0          0          0          0          0          0          0          0   PCI-MSI-edge      fiber1:lsc
>>
>>
>>
>> But only one CPU (CPU1) had a softirq running, 100%, and many frames were dropped
>>
>> root@demodl380g6:/usr/src# ifconfig fiber0
>> fiber0    Link encap:Ethernet  HWaddr 00:1b:21:4a:fe:54  
>>           UP BROADCAST RUNNING MULTICAST  MTU:1500  Metric:1
>>           Packets reçus:4 erreurs:0 :0 overruns:0 frame:0
>>           TX packets:309291576 errors:0 dropped:0 overruns:0 carrier:0
>>           collisions:0 lg file transmission:1000 
>>           Octets reçus:1368 (1.3 KB) Octets transmis:18557495682 (18.5 GB)
>>
>> root@demodl380g6:/usr/src# ifconfig fiber1
>> fiber1    Link encap:Ethernet  HWaddr 00:1b:21:4a:fe:55  
>>           UP BROADCAST RUNNING MULTICAST  MTU:1500  Metric:1
>>           Packets reçus:55122164 erreurs:0 :254169411 overruns:0 frame:0
>>           TX packets:4 errors:0 dropped:0 overruns:0 carrier:0
>>           collisions:0 lg file transmission:1000 
>>           Octets reçus:3307330968 (3.3 GB) Octets transmis:1368 (1.3 KB)
> 
> I stay in the states too much.  I love seeing net stats in French.  :-)

Ok :)

> 
>>
>> How and when multi queue rx can really start to use several cpus ?
> 
> If you're sending one flow to many consumers, it's still one flow.  Even 
> using RSS won't help, since it requires differing flows to spread load  
> (5-tuple matches for flow distribution).

Hm... I can try varying both src and dst on my pktgen test.

Thanks


* Re: ixgbe question
  2009-11-23 10:37         ` Eric Dumazet
@ 2009-11-23 14:05           ` Eric Dumazet
  2009-11-23 21:26           ` David Miller
  1 sibling, 0 replies; 67+ messages in thread
From: Eric Dumazet @ 2009-11-23 14:05 UTC (permalink / raw)
  To: Waskiewicz Jr, Peter P; +Cc: Linux Netdev List

Eric Dumazet a écrit :
> Waskiewicz Jr, Peter P a écrit :
>> On Mon, 23 Nov 2009, Eric Dumazet wrote:
>>
>>> Hi Peter
>>>
>>> I tried a pktgen stress on 82599EB card and could not split RX load on multiple cpus.
>>>
>>> Setup is :
>>>
>>> One 82599 card with fiber0 looped to fiber1, 10Gb link mode.
>>> machine is a HPDL380 G6 with dual quadcore E5530 @2.4GHz (16 logical cpus)
>> Can you specify kernel version and driver version?
> 
> 
> Well, I forgot to mention I am only working with net-next-2.6 tree.
> 
> Ubuntu 9.10 kernel (Fedora Core 12 installer was not able to recognize disks on this machine :( )
> 
> ixgbe: Intel(R) 10 Gigabit PCI Express Network Driver - version 2.0.44-k2
> 
> 

I tried with several pktgen threads, no success so far.

Only one CPU handles all interrupts, and ksoftirqd enters
a mode with no escape back to split processing.

To get real multi queue and uncontended handling, I had to force :

echo 1 >`echo /proc/irq/*/fiber1-TxRx-0/../smp_affinity` 
echo 2 >`echo /proc/irq/*/fiber1-TxRx-1/../smp_affinity`
echo 4 >`echo /proc/irq/*/fiber1-TxRx-2/../smp_affinity`
echo 8 >`echo /proc/irq/*/fiber1-TxRx-3/../smp_affinity`
echo 10 >`echo /proc/irq/*/fiber1-TxRx-4/../smp_affinity`
echo 20 >`echo /proc/irq/*/fiber1-TxRx-5/../smp_affinity`
echo 40 >`echo /proc/irq/*/fiber1-TxRx-6/../smp_affinity`
echo 80 >`echo /proc/irq/*/fiber1-TxRx-7/../smp_affinity`
echo 100 >`echo /proc/irq/*/fiber1-TxRx-8/../smp_affinity`
echo 200 >`echo /proc/irq/*/fiber1-TxRx-9/../smp_affinity`
echo 400 >`echo /proc/irq/*/fiber1-TxRx-10/../smp_affinity`
echo 800 >`echo /proc/irq/*/fiber1-TxRx-11/../smp_affinity`
echo 1000 >`echo /proc/irq/*/fiber1-TxRx-12/../smp_affinity`
echo 2000 >`echo /proc/irq/*/fiber1-TxRx-13/../smp_affinity`
echo 4000 >`echo /proc/irq/*/fiber1-TxRx-14/../smp_affinity`
echo 8000 >`echo /proc/irq/*/fiber1-TxRx-15/../smp_affinity`


The problem probably comes from the fact that when ksoftirqd runs and
the RX queues are never depleted, no hardware interrupt is sent,
and the NAPI contexts stay stuck on one CPU forever?
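For reference, a NAPI poll handler has roughly this shape (a sketch; example_clean_rx() and example_enable_irq() are hypothetical stand-ins).  As long as a full budget of packets keeps arriving, napi_complete() is never called, the interrupt stays masked, and polling stays on the same CPU:

#include <linux/netdevice.h>

static int example_clean_rx(struct napi_struct *napi, int budget);
static void example_enable_irq(struct napi_struct *napi);

static int example_poll(struct napi_struct *napi, int budget)
{
	int done = example_clean_rx(napi, budget);	/* hypothetical RX clean */

	if (done < budget) {
		/* Queue drained: stop polling and re-enable the interrupt. */
		napi_complete(napi);
		example_enable_irq(napi);		/* hypothetical */
	}
	return done;
}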



* Re: ixgbe question
  2009-11-23 10:21     ` ixgbe question Eric Dumazet
  2009-11-23 10:30       ` Badalian Vyacheslav
  2009-11-23 10:34       ` Waskiewicz Jr, Peter P
@ 2009-11-23 14:10       ` Jesper Dangaard Brouer
  2009-11-23 14:38         ` Eric Dumazet
  2 siblings, 1 reply; 67+ messages in thread
From: Jesper Dangaard Brouer @ 2009-11-23 14:10 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Peter P Waskiewicz Jr, Linux Netdev List


On Mon, 23 Nov 2009, Eric Dumazet wrote:

> I tried a pktgen stress on 82599EB card and could not split RX load on multiple cpus.
>
> Setup is :
>
> One 82599 card with fiber0 looped to fiber1, 10Gb link mode.
> machine is a HPDL380 G6 with dual quadcore E5530 @2.4GHz (16 logical cpus)
>
> I use one pktgen thread sending to fiber0 one many dst IP, and checked that fiber1
> was using many RX queues :

How are your smp_affinity masks set?

grep . /proc/irq/*/fiber1-*/../smp_affinity


> But only one CPU (CPU1) had a softirq running, 100%, and many frames were dropped

Just a hint, I use 'ethtool -S fiber1' to see how the packets get
distributed across the rx and tx queues.



> CLONE_SKB="clone_skb 15"

Be careful with too high a clone_skb, as my experience is it will send a burst of
clone_skb packets before the packet gets randomized again.


> pgset "dst_min 192.168.0.2"
> pgset "dst_max 192.168.0.250"
> pgset "src_min 192.168.0.1"
> pgset "src_max 192.168.0.1"
> pgset "dst_mac  00:1b:21:4a:fe:55"

To get packets randomized across RX queues, I used:

     echo "- Random UDP source port min:$min - max:$max"
     pgset "flag UDPSRC_RND"
     pgset "udp_src_min $min"
     pgset "udp_src_max $max"

Ahh.. I think you are missing:

  pgset "flag IPDST_RND"


Cheers,
   Jesper Brouer

--
-------------------------------------------------------------------
MSc. Master of Computer Science
Dept. of Computer Science, University of Copenhagen
Author of http://www.adsl-optimizer.dk
-------------------------------------------------------------------


* Re: ixgbe question
  2009-11-23 14:10       ` Jesper Dangaard Brouer
@ 2009-11-23 14:38         ` Eric Dumazet
  2009-11-23 18:30           ` robert
  0 siblings, 1 reply; 67+ messages in thread
From: Eric Dumazet @ 2009-11-23 14:38 UTC (permalink / raw)
  To: Jesper Dangaard Brouer; +Cc: Peter P Waskiewicz Jr, Linux Netdev List

Jesper Dangaard Brouer a écrit :

> How is your smp_affinity mask's set?
> 
> grep . /proc/irq/*/fiber1-*/../smp_affinity

First, I tried default affinities (ffff)

Then I tried irqbalance... no more success.

The driver seems to handle all queues on one CPU under low traffic,
and possibly switches dynamically to a multi-CPU mode,
but as all interrupts are masked, we stay in
a NAPI context handling all queues.

And we leave one CPU in flood/drop mode.




> 
> 
>> But only one CPU (CPU1) had a softirq running, 100%, and many frames
>> were dropped
> 
> Just a hint, I use 'ethtool -S fiber1' to see how the packets gets
> distributed across the rx and tx queues.

They are correctly distributed

     rx_queue_0_packets: 14119644
     rx_queue_0_bytes: 847178640
     rx_queue_1_packets: 14126315
     rx_queue_1_bytes: 847578900
     rx_queue_2_packets: 14115249
     rx_queue_2_bytes: 846914940
     rx_queue_3_packets: 14118146
     rx_queue_3_bytes: 847088760
     rx_queue_4_packets: 14130869
     rx_queue_4_bytes: 847853268
     rx_queue_5_packets: 14112239
     rx_queue_5_bytes: 846734340
     rx_queue_6_packets: 14128425
     rx_queue_6_bytes: 847705500
     rx_queue_7_packets: 14110587
     rx_queue_7_bytes: 846635220
     rx_queue_8_packets: 14117350
     rx_queue_8_bytes: 847041000
     rx_queue_9_packets: 14125992
     rx_queue_9_bytes: 847559520
     rx_queue_10_packets: 14121732
     rx_queue_10_bytes: 847303920
     rx_queue_11_packets: 14120997
     rx_queue_11_bytes: 847259820
     rx_queue_12_packets: 14125576
     rx_queue_12_bytes: 847535854
     rx_queue_13_packets: 14118512
     rx_queue_13_bytes: 847110720
     rx_queue_14_packets: 14118348
     rx_queue_14_bytes: 847100880
     rx_queue_15_packets: 14118647
     rx_queue_15_bytes: 847118820



> 
> 
> 
>> CLONE_SKB="clone_skb 15"
> 
> Be careful with to high clone, as my experience is it will send a burst
> of clone_skb packets before the packet gets randomized again.

Yes, but 15 should be ok with 10Gb link  :)

Thanks


* Re: ixgbe question
  2009-11-23 18:30           ` robert
@ 2009-11-23 16:59             ` Eric Dumazet
  2009-11-23 20:54               ` robert
  2009-11-23 23:28               ` Waskiewicz Jr, Peter P
  0 siblings, 2 replies; 67+ messages in thread
From: Eric Dumazet @ 2009-11-23 16:59 UTC (permalink / raw)
  To: robert; +Cc: Jesper Dangaard Brouer, Peter P Waskiewicz Jr, Linux Netdev List

robert@herjulf.net a écrit :
> Eric Dumazet writes:
> 
>  > Jesper Dangaard Brouer a écrit :
>  > 
>  > > How is your smp_affinity mask's set?
>  > > 
>  > > grep . /proc/irq/*/fiber1-*/../smp_affinity
>  > 
> 
>  Weird... set clone_skb to 1 to be sure and vary dst or something so 
>  the HW classifier selects different queues and with proper RX affinty. 
>  
>  You should see in /proc/net/softnet_stat something like:
> 
> 012a7bb9 00000000 000000ae 00000000 00000000 00000000 00000000 00000000 00000000
> 01288d4c 00000000 00000049 00000000 00000000 00000000 00000000 00000000 00000000
> 0128fe28 00000000 00000043 00000000 00000000 00000000 00000000 00000000 00000000
> 01295387 00000000 00000047 00000000 00000000 00000000 00000000 00000000 00000000
> 0129a722 00000000 0000004a 00000000 00000000 00000000 00000000 00000000 00000000
> 0128c5e4 00000000 00000046 00000000 00000000 00000000 00000000 00000000 00000000
> 0128f718 00000000 00000043 00000000 00000000 00000000 00000000 00000000 00000000
> 012993e3 00000000 0000004a 00000000 00000000 00000000 00000000 00000000 00000000
> 

clone_skb set to 1: this changes nothing but slows down pktgen (obviously)

Result: OK: 117614452(c117608705+d5746) nsec, 100000000 (60byte,0frags)
  850235pps 408Mb/sec (408112800bps) errors: 0

All RX processing of 16 RX queues done by CPU 1 only.


# cat  /proc/net/softnet_stat  ; sleep 2 ; echo "--------------";cat  /proc/net/softnet_stat
0039f331 00000000 00002e10 00000000 00000000 00000000 00000000 00000000 00000000
03f2ed19 00000000 00037ca2 00000000 00000000 00000000 00000000 00000000 00000000
00000024 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
00000041 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
00000028 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
0000000b 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
000000c5 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
0000010d 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
00000250 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
00000498 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
00000616 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
0000012c 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
000000d2 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
0000025d 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
0000003c 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
00000127 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
--------------
0039f331 00000000 00002e10 00000000 00000000 00000000 00000000 00000000 00000000
03f66737 00000000 00038015 00000000 00000000 00000000 00000000 00000000 00000000
00000024 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
00000041 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
00000028 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
0000000b 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
000000c5 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
00000110 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
00000250 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
00000499 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
00000616 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
0000012c 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
000000d2 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
00000263 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
0000003c 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
00000129 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000

ethtool -S fiber1  (to show how my traffic is equally distributed to 16 RX queues)

     rx_queue_0_packets: 4867706
     rx_queue_0_bytes: 292062360
     rx_queue_1_packets: 4862472
     rx_queue_1_bytes: 291748320
     rx_queue_2_packets: 4867111
     rx_queue_2_bytes: 292026660
     rx_queue_3_packets: 4859897
     rx_queue_3_bytes: 291593820
     rx_queue_4_packets: 4862267
     rx_queue_4_bytes: 291740814
     rx_queue_5_packets: 4861517
     rx_queue_5_bytes: 291691020
     rx_queue_6_packets: 4862699
     rx_queue_6_bytes: 291761940
     rx_queue_7_packets: 4860523
     rx_queue_7_bytes: 291631380
     rx_queue_8_packets: 4856891
     rx_queue_8_bytes: 291413460
     rx_queue_9_packets: 4868794
     rx_queue_9_bytes: 292127640
     rx_queue_10_packets: 4859099
     rx_queue_10_bytes: 291545940
     rx_queue_11_packets: 4867599
     rx_queue_11_bytes: 292055940
     rx_queue_12_packets: 4861868
     rx_queue_12_bytes: 291713374
     rx_queue_13_packets: 4862655
     rx_queue_13_bytes: 291759300
     rx_queue_14_packets: 4860798
     rx_queue_14_bytes: 291647880
     rx_queue_15_packets: 4860951
     rx_queue_15_bytes: 291657060


perf top -C 1 -E 25
------------------------------------------------------------------------------
   PerfTop:   24419 irqs/sec  kernel:100.0% [100000 cycles],  (all, cpu: 1)
------------------------------------------------------------------------------

             samples    pcnt   kernel function
             _______   _____   _______________

            46234.00 - 24.3% : ixgbe_clean_tx_irq	[ixgbe]
            21134.00 - 11.1% : __slab_free
            17838.00 -  9.4% : _raw_spin_lock
            17086.00 -  9.0% : skb_release_head_state
             9410.00 -  5.0% : ixgbe_clean_rx_irq	[ixgbe]
             8639.00 -  4.5% : kmem_cache_free
             6910.00 -  3.6% : kfree
             5743.00 -  3.0% : __ip_route_output_key
             5321.00 -  2.8% : ip_route_input
             3138.00 -  1.7% : ip_rcv
             2179.00 -  1.1% : kmem_cache_alloc_node
             2002.00 -  1.1% : __kmalloc_node_track_caller
             1907.00 -  1.0% : skb_put
             1807.00 -  1.0% : __xfrm_lookup
             1742.00 -  0.9% : get_partial_node
             1727.00 -  0.9% : csum_partial_copy_generic
             1541.00 -  0.8% : add_partial
             1516.00 -  0.8% : __kfree_skb
             1465.00 -  0.8% : __netdev_alloc_skb
             1420.00 -  0.7% : icmp_send
             1222.00 -  0.6% : dev_gro_receive
             1159.00 -  0.6% : fib_table_lookup
             1155.00 -  0.6% : __phys_addr
             1050.00 -  0.6% : skb_release_data
              982.00 -  0.5% : _raw_spin_unlock



* Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints
  2009-11-23  9:36   ` Peter P Waskiewicz Jr
  2009-11-23 10:21     ` ixgbe question Eric Dumazet
@ 2009-11-23 17:05     ` Peter Zijlstra
  2009-11-23 23:32       ` Waskiewicz Jr, Peter P
  2009-11-24  6:07       ` Arjan van de Ven
  2009-11-24  5:17       ` Yong Zhang
  2 siblings, 2 replies; 67+ messages in thread
From: Peter Zijlstra @ 2009-11-23 17:05 UTC (permalink / raw)
  To: Peter P Waskiewicz Jr; +Cc: Yong Zhang, linux-kernel, arjan, davem, netdev

On Mon, 2009-11-23 at 01:36 -0800, Peter P Waskiewicz Jr wrote:

> This mechanism isn't going to be used by any internal kernel mechanism
> for determining interrupt placement or operation.  It's purely something
> that either a driver can modify, or external script (through /proc),
> that irqbalance will make use of.  If irqbalance isn't running, or the
> current version of irqbalance doesn't support reading node_affinity,
> then it won't affect the system's operation.
> 
> If irqbalance does support it, it'll read whatever the supplied mask is,
> and then will try and balance interrupts within that mask.  It will bail
> if the mask is invalid, or won't apply to the running system, just like
> how putting a bogus mask into smp_affinity is ignored.
> 
> If there's something I'm missing beyond this with the two suggestions
> you've made (I looked into those two parameters and tried to draw
> conclusions), please let me know.

I don't see the point in adding it; if the driver wants to set a node
cpu mask it can already do that using the regular smp affinity settings.

Same for userspace.
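For comparison, the pre-existing alternative referred to here is essentially a one-liner in the driver (a sketch; the wrapper name is illustrative):

#include <linux/interrupt.h>
#include <linux/pci.h>
#include <linux/topology.h>

/* Set the IRQ's affinity itself to the device's local NUMA node,
 * using only the existing irq_set_affinity() API. */
static void example_pin_irq_to_local_node(struct pci_dev *pdev, unsigned int irq)
{
	irq_set_affinity(irq, cpumask_of_node(dev_to_node(&pdev->dev)));
}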



* Re: ixgbe question
  2009-11-23 14:38         ` Eric Dumazet
@ 2009-11-23 18:30           ` robert
  2009-11-23 16:59             ` Eric Dumazet
  0 siblings, 1 reply; 67+ messages in thread
From: robert @ 2009-11-23 18:30 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Jesper Dangaard Brouer, Peter P Waskiewicz Jr, Linux Netdev List


Eric Dumazet writes:

 > Jesper Dangaard Brouer a écrit :
 > 
 > > How is your smp_affinity mask's set?
 > > 
 > > grep . /proc/irq/*/fiber1-*/../smp_affinity
 > 

 Weird... set clone_skb to 1 to be sure and vary dst or something so
 the HW classifier selects different queues, with proper RX affinity.
 
 You should see in /proc/net/softnet_stat something like:

012a7bb9 00000000 000000ae 00000000 00000000 00000000 00000000 00000000 00000000
01288d4c 00000000 00000049 00000000 00000000 00000000 00000000 00000000 00000000
0128fe28 00000000 00000043 00000000 00000000 00000000 00000000 00000000 00000000
01295387 00000000 00000047 00000000 00000000 00000000 00000000 00000000 00000000
0129a722 00000000 0000004a 00000000 00000000 00000000 00000000 00000000 00000000
0128c5e4 00000000 00000046 00000000 00000000 00000000 00000000 00000000 00000000
0128f718 00000000 00000043 00000000 00000000 00000000 00000000 00000000 00000000
012993e3 00000000 0000004a 00000000 00000000 00000000 00000000 00000000 00000000

Or something is...
 
Cheers

					--ro


* Re: ixgbe question
  2009-11-23 16:59             ` Eric Dumazet
@ 2009-11-23 20:54               ` robert
  2009-11-23 21:28                 ` David Miller
  2009-11-23 23:28               ` Waskiewicz Jr, Peter P
  1 sibling, 1 reply; 67+ messages in thread
From: robert @ 2009-11-23 20:54 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: robert, Jesper Dangaard Brouer, Peter P Waskiewicz Jr, Linux Netdev List


Eric Dumazet writes:

 > slone_skb set to 1, this changes nothing but slows down pktgen (obviously)

 > All RX processing of 16 RX queues done by CPU 1 only.


 Well, I just pulled net-next-2.6 and ran with both 82598 and 82599 boards, and
 the packet load gets distributed among the CPU cores.


 Something mysterious or very obvious...
 
 You can even try the script; it's a sort of Internet link traffic emulation,
 though you have to set up your routing.


 Cheers
					--ro

 


#! /bin/sh

#modprobe pktgen

function pgset() {
    local result

    echo $1 > $PGDEV

    result=`cat $PGDEV | fgrep "Result: OK:"`
    if [ "$result" = "" ]; then
         cat $PGDEV | fgrep Result:
    fi
}

function pg() {
    echo inject > $PGDEV
    cat $PGDEV
}

# Config Start Here -----------------------------------------------------------

remove_all()
{
 # thread config
 PGDEV=/proc/net/pktgen/kpktgend_0
 pgset "rem_device_all" 

 PGDEV=/proc/net/pktgen/kpktgend_1
 pgset "rem_device_all" 

 PGDEV=/proc/net/pktgen/kpktgend_2
 pgset "rem_device_all" 

 PGDEV=/proc/net/pktgen/kpktgend_3
 pgset "rem_device_all" 

 PGDEV=/proc/net/pktgen/kpktgend_4
 pgset "rem_device_all" 

 PGDEV=/proc/net/pktgen/kpktgend_5
 pgset "rem_device_all" 

 PGDEV=/proc/net/pktgen/kpktgend_6
 pgset "rem_device_all" 

 PGDEV=/proc/net/pktgen/kpktgend_7
 pgset "rem_device_all" 
}

remove_all

 PGDEV=/proc/net/pktgen/kpktgend_0
 pgset "add_device eth2@0" 

 PGDEV=/proc/net/pktgen/kpktgend_1
 pgset "add_device eth2@1" 

 PGDEV=/proc/net/pktgen/kpktgend_2
 pgset "add_device eth2@2" 

 PGDEV=/proc/net/pktgen/kpktgend_3
 pgset "add_device eth2@3" 


# device config
#
# Sending a mix of pkt sizes of 60, 576 and 1496
#

CLONE_SKB="clone_skb 1"
PKT_SIZE="pkt_size 60"
COUNT="count 000000"
DELAY="delay 0000"
#MAC="00:21:28:08:40:EE"
#MAC="00:21:28:08:40:EF"
#MAC="00:1B:21:17:C1:CD"
MAC="00:14:4F:DA:8C:66"
#MAC="00:14:4F:6B:CD:E8"


PGDEV=/proc/net/pktgen/eth2@0
echo "Configuring $PGDEV"
pgset "$COUNT"
pgset "$CLONE_SKB"
pgset "pkt_size 1496"
pgset "$DELAY"
pgset "flag QUEUE_MAP_CPU"
pgset "flag IPDST_RND" 
pgset "flag FLOW_SEQ" 
pgset "dst_min 11.0.0.0" 
pgset "dst_max 11.255.255.255" 
pgset "flows 2048" 
pgset "flowlen 30" 
pgset  "dst_mac $MAC"

PGDEV=/proc/net/pktgen/eth2@1
echo "Configuring $PGDEV"
pgset "$COUNT"
pgset "$CLONE_SKB"
pgset "pkt_size 576"
pgset "$DELAY"
pgset "flag QUEUE_MAP_CPU"
pgset "flag IPDST_RND" 
pgset "flag FLOW_SEQ" 
pgset "dst_min 11.0.0.0" 
pgset "dst_max 11.255.255.255" 
pgset "flows 2048" 
pgset "flowlen 30" 
pgset  "dst_mac $MAC"

PGDEV=/proc/net/pktgen/eth2@2
echo "Configuring $PGDEV"
pgset "$COUNT"
pgset "$CLONE_SKB"
pgset "$DELAY"
pgset "pkt_size 60"
pgset "flag QUEUE_MAP_CPU"
pgset "flag IPDST_RND" 
pgset "flag FLOW_SEQ" 
pgset "dst_min 11.0.0.0" 
pgset "dst_max 11.255.255.255" 
pgset "flows 2048" 
pgset "flowlen 30" 
pgset  "dst_mac $MAC"

PGDEV=/proc/net/pktgen/eth2@3
echo "Configuring $PGDEV"
pgset "$COUNT"
pgset "$CLONE_SKB"
pgset "pkt_size 1496"
pgset "$DELAY"
pgset "flag QUEUE_MAP_CPU"
pgset "flag IPDST_RND" 
pgset "flag FLOW_SEQ" 
pgset "dst_min 11.0.0.0" 
pgset "dst_max 11.255.255.255" 
pgset "flows 2048" 
pgset "flowlen 30" 
pgset  "dst_mac $MAC"

# Time to run
PGDEV=/proc/net/pktgen/pgctrl

echo "Running... ctrl^C to stop"
pgset "start" 
echo "Done"

grep pps /proc/net/pktgen/*

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: ixgbe question
  2009-11-23 10:37         ` Eric Dumazet
  2009-11-23 14:05           ` Eric Dumazet
@ 2009-11-23 21:26           ` David Miller
  1 sibling, 0 replies; 67+ messages in thread
From: David Miller @ 2009-11-23 21:26 UTC (permalink / raw)
  To: eric.dumazet; +Cc: peter.p.waskiewicz.jr, netdev

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 23 Nov 2009 11:37:20 +0100

> (Fedora Core 12 installer was not able to recognize disks on this machine :( )

I ran into this problem too on my laptop, but only with the Live-CD images.

The DVD image recognized the disks and installed just fine.

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: ixgbe question
  2009-11-23 20:54               ` robert
@ 2009-11-23 21:28                 ` David Miller
  2009-11-23 22:14                   ` Robert Olsson
  0 siblings, 1 reply; 67+ messages in thread
From: David Miller @ 2009-11-23 21:28 UTC (permalink / raw)
  To: robert; +Cc: eric.dumazet, hawk, peter.p.waskiewicz.jr, netdev

From: robert@herjulf.net
Date: Mon, 23 Nov 2009 21:54:43 +0100

>  Something mysterious or very obvious...

It seems very obvious to me that, for whatever reason, the MSI-X vectors
are only being sent to cpu 1 on Eric's system.

I also suspect, as a result, that it has nothing to do with the IXGBE
driver but rather is some IRQ controller programming or some bug or
limitation in the IRQ affinity mask handling in the kernel.

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: ixgbe question
  2009-11-23 21:28                 ` David Miller
@ 2009-11-23 22:14                   ` Robert Olsson
  0 siblings, 0 replies; 67+ messages in thread
From: Robert Olsson @ 2009-11-23 22:14 UTC (permalink / raw)
  To: David Miller; +Cc: robert, eric.dumazet, hawk, peter.p.waskiewicz.jr, netdev


David Miller writes:
 > It seem very obvious to me that, for whatever reason, the MSI-X vectors
 > are only being sent to cpu 1 on Eric's system.
 > 
 > I also suspect, as a result, that it has nothing to do with the IXGBE
 > driver but rather is some IRQ controller programming or some bug or
 > limitation in the IRQ affinity mask handling in the kernel.

 Probably so, yes. I guess Eric will dig into this.
 
 Cheers

					--ro
 
 

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: ixgbe question
  2009-11-23 16:59             ` Eric Dumazet
  2009-11-23 20:54               ` robert
@ 2009-11-23 23:28               ` Waskiewicz Jr, Peter P
  2009-11-23 23:44                 ` David Miller
  2009-11-24  7:46                 ` Eric Dumazet
  1 sibling, 2 replies; 67+ messages in thread
From: Waskiewicz Jr, Peter P @ 2009-11-23 23:28 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: robert, Jesper Dangaard Brouer, Waskiewicz Jr, Peter P,
	Linux Netdev List

[-- Attachment #1: Type: TEXT/PLAIN, Size: 2301 bytes --]

On Mon, 23 Nov 2009, Eric Dumazet wrote:

> robert@herjulf.net a écrit :
> > Eric Dumazet writes:
> > 
> >  > Jesper Dangaard Brouer a écrit :
> >  > 
> >  > > How is your smp_affinity mask's set?
> >  > > 
> >  > > grep . /proc/irq/*/fiber1-*/../smp_affinity
> >  > 
> > 
> >  Weird... set clone_skb to 1 to be sure and vary dst or something so 
> >  the HW classifier selects different queues and with proper RX affinty. 
> >  
> >  You should see in /proc/net/softnet_stat something like:
> > 
> > 012a7bb9 00000000 000000ae 00000000 00000000 00000000 00000000 00000000 00000000
> > 01288d4c 00000000 00000049 00000000 00000000 00000000 00000000 00000000 00000000
> > 0128fe28 00000000 00000043 00000000 00000000 00000000 00000000 00000000 00000000
> > 01295387 00000000 00000047 00000000 00000000 00000000 00000000 00000000 00000000
> > 0129a722 00000000 0000004a 00000000 00000000 00000000 00000000 00000000 00000000
> > 0128c5e4 00000000 00000046 00000000 00000000 00000000 00000000 00000000 00000000
> > 0128f718 00000000 00000043 00000000 00000000 00000000 00000000 00000000 00000000
> > 012993e3 00000000 0000004a 00000000 00000000 00000000 00000000 00000000 00000000
> > 
> 
> slone_skb set to 1, this changes nothing but slows down pktgen (obviously)
> 
> Result: OK: 117614452(c117608705+d5746) nsec, 100000000 (60byte,0frags)
>   850235pps 408Mb/sec (408112800bps) errors: 0
> 
> All RX processing of 16 RX queues done by CPU 1 only.

Ok, I was confused earlier.  I thought you were saying that all packets 
were headed into a single Rx queue.  This is different.

Do you know what version of irqbalance you're running, or if it's running 
at all?  We've seen issues with irqbalance where it won't recognize the 
ethernet device if the driver has been reloaded.  In that case, it won't 
balance the interrupts at all.  If the default affinity was set to one 
CPU, then well, you're screwed.

My suggestion in this case is after you reload ixgbe and start your tests, 
see if it all goes to one CPU.  If it does, then restart irqbalance 
(service irqbalance restart - or just kill it and restart by hand).  Then 
start running your test, and in 10 seconds you should see the interrupts 
move and spread out.
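
As a rough outline (shell sketch; the fiber1-TxRx-* names are the ones used
elsewhere in this thread):

service irqbalance restart         # or kill it and start irqbalance by hand
# start the test, then after ~10 seconds check that the per-queue interrupt
# counts are growing on several CPUs rather than only one:
grep fiber1-TxRx /proc/interrupts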

Let me know if this helps,
-PJ

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints
  2009-11-23 17:05     ` [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints Peter Zijlstra
@ 2009-11-23 23:32       ` Waskiewicz Jr, Peter P
  2009-11-24  8:38         ` Peter Zijlstra
  2009-11-24  6:07       ` Arjan van de Ven
  1 sibling, 1 reply; 67+ messages in thread
From: Waskiewicz Jr, Peter P @ 2009-11-23 23:32 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Waskiewicz Jr, Peter P, Yong Zhang, linux-kernel, arjan, davem, netdev

On Mon, 23 Nov 2009, Peter Zijlstra wrote:

> On Mon, 2009-11-23 at 01:36 -0800, Peter P Waskiewicz Jr wrote:
> 
> > This mechanism isn't going to be used by any internal kernel mechanism
> > for determining interrupt placement or operation.  It's purely something
> > that either a driver can modify, or external script (through /proc),
> > that irqbalance will make use of.  If irqbalance isn't running, or the
> > current version of irqbalance doesn't support reading node_affinity,
> > then it won't affect the system's operation.
> > 
> > If irqbalance does support it, it'll read whatever the supplied mask is,
> > and then will try and balance interrupts within that mask.  It will bail
> > if the mask is invalid, or won't apply to the running system, just like
> > how putting a bogus mask into smp_affinity is ignored.
> > 
> > If there's something I'm missing beyond this with the two suggestions
> > you've made (I looked into those two parameters and tried to draw
> > conclusions), please let me know.
> 
> I don't see the point in adding it, if the driver wants to set a node
> cpu mask it can already do that using the regular smp affinity settings.

Unfortunately, a driver can't.  The irq_set_affinity() function isn't 
exported.  I proposed a patch on netdev to export it, and then to tie down 
an interrupt using IRQF_NOBALANCING, so irqbalance won't touch it.  That 
was rejected, since the driver is enforcing policy of the interrupt 
balancing, not irqbalance.

I and Jesse Brandeburg had a meeting with Arjan about this.  What we came 
up with was this interface, so drivers can set what they'd like to see, if 
irqbalance decides to honor it.  That way interrupt affinity policies are 
set only by irqbalance, but this interface gives us a mechanism to hint to 
irqbalance what we'd like it to do.
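
To make that concrete, the driver-side hint would be a sketch along these lines
(irq_set_node_affinity() is the helper added by this patch, cpumask_of_node()
is existing kernel API, and the adapter fields are made up for illustration):

	/* hint irqbalance that these vectors belong to the NIC's home node */
	for (i = 0; i < adapter->num_msix_vectors; i++)
		irq_set_node_affinity(adapter->msix_entries[i].vector,
				      cpumask_of_node(adapter->node));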

Also, if you use the /proc interface to change smp_affinity on an 
interrupt without any of these changes, irqbalance will override it on its 
next poll interval.  This also is not desirable.

Cheers,
-PJ

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: ixgbe question
  2009-11-23 23:28               ` Waskiewicz Jr, Peter P
@ 2009-11-23 23:44                 ` David Miller
  2009-11-24  7:46                 ` Eric Dumazet
  1 sibling, 0 replies; 67+ messages in thread
From: David Miller @ 2009-11-23 23:44 UTC (permalink / raw)
  To: peter.p.waskiewicz.jr; +Cc: eric.dumazet, robert, hawk, netdev

From: "Waskiewicz Jr, Peter P" <peter.p.waskiewicz.jr@intel.com>
Date: Mon, 23 Nov 2009 15:28:18 -0800 (Pacific Standard Time)

> Do you know what version of irqbalance you're running, or if it's running 
> at all?

Eric said he tried both with and without irqbalanced.

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance  hints
  2009-11-23  9:36   ` Peter P Waskiewicz Jr
@ 2009-11-24  5:17       ` Yong Zhang
  2009-11-23 17:05     ` [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints Peter Zijlstra
  2009-11-24  5:17       ` Yong Zhang
  2 siblings, 0 replies; 67+ messages in thread
From: Yong Zhang @ 2009-11-24  5:17 UTC (permalink / raw)
  To: Peter P Waskiewicz Jr; +Cc: linux-kernel, arjan, davem, netdev

[snip]
>>
>> 1) I think you should consider CONFIG_CPUMASK_OFFSTACK which will affect
>>    node_affinity.
>> 2) It seems like this patch can't work with SPARSE_IRQ.
>
> This mechanism isn't going to be used by any internal kernel mechanism
> for determining interrupt placement or operation.  It's purely something
> that either a driver can modify, or external script (through /proc),
> that irqbalance will make use of.  If irqbalance isn't running, or the
> current version of irqbalance doesn't support reading node_affinity,
> then it won't affect the system's operation.
>
> If irqbalance does support it, it'll read whatever the supplied mask is,
> and then will try and balance interrupts within that mask.  It will bail
> if the mask is invalid, or won't apply to the running system, just like
> how putting a bogus mask into smp_affinity is ignored.
>
> If there's something I'm missing beyond this with the two suggestions
> you've made (I looked into those two parameters and tried to draw
> conclusions), please let me know.

My two suggestions are both about the node_affinity mask you are adding. Before
you can use this element, you must initialise it first. You can refer to how
irq_desc::affinity is used in alloc_desc_masks().
include/linux/irq.h:
static inline bool alloc_desc_masks(struct irq_desc *desc, int node,
							bool boot)
{
	gfp_t gfp = GFP_ATOMIC;

	if (boot)
		gfp = GFP_NOWAIT;

#ifdef CONFIG_CPUMASK_OFFSTACK
	if (!alloc_cpumask_var_node(&desc->affinity, gfp, node))
		return false;

#ifdef CONFIG_GENERIC_PENDING_IRQ
	if (!alloc_cpumask_var_node(&desc->pending_mask, gfp, node)) {
		free_cpumask_var(desc->affinity);
		return false;
	}
#endif
#endif
	return true;
}
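
So, roughly, the new mask needs the same treatment there, something like
(sketch only, error unwinding abbreviated):

#ifdef CONFIG_CPUMASK_OFFSTACK
	if (!alloc_cpumask_var_node(&desc->node_affinity, gfp, node)) {
		free_cpumask_var(desc->affinity);
		return false;
	}
#endif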

Thanks,
Yong

>
> Cheers,
> -PJ Waskiewicz
>
>

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints
  2009-11-23 17:05     ` [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints Peter Zijlstra
  2009-11-23 23:32       ` Waskiewicz Jr, Peter P
@ 2009-11-24  6:07       ` Arjan van de Ven
  2009-11-24  8:39         ` Peter Zijlstra
  1 sibling, 1 reply; 67+ messages in thread
From: Arjan van de Ven @ 2009-11-24  6:07 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Peter P Waskiewicz Jr, Yong Zhang, linux-kernel, arjan, davem, netdev

Peter Zijlstra wrote:
> On Mon, 2009-11-23 at 01:36 -0800, Peter P Waskiewicz Jr wrote:
> 
>> This mechanism isn't going to be used by any internal kernel mechanism
>> for determining interrupt placement or operation.  It's purely something
>> that either a driver can modify, or external script (through /proc),
>> that irqbalance will make use of.  If irqbalance isn't running, or the
>> current version of irqbalance doesn't support reading node_affinity,
>> then it won't affect the system's operation.
>>
>> If irqbalance does support it, it'll read whatever the supplied mask is,
>> and then will try and balance interrupts within that mask.  It will bail
>> if the mask is invalid, or won't apply to the running system, just like
>> how putting a bogus mask into smp_affinity is ignored.
>>
>> If there's something I'm missing beyond this with the two suggestions
>> you've made (I looked into those two parameters and tried to draw
>> conclusions), please let me know.
> 
> I don't see the point in adding it, if the driver wants to set a node
> cpu mask it can already do that using the regular smp affinity settings.
> 
> Same for userspace.

the problem is that there is no way currently that the driver can communicate
"I allocated all my metadata on THIS numa node". irqbalance and sysadmins need
that to not make really stupid decisions.....
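
The allocation side is already easy today, e.g. as a sketch (ring and pdev are
just placeholder names):

	int node = dev_to_node(&pdev->dev);

	ring = kzalloc_node(sizeof(*ring), GFP_KERNEL, node);

What is missing is a standard way to tell irqbalance about that node afterwards.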

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: ixgbe question
  2009-11-23 23:28               ` Waskiewicz Jr, Peter P
  2009-11-23 23:44                 ` David Miller
@ 2009-11-24  7:46                 ` Eric Dumazet
  2009-11-24  8:46                   ` Badalian Vyacheslav
  2009-11-24  9:07                   ` Peter P Waskiewicz Jr
  1 sibling, 2 replies; 67+ messages in thread
From: Eric Dumazet @ 2009-11-24  7:46 UTC (permalink / raw)
  To: Waskiewicz Jr, Peter P; +Cc: robert, Jesper Dangaard Brouer, Linux Netdev List

Waskiewicz Jr, Peter P wrote:
> Ok, I was confused earlier.  I thought you were saying that all packets 
> were headed into a single Rx queue.  This is different.
> 
> Do you know what version of irqbalance you're running, or if it's running 
> at all?  We've seen issues with irqbalance where it won't recognize the 
> ethernet device if the driver has been reloaded.  In that case, it won't 
> balance the interrupts at all.  If the default affinity was set to one 
> CPU, then well, you're screwed.
> 
> My suggestion in this case is after you reload ixgbe and start your tests, 
> see if it all goes to one CPU.  If it does, then restart irqbalance 
> (service irqbalance restart - or just kill it and restart by hand).  Then 
> start running your test, and in 10 seconds you should see the interrupts 
> move and spread out.
> 
> Let me know if this helps,

Sure it helps !

I tried without irqbalance and with irqbalance (Ubuntu 9.10 ships irqbalance 0.55-4)
I can see irqbalance setting smp_affinities to 5555 or AAAA with no direct effect.

I do receive 16 different irqs, but all serviced on one cpu.

The only way to have the irqs serviced on different cpus is to manually force the irq
affinities to be exclusive (one bit set in the mask, not several), and that is not
optimal for moderate loads.

echo 1 >`echo /proc/irq/*/fiber1-TxRx-0/../smp_affinity`
echo 1 >`echo /proc/irq/*/fiber1-TxRx-1/../smp_affinity`
echo 4 >`echo /proc/irq/*/fiber1-TxRx-2/../smp_affinity`
echo 4 >`echo /proc/irq/*/fiber1-TxRx-3/../smp_affinity`
echo 10 >`echo /proc/irq/*/fiber1-TxRx-4/../smp_affinity`
echo 10 >`echo /proc/irq/*/fiber1-TxRx-5/../smp_affinity`
echo 40 >`echo /proc/irq/*/fiber1-TxRx-6/../smp_affinity`
echo 40 >`echo /proc/irq/*/fiber1-TxRx-7/../smp_affinity`
echo 100 >`echo /proc/irq/*/fiber1-TxRx-8/../smp_affinity`
echo 100 >`echo /proc/irq/*/fiber1-TxRx-9/../smp_affinity`
echo 400 >`echo /proc/irq/*/fiber1-TxRx-10/../smp_affinity`
echo 400 >`echo /proc/irq/*/fiber1-TxRx-11/../smp_affinity`
echo 1000 >`echo /proc/irq/*/fiber1-TxRx-12/../smp_affinity`
echo 1000 >`echo /proc/irq/*/fiber1-TxRx-13/../smp_affinity`
echo 4000 >`echo /proc/irq/*/fiber1-TxRx-14/../smp_affinity`
echo 4000 >`echo /proc/irq/*/fiber1-TxRx-15/../smp_affinity`
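
For reference, the same exclusive pinning written as a loop (bash sketch, same
fiber1-TxRx-N names, queue pair N,N+1 pinned to CPU N):

for q in `seq 0 15`; do
	cpu=$(( (q / 2) * 2 ))
	mask=`printf '%x' $((1 << cpu))`
	echo $mask > `echo /proc/irq/*/fiber1-TxRx-$q/../smp_affinity`
done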


One other problem is that after a reload of the ixgbe driver, the link is at 1 Gbps
95% of the time, and I could not find an easy way to force it to 10 Gbps.

I run the following script many times and stop it when the 10 Gbps speed is reached.

ethtool -A fiber0 rx off tx off
ip link set fiber0 down
ip link set fiber1 down
sleep 2
ethtool fiber0
ethtool -s fiber0 speed 10000
ethtool -s fiber1 speed 10000
ethtool -r fiber0 &
ethtool -r fiber1 &
ethtool fiber0
ip link set fiber1 up &
ip link set fiber0 up &
ethtool fiber0

[   33.625689] ixgbe: Intel(R) 10 Gigabit PCI Express Network Driver - version 2.0.44-k2
[   33.625692] ixgbe: Copyright (c) 1999-2009 Intel Corporation.
[   33.625741] ixgbe 0000:07:00.0: PCI INT A -> GSI 32 (level, low) -> IRQ 32
[   33.625760] ixgbe 0000:07:00.0: setting latency timer to 64
[   33.735579] ixgbe 0000:07:00.0: irq 100 for MSI/MSI-X
[   33.735583] ixgbe 0000:07:00.0: irq 101 for MSI/MSI-X
[   33.735585] ixgbe 0000:07:00.0: irq 102 for MSI/MSI-X
[   33.735587] ixgbe 0000:07:00.0: irq 103 for MSI/MSI-X
[   33.735589] ixgbe 0000:07:00.0: irq 104 for MSI/MSI-X
[   33.735591] ixgbe 0000:07:00.0: irq 105 for MSI/MSI-X
[   33.735593] ixgbe 0000:07:00.0: irq 106 for MSI/MSI-X
[   33.735595] ixgbe 0000:07:00.0: irq 107 for MSI/MSI-X
[   33.735597] ixgbe 0000:07:00.0: irq 108 for MSI/MSI-X
[   33.735599] ixgbe 0000:07:00.0: irq 109 for MSI/MSI-X
[   33.735602] ixgbe 0000:07:00.0: irq 110 for MSI/MSI-X
[   33.735604] ixgbe 0000:07:00.0: irq 111 for MSI/MSI-X
[   33.735606] ixgbe 0000:07:00.0: irq 112 for MSI/MSI-X
[   33.735608] ixgbe 0000:07:00.0: irq 113 for MSI/MSI-X
[   33.735610] ixgbe 0000:07:00.0: irq 114 for MSI/MSI-X
[   33.735612] ixgbe 0000:07:00.0: irq 115 for MSI/MSI-X
[   33.735614] ixgbe 0000:07:00.0: irq 116 for MSI/MSI-X
[   33.735633] ixgbe: 0000:07:00.0: ixgbe_init_interrupt_scheme: Multiqueue Enabled: Rx Queue count = 16, Tx Queue count = 16
[   33.735638] ixgbe 0000:07:00.0: (PCI Express:5.0Gb/s:Width x8) 00:1b:21:4a:fe:54
[   33.735722] ixgbe 0000:07:00.0: MAC: 2, PHY: 11, SFP+: 5, PBA No: e66562-003
[   33.738111] ixgbe 0000:07:00.0: Intel(R) 10 Gigabit Network Connection
[   33.738135] ixgbe 0000:07:00.1: PCI INT B -> GSI 42 (level, low) -> IRQ 42
[   33.738151] ixgbe 0000:07:00.1: setting latency timer to 64
[   33.853526] ixgbe 0000:07:00.1: irq 117 for MSI/MSI-X
[   33.853529] ixgbe 0000:07:00.1: irq 118 for MSI/MSI-X
[   33.853532] ixgbe 0000:07:00.1: irq 119 for MSI/MSI-X
[   33.853534] ixgbe 0000:07:00.1: irq 120 for MSI/MSI-X
[   33.853536] ixgbe 0000:07:00.1: irq 121 for MSI/MSI-X
[   33.853538] ixgbe 0000:07:00.1: irq 122 for MSI/MSI-X
[   33.853540] ixgbe 0000:07:00.1: irq 123 for MSI/MSI-X
[   33.853542] ixgbe 0000:07:00.1: irq 124 for MSI/MSI-X
[   33.853544] ixgbe 0000:07:00.1: irq 125 for MSI/MSI-X
[   33.853546] ixgbe 0000:07:00.1: irq 126 for MSI/MSI-X
[   33.853548] ixgbe 0000:07:00.1: irq 127 for MSI/MSI-X
[   33.853550] ixgbe 0000:07:00.1: irq 128 for MSI/MSI-X
[   33.853552] ixgbe 0000:07:00.1: irq 129 for MSI/MSI-X
[   33.853554] ixgbe 0000:07:00.1: irq 130 for MSI/MSI-X
[   33.853556] ixgbe 0000:07:00.1: irq 131 for MSI/MSI-X
[   33.853558] ixgbe 0000:07:00.1: irq 132 for MSI/MSI-X
[   33.853560] ixgbe 0000:07:00.1: irq 133 for MSI/MSI-X
[   33.853580] ixgbe: 0000:07:00.1: ixgbe_init_interrupt_scheme: Multiqueue Enabled: Rx Queue count = 16, Tx Queue count = 16
[   33.853585] ixgbe 0000:07:00.1: (PCI Express:5.0Gb/s:Width x8) 00:1b:21:4a:fe:55
[   33.853669] ixgbe 0000:07:00.1: MAC: 2, PHY: 11, SFP+: 5, PBA No: e66562-003
[   33.855956] ixgbe 0000:07:00.1: Intel(R) 10 Gigabit Network Connection

[   85.208233] ixgbe: fiber1 NIC Link is Up 1 Gbps, Flow Control: RX/TX
[   85.237453] ixgbe: fiber0 NIC Link is Up 1 Gbps, Flow Control: RX/TX
[   96.080713] ixgbe: fiber1 NIC Link is Down
[  102.094610] ixgbe: fiber0 NIC Link is Up 1 Gbps, Flow Control: None
[  102.119572] ixgbe: fiber1 NIC Link is Up 1 Gbps, Flow Control: None
[  142.524691] ixgbe: fiber1 NIC Link is Down
[  148.421332] ixgbe: fiber1 NIC Link is Up 1 Gbps, Flow Control: None
[  148.449465] ixgbe: fiber0 NIC Link is Up 1 Gbps, Flow Control: None
[  160.728643] ixgbe: fiber1 NIC Link is Down
[  172.832301] ixgbe: fiber0 NIC Link is Up 1 Gbps, Flow Control: None
[  173.659038] ixgbe: fiber1 NIC Link is Up 1 Gbps, Flow Control: None
[  184.554501] ixgbe: fiber0 NIC Link is Down
[  185.376273] ixgbe: fiber1 NIC Link is Up 1 Gbps, Flow Control: None
[  186.493598] ixgbe: fiber0 NIC Link is Up 1 Gbps, Flow Control: None
[  190.564383] ixgbe: fiber0 NIC Link is Down
[  191.391149] ixgbe: fiber1 NIC Link is Up 1 Gbps, Flow Control: None
[  192.484492] ixgbe: fiber0 NIC Link is Up 1 Gbps, Flow Control: None
[  192.545424] ixgbe: fiber1 NIC Link is Down
[  205.858197] ixgbe: fiber0 NIC Link is Up 1 Gbps, Flow Control: None
[  206.684940] ixgbe: fiber1 NIC Link is Up 1 Gbps, Flow Control: None
[  211.991875] ixgbe: fiber1 NIC Link is Down
[  220.833478] ixgbe: fiber1 NIC Link is Up 1 Gbps, Flow Control: None
[  220.833630] ixgbe: fiber0 NIC Link is Up 1 Gbps, Flow Control: None
[  229.804853] ixgbe: fiber1 NIC Link is Down
[  248.395672] ixgbe: fiber0 NIC Link is Up 1 Gbps, Flow Control: None
[  249.222408] ixgbe: fiber1 NIC Link is Up 1 Gbps, Flow Control: None
[  484.631598] ixgbe: fiber1 NIC Link is Down
[  490.138931] ixgbe: fiber1 NIC Link is Up 10 Gbps, Flow Control: None
[  490.167880] ixgbe: fiber0 NIC Link is Up 10 Gbps, Flow Control: None

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints
  2009-11-23 23:32       ` Waskiewicz Jr, Peter P
@ 2009-11-24  8:38         ` Peter Zijlstra
  2009-11-24  8:59           ` Peter P Waskiewicz Jr
  0 siblings, 1 reply; 67+ messages in thread
From: Peter Zijlstra @ 2009-11-24  8:38 UTC (permalink / raw)
  To: Waskiewicz Jr, Peter P
  Cc: Yong Zhang, linux-kernel, arjan, davem, netdev, Thomas Gleixner

On Mon, 2009-11-23 at 15:32 -0800, Waskiewicz Jr, Peter P wrote:

> Unfortunately, a driver can't.  The irq_set_affinity() function isn't 
> exported.  I proposed a patch on netdev to export it, and then to tie down 
> an interrupt using IRQF_NOBALANCING, so irqbalance won't touch it.  That 
> was rejected, since the driver is enforcing policy of the interrupt 
> balancing, not irqbalance.

Why would a patch touching the irq subsystem go to netdev?

What is wrong with exporting irq_set_affinity(), and wtf do you need
IRQF_NOBALANCING for?

> I and Jesse Brandeburg had a meeting with Arjan about this.  What we came 
> up with was this interface, so drivers can set what they'd like to see, if 
> irqbalance decides to honor it.  That way interrupt affinity policies are 
> set only by irqbalance, but this interface gives us a mechanism to hint to 
> irqbalance what we'd like it to do.

If all you want is to expose policy to userspace then you don't need any
of this, simply expose the NIC's home node through a sysfs device thingy
(I was under the impression it's already there somewhere, but I can't
ever find anything in /sys).

No need whatsoever to poke at the IRQ subsystem.
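
For what it's worth, the home node of a PCI NIC is usually already visible in
sysfs, e.g. (interface name taken from elsewhere in this thread):

cat /sys/class/net/fiber0/device/numa_node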

> Also, if you use the /proc interface to change smp_affinity on an 
> interrupt without any of these changes, irqbalance will override it on its 
> next poll interval.  This also is not desirable.

This all sounds backwards.. we've got a perfectly functional interface
for affinity -- which people object to being used for some reason. So
you add another interface on top, and that is ok?

All the while not CC'ing the IRQ folks,.. brilliant approach.


^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints
  2009-11-24  5:17       ` Yong Zhang
  (?)
@ 2009-11-24  8:39       ` Peter P Waskiewicz Jr
  -1 siblings, 0 replies; 67+ messages in thread
From: Peter P Waskiewicz Jr @ 2009-11-24  8:39 UTC (permalink / raw)
  To: Yong Zhang; +Cc: linux-kernel, arjan, davem, netdev

On Mon, 2009-11-23 at 22:17 -0700, Yong Zhang wrote:
> [snip]
> >>
> >> 1) I think you should consider CONFIG_CPUMASK_OFFSTACK which will affect
> >>    node_affinity.
> >> 2) It seems like this patch can't work with SPARSE_IRQ.
> >
> > This mechanism isn't going to be used by any internal kernel mechanism
> > for determining interrupt placement or operation.  It's purely something
> > that either a driver can modify, or external script (through /proc),
> > that irqbalance will make use of.  If irqbalance isn't running, or the
> > current version of irqbalance doesn't support reading node_affinity,
> > then it won't affect the system's operation.
> >
> > If irqbalance does support it, it'll read whatever the supplied mask is,
> > and then will try and balance interrupts within that mask.  It will bail
> > if the mask is invalid, or won't apply to the running system, just like
> > how putting a bogus mask into smp_affinity is ignored.
> >
> > If there's something I'm missing beyond this with the two suggestions
> > you've made (I looked into those two parameters and tried to draw
> > conclusions), please let me know.
> 
> My two suggestions are both about your adding node_affinity. Before you can
> use this element, you must initialise it firstly. You can refer how
> irq_desc::affinity
> is used in function alloc_desc_masks().
> include/linux/irq.h:
> static inline bool alloc_desc_masks(struct irq_desc *desc, int node,
> 							bool boot)
> {
> 	gfp_t gfp = GFP_ATOMIC;
> 
> 	if (boot)
> 		gfp = GFP_NOWAIT;
> 
> #ifdef CONFIG_CPUMASK_OFFSTACK
> 	if (!alloc_cpumask_var_node(&desc->affinity, gfp, node))
> 		return false;
> 
> #ifdef CONFIG_GENERIC_PENDING_IRQ
> 	if (!alloc_cpumask_var_node(&desc->pending_mask, gfp, node)) {
> 		free_cpumask_var(desc->affinity);
> 		return false;
> 	}
> #endif
> #endif
> 	return true;
> }
> 

Ah, ok.  I see what you were referring to now.  Let me respin the patch
and send a second version.

Thanks Yong,

-PJ Waskiewicz


^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints
  2009-11-24  6:07       ` Arjan van de Ven
@ 2009-11-24  8:39         ` Peter Zijlstra
  2009-11-24 14:42           ` Arjan van de Ven
  2009-11-24 17:39           ` David Miller
  0 siblings, 2 replies; 67+ messages in thread
From: Peter Zijlstra @ 2009-11-24  8:39 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Peter P Waskiewicz Jr, Yong Zhang, linux-kernel, arjan, davem, netdev

On Mon, 2009-11-23 at 22:07 -0800, Arjan van de Ven wrote:
> Peter Zijlstra wrote:
> > On Mon, 2009-11-23 at 01:36 -0800, Peter P Waskiewicz Jr wrote:
> > 
> >> This mechanism isn't going to be used by any internal kernel mechanism
> >> for determining interrupt placement or operation.  It's purely something
> >> that either a driver can modify, or external script (through /proc),
> >> that irqbalance will make use of.  If irqbalance isn't running, or the
> >> current version of irqbalance doesn't support reading node_affinity,
> >> then it won't affect the system's operation.
> >>
> >> If irqbalance does support it, it'll read whatever the supplied mask is,
> >> and then will try and balance interrupts within that mask.  It will bail
> >> if the mask is invalid, or won't apply to the running system, just like
> >> how putting a bogus mask into smp_affinity is ignored.
> >>
> >> If there's something I'm missing beyond this with the two suggestions
> >> you've made (I looked into those two parameters and tried to draw
> >> conclusions), please let me know.
> > 
> > I don't see the point in adding it, if the driver wants to set a node
> > cpu mask it can already do that using the regular smp affinity settings.
> > 
> > Same for userspace.
> 
> the problem is that there is no way currently that the driver can communicate
> "I allocated all my metadata on THIS numa node". irqbalance and sysadmins need
> that to not make really stupid decisions.....

And what exactly is struct device::numa_node good for then?


^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: ixgbe question
  2009-11-24  7:46                 ` Eric Dumazet
@ 2009-11-24  8:46                   ` Badalian Vyacheslav
  2009-11-24  9:07                   ` Peter P Waskiewicz Jr
  1 sibling, 0 replies; 67+ messages in thread
From: Badalian Vyacheslav @ 2009-11-24  8:46 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Waskiewicz Jr, Peter P, Linux Netdev List

Eric Dumazet writes:
> Waskiewicz Jr, Peter P a écrit :
>> Ok, I was confused earlier.  I thought you were saying that all packets 
>> were headed into a single Rx queue.  This is different.
>>
>> Do you know what version of irqbalance you're running, or if it's running 
>> at all?  We've seen issues with irqbalance where it won't recognize the 
>> ethernet device if the driver has been reloaded.  In that case, it won't 
>> balance the interrupts at all.  If the default affinity was set to one 
>> CPU, then well, you're screwed.
>>
>> My suggestion in this case is after you reload ixgbe and start your tests, 
>> see if it all goes to one CPU.  If it does, then restart irqbalance 
>> (service irqbalance restart - or just kill it and restart by hand).  Then 
>> start running your test, and in 10 seconds you should see the interrupts 
>> move and spread out.
>>
>> Let me know if this helps,
> 
> Sure it helps !
> 
> I tried without irqbalance and with irqbalance (Ubuntu 9.10 ships irqbalance 0.55-4)
> I can see irqbalance setting smp_affinities to 5555 or AAAA with no direct effect.
> 
> I do receive 16 different irqs, but all serviced on one cpu.
> 
> Only way to have irqs on different cpus is to manualy force irq affinities to be exclusive
> (one bit set in the mask, not several ones), and that is not optimal for moderate loads.
> 
> echo 1 >`echo /proc/irq/*/fiber1-TxRx-0/../smp_affinity`
> echo 1 >`echo /proc/irq/*/fiber1-TxRx-1/../smp_affinity`
> echo 4 >`echo /proc/irq/*/fiber1-TxRx-2/../smp_affinity`
> echo 4 >`echo /proc/irq/*/fiber1-TxRx-3/../smp_affinity`
> echo 10 >`echo /proc/irq/*/fiber1-TxRx-4/../smp_affinity`
> echo 10 >`echo /proc/irq/*/fiber1-TxRx-5/../smp_affinity`
> echo 40 >`echo /proc/irq/*/fiber1-TxRx-6/../smp_affinity`
> echo 40 >`echo /proc/irq/*/fiber1-TxRx-7/../smp_affinity`
> echo 100 >`echo /proc/irq/*/fiber1-TxRx-8/../smp_affinity`
> echo 100 >`echo /proc/irq/*/fiber1-TxRx-9/../smp_affinity`
> echo 400 >`echo /proc/irq/*/fiber1-TxRx-10/../smp_affinity`
> echo 400 >`echo /proc/irq/*/fiber1-TxRx-11/../smp_affinity`
> echo 1000 >`echo /proc/irq/*/fiber1-TxRx-12/../smp_affinity`
> echo 1000 >`echo /proc/irq/*/fiber1-TxRx-13/../smp_affinity`
> echo 4000 >`echo /proc/irq/*/fiber1-TxRx-14/../smp_affinity`
> echo 4000 >`echo /proc/irq/*/fiber1-TxRx-15/../smp_affinity`
> 
> 
> One other problem is that after reload of ixgbe driver, link is 95% of the time
> at 1 Gbps speed, and I could not find an easy way to force it being 10 Gbps
> 
> I run following script many times and stop it when 10 Gbps speed if reached.
> 
> ethtool -A fiber0 rx off tx off
> ip link set fiber0 down
> ip link set fiber1 down
> sleep 2
> ethtool fiber0
> ethtool -s fiber0 speed 10000
> ethtool -s fiber1 speed 10000
> ethtool -r fiber0 &
> ethtool -r fiber1 &
> ethtool fiber0
> ip link set fiber1 up &
> ip link set fiber0 up &
> ethtool fiber0
> 
> [   33.625689] ixgbe: Intel(R) 10 Gigabit PCI Express Network Driver - version 2.0.44-k2
> [   33.625692] ixgbe: Copyright (c) 1999-2009 Intel Corporation.
> [   33.625741] ixgbe 0000:07:00.0: PCI INT A -> GSI 32 (level, low) -> IRQ 32
> [   33.625760] ixgbe 0000:07:00.0: setting latency timer to 64
> [   33.735579] ixgbe 0000:07:00.0: irq 100 for MSI/MSI-X
> [   33.735583] ixgbe 0000:07:00.0: irq 101 for MSI/MSI-X
> [   33.735585] ixgbe 0000:07:00.0: irq 102 for MSI/MSI-X
> [   33.735587] ixgbe 0000:07:00.0: irq 103 for MSI/MSI-X
> [   33.735589] ixgbe 0000:07:00.0: irq 104 for MSI/MSI-X
> [   33.735591] ixgbe 0000:07:00.0: irq 105 for MSI/MSI-X
> [   33.735593] ixgbe 0000:07:00.0: irq 106 for MSI/MSI-X
> [   33.735595] ixgbe 0000:07:00.0: irq 107 for MSI/MSI-X
> [   33.735597] ixgbe 0000:07:00.0: irq 108 for MSI/MSI-X
> [   33.735599] ixgbe 0000:07:00.0: irq 109 for MSI/MSI-X
> [   33.735602] ixgbe 0000:07:00.0: irq 110 for MSI/MSI-X
> [   33.735604] ixgbe 0000:07:00.0: irq 111 for MSI/MSI-X
> [   33.735606] ixgbe 0000:07:00.0: irq 112 for MSI/MSI-X
> [   33.735608] ixgbe 0000:07:00.0: irq 113 for MSI/MSI-X
> [   33.735610] ixgbe 0000:07:00.0: irq 114 for MSI/MSI-X
> [   33.735612] ixgbe 0000:07:00.0: irq 115 for MSI/MSI-X
> [   33.735614] ixgbe 0000:07:00.0: irq 116 for MSI/MSI-X
> [   33.735633] ixgbe: 0000:07:00.0: ixgbe_init_interrupt_scheme: Multiqueue Enabled: Rx Queue count = 16, Tx Queue count = 16
> [   33.735638] ixgbe 0000:07:00.0: (PCI Express:5.0Gb/s:Width x8) 00:1b:21:4a:fe:54
> [   33.735722] ixgbe 0000:07:00.0: MAC: 2, PHY: 11, SFP+: 5, PBA No: e66562-003
> [   33.738111] ixgbe 0000:07:00.0: Intel(R) 10 Gigabit Network Connection
> [   33.738135] ixgbe 0000:07:00.1: PCI INT B -> GSI 42 (level, low) -> IRQ 42
> [   33.738151] ixgbe 0000:07:00.1: setting latency timer to 64
> [   33.853526] ixgbe 0000:07:00.1: irq 117 for MSI/MSI-X
> [   33.853529] ixgbe 0000:07:00.1: irq 118 for MSI/MSI-X
> [   33.853532] ixgbe 0000:07:00.1: irq 119 for MSI/MSI-X
> [   33.853534] ixgbe 0000:07:00.1: irq 120 for MSI/MSI-X
> [   33.853536] ixgbe 0000:07:00.1: irq 121 for MSI/MSI-X
> [   33.853538] ixgbe 0000:07:00.1: irq 122 for MSI/MSI-X
> [   33.853540] ixgbe 0000:07:00.1: irq 123 for MSI/MSI-X
> [   33.853542] ixgbe 0000:07:00.1: irq 124 for MSI/MSI-X
> [   33.853544] ixgbe 0000:07:00.1: irq 125 for MSI/MSI-X
> [   33.853546] ixgbe 0000:07:00.1: irq 126 for MSI/MSI-X
> [   33.853548] ixgbe 0000:07:00.1: irq 127 for MSI/MSI-X
> [   33.853550] ixgbe 0000:07:00.1: irq 128 for MSI/MSI-X
> [   33.853552] ixgbe 0000:07:00.1: irq 129 for MSI/MSI-X
> [   33.853554] ixgbe 0000:07:00.1: irq 130 for MSI/MSI-X
> [   33.853556] ixgbe 0000:07:00.1: irq 131 for MSI/MSI-X
> [   33.853558] ixgbe 0000:07:00.1: irq 132 for MSI/MSI-X
> [   33.853560] ixgbe 0000:07:00.1: irq 133 for MSI/MSI-X
> [   33.853580] ixgbe: 0000:07:00.1: ixgbe_init_interrupt_scheme: Multiqueue Enabled: Rx Queue count = 16, Tx Queue count = 16
> [   33.853585] ixgbe 0000:07:00.1: (PCI Express:5.0Gb/s:Width x8) 00:1b:21:4a:fe:55
> [   33.853669] ixgbe 0000:07:00.1: MAC: 2, PHY: 11, SFP+: 5, PBA No: e66562-003
> [   33.855956] ixgbe 0000:07:00.1: Intel(R) 10 Gigabit Network Connection
> 
> [   85.208233] ixgbe: fiber1 NIC Link is Up 1 Gbps, Flow Control: RX/TX
> [   85.237453] ixgbe: fiber0 NIC Link is Up 1 Gbps, Flow Control: RX/TX
> [   96.080713] ixgbe: fiber1 NIC Link is Down
> [  102.094610] ixgbe: fiber0 NIC Link is Up 1 Gbps, Flow Control: None
> [  102.119572] ixgbe: fiber1 NIC Link is Up 1 Gbps, Flow Control: None
> [  142.524691] ixgbe: fiber1 NIC Link is Down
> [  148.421332] ixgbe: fiber1 NIC Link is Up 1 Gbps, Flow Control: None
> [  148.449465] ixgbe: fiber0 NIC Link is Up 1 Gbps, Flow Control: None
> [  160.728643] ixgbe: fiber1 NIC Link is Down
> [  172.832301] ixgbe: fiber0 NIC Link is Up 1 Gbps, Flow Control: None
> [  173.659038] ixgbe: fiber1 NIC Link is Up 1 Gbps, Flow Control: None
> [  184.554501] ixgbe: fiber0 NIC Link is Down
> [  185.376273] ixgbe: fiber1 NIC Link is Up 1 Gbps, Flow Control: None
> [  186.493598] ixgbe: fiber0 NIC Link is Up 1 Gbps, Flow Control: None
> [  190.564383] ixgbe: fiber0 NIC Link is Down
> [  191.391149] ixgbe: fiber1 NIC Link is Up 1 Gbps, Flow Control: None
> [  192.484492] ixgbe: fiber0 NIC Link is Up 1 Gbps, Flow Control: None
> [  192.545424] ixgbe: fiber1 NIC Link is Down
> [  205.858197] ixgbe: fiber0 NIC Link is Up 1 Gbps, Flow Control: None
> [  206.684940] ixgbe: fiber1 NIC Link is Up 1 Gbps, Flow Control: None
> [  211.991875] ixgbe: fiber1 NIC Link is Down
> [  220.833478] ixgbe: fiber1 NIC Link is Up 1 Gbps, Flow Control: None
> [  220.833630] ixgbe: fiber0 NIC Link is Up 1 Gbps, Flow Control: None
> [  229.804853] ixgbe: fiber1 NIC Link is Down
> [  248.395672] ixgbe: fiber0 NIC Link is Up 1 Gbps, Flow Control: None
> [  249.222408] ixgbe: fiber1 NIC Link is Up 1 Gbps, Flow Control: None
> [  484.631598] ixgbe: fiber1 NIC Link is Down
> [  490.138931] ixgbe: fiber1 NIC Link is Up 10 Gbps, Flow Control: None
> [  490.167880] ixgbe: fiber0 NIC Link is Up 10 Gbps, Flow Control: None

Maybe it's Flow Director?
Multiqueue on this network card works only if you set 1 queue to 1 cpu core in smp_affinity :(
In README:


Intel(R) Ethernet Flow Director
-------------------------------
Supports advanced filters that direct receive packets by their flows to
different queues. Enables tight control on routing a flow in the platform.
Matches flows and CPU cores for flow affinity. Supports multiple parameters
for flexible flow classification and load balancing.

Flow director is enabled only if the kernel is multiple TX queue capable.

An included script (set_irq_affinity.sh) automates setting the IRQ to CPU
affinity.

You can verify that the driver is using Flow Director by looking at the counter
in ethtool: fdir_miss and fdir_match.

The following three parameters impact Flow Director.


FdirMode
--------
Valid Range: 0-2 (0=off, 1=ATR, 2=Perfect filter mode)
Default Value: 1

  Flow Director filtering modes.


FdirPballoc
-----------
Valid Range: 0-2 (0=64k, 1=128k, 2=256k)
Default Value: 0

  Flow Director allocated packet buffer size.


AtrSampleRate
--------------
Valid Range: 1-100
Default Value: 20

  Software ATR Tx packet sample rate. For example, when set to 20, every 20th
  packet, looks to see if the packet will create a new flow.
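
For example, the fdir_match/fdir_miss counters mentioned above can be checked
with (interface name taken from earlier in the thread):

ethtool -S fiber0 | grep fdir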






^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints
  2009-11-24  8:38         ` Peter Zijlstra
@ 2009-11-24  8:59           ` Peter P Waskiewicz Jr
  2009-11-24  9:08             ` Peter Zijlstra
                               ` (2 more replies)
  0 siblings, 3 replies; 67+ messages in thread
From: Peter P Waskiewicz Jr @ 2009-11-24  8:59 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Yong Zhang, linux-kernel, arjan, davem, netdev, Thomas Gleixner

On Tue, 2009-11-24 at 01:38 -0700, Peter Zijlstra wrote:
> On Mon, 2009-11-23 at 15:32 -0800, Waskiewicz Jr, Peter P wrote:
> 
> > Unfortunately, a driver can't.  The irq_set_affinity() function isn't 
> > exported.  I proposed a patch on netdev to export it, and then to tie down 
> > an interrupt using IRQF_NOBALANCING, so irqbalance won't touch it.  That 
> > was rejected, since the driver is enforcing policy of the interrupt 
> > balancing, not irqbalance.
> 
> Why would a patch touching the irq subsystem go to netdev?

The only change to the IRQ subsystem was:

EXPORT_SYMBOL(irq_set_affinity);

The majority of the changeset was for the ixgbe driver.

> What is wrong with exporting irq_set_affinity(), and wtf do you need
> IRQF_NOBALANCING for?
> 

Again, the pushback I received was about allowing anything other than
irqbalance to dictate interrupt affinity policy.

And if I set interrupt affinity from the driver or from /proc,
irqbalance will happily rebalance the interrupt elsewhere.  The
IRQF_NOBALANCING flag will prevent irqbalance from being able to move
the interrupt.
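
For reference, the rejected approach amounted to roughly this in the driver
(handler, names and the node variable are placeholders; it also needs the
irq_set_affinity() export mentioned above):

	err = request_irq(entry->vector, example_msix_handler,
			  IRQF_NOBALANCING, "example-TxRx-0", ring);
	if (!err)
		irq_set_affinity(entry->vector, cpumask_of_node(node));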

> > I and Jesse Brandeburg had a meeting with Arjan about this.  What we came 
> > up with was this interface, so drivers can set what they'd like to see, if 
> > irqbalance decides to honor it.  That way interrupt affinity policies are 
> > set only by irqbalance, but this interface gives us a mechanism to hint to 
> > irqbalance what we'd like it to do.
> 
> If all you want is to expose policy to userspace then you don't need any
> of this, simply expose the NICs home node through a sysfs device thingy
> (I was under the impression its already there somewhere, but I can't
> ever find anything in /sys).
> 
> No need what so ever to poke at the IRQ subsystem.

The point is we need something common on the kernel side (which either a
driver or /proc can modify) that irqbalance can use.

> > Also, if you use the /proc interface to change smp_affinity on an 
> > interrupt without any of these changes, irqbalance will override it on its 
> > next poll interval.  This also is not desirable.
> 
> This all sounds backwards.. we've got a perfectly functional interface
> for affinity -- which people object to being used for some reason. So
> you add another interface on top, and that is ok?
> 

But it's not functional.  If I set the affinity in smp_affinity, then
irqbalance will override it 10 seconds later.

> All the while not CC'ing the IRQ folks,.. brilliant approach.

If I knew who I should CC, I'd be happy to add them.  Can you provide
email addresses please?

Cheers,
-PJ Waskiewicz


^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: ixgbe question
  2009-11-24  7:46                 ` Eric Dumazet
  2009-11-24  8:46                   ` Badalian Vyacheslav
@ 2009-11-24  9:07                   ` Peter P Waskiewicz Jr
  2009-11-24  9:55                     ` Eric Dumazet
  1 sibling, 1 reply; 67+ messages in thread
From: Peter P Waskiewicz Jr @ 2009-11-24  9:07 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: robert, Jesper Dangaard Brouer, Linux Netdev List

On Tue, 2009-11-24 at 00:46 -0700, Eric Dumazet wrote:
> Waskiewicz Jr, Peter P a écrit :
> > Ok, I was confused earlier.  I thought you were saying that all packets 
> > were headed into a single Rx queue.  This is different.
> > 
> > Do you know what version of irqbalance you're running, or if it's running 
> > at all?  We've seen issues with irqbalance where it won't recognize the 
> > ethernet device if the driver has been reloaded.  In that case, it won't 
> > balance the interrupts at all.  If the default affinity was set to one 
> > CPU, then well, you're screwed.
> > 
> > My suggestion in this case is after you reload ixgbe and start your tests, 
> > see if it all goes to one CPU.  If it does, then restart irqbalance 
> > (service irqbalance restart - or just kill it and restart by hand).  Then 
> > start running your test, and in 10 seconds you should see the interrupts 
> > move and spread out.
> > 
> > Let me know if this helps,
> 
> Sure it helps !
> 
> I tried without irqbalance and with irqbalance (Ubuntu 9.10 ships irqbalance 0.55-4)
> I can see irqbalance setting smp_affinities to 5555 or AAAA with no direct effect.
> 
> I do receive 16 different irqs, but all serviced on one cpu.
> 
> Only way to have irqs on different cpus is to manualy force irq affinities to be exclusive
> (one bit set in the mask, not several ones), and that is not optimal for moderate loads.
> 
> echo 1 >`echo /proc/irq/*/fiber1-TxRx-0/../smp_affinity`
> echo 1 >`echo /proc/irq/*/fiber1-TxRx-1/../smp_affinity`
> echo 4 >`echo /proc/irq/*/fiber1-TxRx-2/../smp_affinity`
> echo 4 >`echo /proc/irq/*/fiber1-TxRx-3/../smp_affinity`
> echo 10 >`echo /proc/irq/*/fiber1-TxRx-4/../smp_affinity`
> echo 10 >`echo /proc/irq/*/fiber1-TxRx-5/../smp_affinity`
> echo 40 >`echo /proc/irq/*/fiber1-TxRx-6/../smp_affinity`
> echo 40 >`echo /proc/irq/*/fiber1-TxRx-7/../smp_affinity`
> echo 100 >`echo /proc/irq/*/fiber1-TxRx-8/../smp_affinity`
> echo 100 >`echo /proc/irq/*/fiber1-TxRx-9/../smp_affinity`
> echo 400 >`echo /proc/irq/*/fiber1-TxRx-10/../smp_affinity`
> echo 400 >`echo /proc/irq/*/fiber1-TxRx-11/../smp_affinity`
> echo 1000 >`echo /proc/irq/*/fiber1-TxRx-12/../smp_affinity`
> echo 1000 >`echo /proc/irq/*/fiber1-TxRx-13/../smp_affinity`
> echo 4000 >`echo /proc/irq/*/fiber1-TxRx-14/../smp_affinity`
> echo 4000 >`echo /proc/irq/*/fiber1-TxRx-15/../smp_affinity`
> 
> 
> One other problem is that after reload of ixgbe driver, link is 95% of the time
> at 1 Gbps speed, and I could not find an easy way to force it being 10 Gbps
> 

You might have this elsewhere, but it sounds like you're connecting back
to back with another 82599 NIC.  Our optics in that NIC are dual-rate,
and the software mechanism that tries to "autoneg" link speed gets out
of sync easily in back-to-back setups.

If it's really annoying, and you're willing to run with a local patch to
disable the autotry mechanism, try this:

diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index a5036f7..62c0915 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -4670,6 +4670,10 @@ static void ixgbe_multispeed_fiber_task(struct work_struct *work)
        autoneg = hw->phy.autoneg_advertised;
        if ((!autoneg) && (hw->mac.ops.get_link_capabilities))
                hw->mac.ops.get_link_capabilities(hw, &autoneg, &negotiation);
+
+       /* force 10G only */
+       autoneg = IXGBE_LINK_SPEED_10GB_FULL;
+
        if (hw->mac.ops.setup_link)
                hw->mac.ops.setup_link(hw, autoneg, negotiation, true);
        adapter->flags |= IXGBE_FLAG_NEED_LINK_UPDATE;




Cheers,
-PJ


^ permalink raw reply related	[flat|nested] 67+ messages in thread

* Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints
  2009-11-24  8:59           ` Peter P Waskiewicz Jr
@ 2009-11-24  9:08             ` Peter Zijlstra
  2009-11-24  9:15               ` Peter P Waskiewicz Jr
  2009-11-24 14:43               ` Arjan van de Ven
  2009-11-24  9:15             ` Peter Zijlstra
  2009-11-24 10:07             ` Thomas Gleixner
  2 siblings, 2 replies; 67+ messages in thread
From: Peter Zijlstra @ 2009-11-24  9:08 UTC (permalink / raw)
  To: Peter P Waskiewicz Jr
  Cc: Yong Zhang, linux-kernel, arjan, davem, netdev, Thomas Gleixner

On Tue, 2009-11-24 at 00:59 -0800, Peter P Waskiewicz Jr wrote:
> > This all sounds backwards.. we've got a perfectly functional interface
> > for affinity -- which people object to being used for some reason. So
> > you add another interface on top, and that is ok?
> > 
> 
> But it's not functional.  If I set the affinity in smp_affinity, then
> irqbalance will override it 10 seconds later. 

And here I was thinking the kernel round-robins IRQ delivery on the mask
specified there. Are you talking about some daft userspace thing that
writes into the irq smp_affinity to effect irq balancing?


^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints
  2009-11-24  9:08             ` Peter Zijlstra
@ 2009-11-24  9:15               ` Peter P Waskiewicz Jr
  2009-11-24 14:43               ` Arjan van de Ven
  1 sibling, 0 replies; 67+ messages in thread
From: Peter P Waskiewicz Jr @ 2009-11-24  9:15 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Yong Zhang, linux-kernel, arjan, davem, netdev, Thomas Gleixner

On Tue, 2009-11-24 at 02:08 -0700, Peter Zijlstra wrote:
> On Tue, 2009-11-24 at 00:59 -0800, Peter P Waskiewicz Jr wrote:
> > > This all sounds backwards.. we've got a perfectly functional interface
> > > for affinity -- which people object to being used for some reason. So
> > > you add another interface on top, and that is ok?
> > > 
> > 
> > But it's not functional.  If I set the affinity in smp_affinity, then
> > irqbalance will override it 10 seconds later. 
> 
> And here I was thinking the kernel round-robins IRQ delivery on the mask
> specified there. Are you talking about some daft userspace thing that
> writes into the irq smp_affinity to effect irq balancing?
> 

Yep.  That's exactly what irqbalance does.

Cheers,
-PJ


^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints
  2009-11-24  8:59           ` Peter P Waskiewicz Jr
  2009-11-24  9:08             ` Peter Zijlstra
@ 2009-11-24  9:15             ` Peter Zijlstra
  2009-11-24 10:07             ` Thomas Gleixner
  2 siblings, 0 replies; 67+ messages in thread
From: Peter Zijlstra @ 2009-11-24  9:15 UTC (permalink / raw)
  To: Peter P Waskiewicz Jr
  Cc: Yong Zhang, linux-kernel, arjan, davem, netdev, Thomas Gleixner

On Tue, 2009-11-24 at 00:59 -0800, Peter P Waskiewicz Jr wrote:
> 
> > All the while not CC'ing the IRQ folks,.. brilliant approach.
> 
> If I knew who I should CC, I'd be happy to add them.  Can you provide
> email addresses please? 

Since most people can't seem to read a simple MAINTAINERS file, some
other people wrote a script to read it for you:

# scripts/get_maintainer.pl -f kernel/irq/manage.c
Ingo Molnar <mingo@elte.hu>
Thomas Gleixner <tglx@linutronix.de>
linux-kernel@vger.kernel.org


Another option is to do something like:

# git log kernel/irq/manage.c | grep Author | head -30 | awk '{ t[$0]++; } END { for (i in t) { print t[i] " " i; }}' | sort -rn

10 Author: Thomas Gleixner <tglx@linutronix.de>
9 Author: Ingo Molnar <mingo@elte.hu>
3 Author: Magnus Damm <damm@igel.co.jp>
2 Author: Linus Torvalds <torvalds@linux-foundation.org>
...


^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: ixgbe question
  2009-11-24  9:07                   ` Peter P Waskiewicz Jr
@ 2009-11-24  9:55                     ` Eric Dumazet
  2009-11-24 10:06                       ` Peter P Waskiewicz Jr
  2009-11-26 14:10                       ` Badalian Vyacheslav
  0 siblings, 2 replies; 67+ messages in thread
From: Eric Dumazet @ 2009-11-24  9:55 UTC (permalink / raw)
  To: Peter P Waskiewicz Jr; +Cc: robert, Jesper Dangaard Brouer, Linux Netdev List

Peter P Waskiewicz Jr wrote:

> You might have this elsewhere, but it sounds like you're connecting back
> to back with another 82599 NIC.  Our optics in that NIC are dual-rate,
> and the software mechanism that tries to "autoneg" link speed gets out
> of sync easily in back-to-back setups.
> 
> If it's really annoying, and you're willing to run with a local patch to
> disable the autotry mechanism, try this:
> 
> diff --git a/drivers/net/ixgbe/ixgbe_main.c
> b/drivers/net/ixgbe/ixgbe_main.c
> index a5036f7..62c0915 100644
> --- a/drivers/net/ixgbe/ixgbe_main.c
> +++ b/drivers/net/ixgbe/ixgbe_main.c
> @@ -4670,6 +4670,10 @@ static void ixgbe_multispeed_fiber_task(struct
> work_struct *work)
>         autoneg = hw->phy.autoneg_advertised;
>         if ((!autoneg) && (hw->mac.ops.get_link_capabilities))
>                 hw->mac.ops.get_link_capabilities(hw, &autoneg,
> &negotiation);
> +
> +       /* force 10G only */
> +       autoneg = IXGBE_LINK_SPEED_10GB_FULL;
> +
>         if (hw->mac.ops.setup_link)
>                 hw->mac.ops.setup_link(hw, autoneg, negotiation, true);
>         adapter->flags |= IXGBE_FLAG_NEED_LINK_UPDATE;

Thanks ! This did the trick :)

If I am not mistaken, the number of TX queues should be capped by the number of possible cpus ?

It's currently a fixed 128 value, allocating 128*128 = 16384 bytes,
and polluting "tc -s -d class show dev fiber0" output.

[PATCH net-next-2.6] ixgbe: Do not allocate too many netdev txqueues

Instead of allocating 128 struct netdev_queue per device, use the minimum
value between 128 and number of possible cpus, to reduce ram usage and
"tc -s -d class show dev ..." output

diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index ebcec30..ec2508d 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -5582,7 +5583,10 @@ static int __devinit ixgbe_probe(struct pci_dev *pdev,
 	pci_set_master(pdev);
 	pci_save_state(pdev);
 
-	netdev = alloc_etherdev_mq(sizeof(struct ixgbe_adapter), MAX_TX_QUEUES);
+	netdev = alloc_etherdev_mq(sizeof(struct ixgbe_adapter),
+				   min_t(unsigned int,
+					 MAX_TX_QUEUES,
+					 num_possible_cpus()));
 	if (!netdev) {
 		err = -ENOMEM;
 		goto err_alloc_etherdev;

^ permalink raw reply related	[flat|nested] 67+ messages in thread

* Re: ixgbe question
  2009-11-24  9:55                     ` Eric Dumazet
@ 2009-11-24 10:06                       ` Peter P Waskiewicz Jr
  2009-11-24 11:37                         ` [PATCH net-next-2.6] ixgbe: Fix TX stats accounting Eric Dumazet
  2009-11-24 13:14                         ` ixgbe question John Fastabend
  2009-11-26 14:10                       ` Badalian Vyacheslav
  1 sibling, 2 replies; 67+ messages in thread
From: Peter P Waskiewicz Jr @ 2009-11-24 10:06 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: robert, Jesper Dangaard Brouer, Linux Netdev List

On Tue, 2009-11-24 at 02:55 -0700, Eric Dumazet wrote:
> Peter P Waskiewicz Jr a écrit :
> 
> > You might have this elsewhere, but it sounds like you're connecting back
> > to back with another 82599 NIC.  Our optics in that NIC are dual-rate,
> > and the software mechanism that tries to "autoneg" link speed gets out
> > of sync easily in back-to-back setups.
> > 
> > If it's really annoying, and you're willing to run with a local patch to
> > disable the autotry mechanism, try this:
> > 
> > diff --git a/drivers/net/ixgbe/ixgbe_main.c
> > b/drivers/net/ixgbe/ixgbe_main.c
> > index a5036f7..62c0915 100644
> > --- a/drivers/net/ixgbe/ixgbe_main.c
> > +++ b/drivers/net/ixgbe/ixgbe_main.c
> > @@ -4670,6 +4670,10 @@ static void ixgbe_multispeed_fiber_task(struct
> > work_struct *work)
> >         autoneg = hw->phy.autoneg_advertised;
> >         if ((!autoneg) && (hw->mac.ops.get_link_capabilities))
> >                 hw->mac.ops.get_link_capabilities(hw, &autoneg,
> > &negotiation);
> > +
> > +       /* force 10G only */
> > +       autoneg = IXGBE_LINK_SPEED_10GB_FULL;
> > +
> >         if (hw->mac.ops.setup_link)
> >                 hw->mac.ops.setup_link(hw, autoneg, negotiation, true);
> >         adapter->flags |= IXGBE_FLAG_NEED_LINK_UPDATE;
> 
> Thanks ! This did the trick :)
> 
> If I am not mistaken, number of TX queues should be capped by number of possible cpus ?
> 
> Its currently a fixed 128 value, allocating 128*128 = 16384 bytes,
> and polluting "tc -s -d class show dev fiber0" output.
> 

Yes, this is a stupid issue we haven't gotten around to fixing yet.
This looks fine to me.  Thanks for putting it together.

> [PATCH net-next-2.6] ixgbe: Do not allocate too many netdev txqueues
> 
> Instead of allocating 128 struct netdev_queue per device, use the minimum
> value between 128 and number of possible cpus, to reduce ram usage and
> "tc -s -d class show dev ..." output
> 
> diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
> index ebcec30..ec2508d 100644
> --- a/drivers/net/ixgbe/ixgbe_main.c
> +++ b/drivers/net/ixgbe/ixgbe_main.c
> @@ -5582,7 +5583,10 @@ static int __devinit ixgbe_probe(struct pci_dev *pdev,
>  	pci_set_master(pdev);
>  	pci_save_state(pdev);
>  
> -	netdev = alloc_etherdev_mq(sizeof(struct ixgbe_adapter), MAX_TX_QUEUES);
> +	netdev = alloc_etherdev_mq(sizeof(struct ixgbe_adapter),
> +				   min_t(unsigned int,
> +					 MAX_TX_QUEUES,
> +					 num_possible_cpus()));
>  	if (!netdev) {
>  		err = -ENOMEM;
>  		goto err_alloc_etherdev;


^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints
  2009-11-24  8:59           ` Peter P Waskiewicz Jr
  2009-11-24  9:08             ` Peter Zijlstra
  2009-11-24  9:15             ` Peter Zijlstra
@ 2009-11-24 10:07             ` Thomas Gleixner
  2009-11-24 17:55               ` Peter P Waskiewicz Jr
  2009-11-25 11:18               ` Peter Zijlstra
  2 siblings, 2 replies; 67+ messages in thread
From: Thomas Gleixner @ 2009-11-24 10:07 UTC (permalink / raw)
  To: Peter P Waskiewicz Jr
  Cc: Peter Zijlstra, Yong Zhang, linux-kernel, arjan, davem, netdev,
	Jesse Barnes

On Tue, 24 Nov 2009, Peter P Waskiewicz Jr wrote:
> On Tue, 2009-11-24 at 01:38 -0700, Peter Zijlstra wrote:
> > On Mon, 2009-11-23 at 15:32 -0800, Waskiewicz Jr, Peter P wrote:
> > 
> > > Unfortunately, a driver can't.  The irq_set_affinity() function isn't 
> > > exported.  I proposed a patch on netdev to export it, and then to tie down 
> > > an interrupt using IRQF_NOBALANCING, so irqbalance won't touch it.  That 
> > > was rejected, since the driver is enforcing policy of the interrupt 
> > > balancing, not irqbalance.
> > 
> > Why would a patch touching the irq subsystem go to netdev?
> 
> The only change to the IRQ subsystem was:
> 
> EXPORT_SYMBOL(irq_set_affinity);

Which is still touching the generic irq subsystem and needs the ack of
the relevant maintainer. If there is a need to expose such an
interface to drivers then the maintainer wants to know exactly why and
needs to be part of the discussion of alternative solutions. Otherwise
you waste time on implementing stuff like the current patch which is
definitely not going anywhere near the irq subsystem.

> > If all you want is to expose policy to userspace then you don't need any
> > of this, simply expose the NICs home node through a sysfs device thingy
> > (I was under the impression its already there somewhere, but I can't
> > ever find anything in /sys).
> > 
> > No need what so ever to poke at the IRQ subsystem.
> 
> The point is we need something common that the kernel side (whether a
> driver or /proc can modify) that irqbalance can use.

/sys/class/net/ethX/device/numa_node 

perhaps ?
 
> > > Also, if you use the /proc interface to change smp_affinity on an 
> > > interrupt without any of these changes, irqbalance will override it on its 
> > > next poll interval.  This also is not desirable.
> > 
> > This all sounds backwards.. we've got a perfectly functional interface
> > for affinity -- which people object to being used for some reason. So
> > you add another interface on top, and that is ok?
> > 
> 
> But it's not functional.  If I set the affinity in smp_affinity, then
> irqbalance will override it 10 seconds later.

And to work around the brain wreckage of irqbalanced you want to
fiddle in the irq code instead of teaching irqbalanced to handle node
affinities ?

The only thing which is worth investigating is whether the irq core
code should honour the dev->numa_node setting and restrict the
possible irq affinity settings to that node. If a device is tied to a
node it makes a certain amount of sense to do that.
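
Roughly, such a restriction could look like this (purely an illustrative
sketch, not a tested patch; the helper name is made up):

	#include <linux/cpumask.h>
	#include <linux/errno.h>
	#include <linux/topology.h>

	/* illustrative: clamp a requested affinity to the CPUs of the device's node */
	static int clamp_affinity_to_node(int node, const struct cpumask *requested,
					  struct cpumask *effective)
	{
		cpumask_and(effective, requested, cpumask_of_node(node));

		/* refuse settings that leave no CPU on the device's node */
		if (cpumask_empty(effective))
			return -EINVAL;
		return 0;
	}

Whether the core should silently clamp or reject such a request is exactly
the kind of policy question that needs to be sorted out first.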

But such a change would not need a new interface in the irq core and
definitely not a new cpumask_t member in the irq_desc structure to
store a node affinity which can be expressed with a simple
integer.

But this needs more thought, and I want to know more about the
background and the reasoning for such a change.

Thanks,

	tglx




^ permalink raw reply	[flat|nested] 67+ messages in thread

* [PATCH net-next-2.6] ixgbe: Fix TX stats accounting
  2009-11-24 10:06                       ` Peter P Waskiewicz Jr
@ 2009-11-24 11:37                         ` Eric Dumazet
  2009-11-24 13:23                           ` Eric Dumazet
  2009-11-24 13:14                         ` ixgbe question John Fastabend
  1 sibling, 1 reply; 67+ messages in thread
From: Eric Dumazet @ 2009-11-24 11:37 UTC (permalink / raw)
  To: Peter P Waskiewicz Jr, Jeff Kirsher
  Cc: robert, Jesper Dangaard Brouer, Linux Netdev List, David S. Miller

Several CPUs can update netdev->stats.tx_bytes & netdev->stats.tx_packets
in parallel. In this case, TX stats are underestimated and false sharing
takes place.

After a pktgen session sending exactly 200000000 packets:
# ifconfig fiber0 | grep TX
          TX packets:198501982 errors:0 dropped:0 overruns:0 carrier:0


Multi-queue devices should instead use txq->tx_bytes & txq->tx_packets
in their xmit() method (the appropriate txq lock is already held by the
caller, so there is no cache line miss), or use appropriate locking.

After the patch, the same pktgen session gives:

# ifconfig fiber0 | grep TX
          TX packets:200000000 errors:0 dropped:0 overruns:0 carrier:0

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 drivers/net/ixgbe/ixgbe_main.c |   20 ++++----------------
 1 file changed, 4 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index ebcec30..1cea120 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -425,8 +425,6 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector,
 	tx_ring->total_packets += total_packets;
 	tx_ring->stats.packets += total_packets;
 	tx_ring->stats.bytes += total_bytes;
-	netdev->stats.tx_bytes += total_bytes;
-	netdev->stats.tx_packets += total_packets;
 	return (count < tx_ring->work_limit);
 }
 
@@ -5249,6 +5247,7 @@ static netdev_tx_t ixgbe_xmit_frame(struct sk_buff *skb,
 {
 	struct ixgbe_adapter *adapter = netdev_priv(netdev);
 	struct ixgbe_ring *tx_ring;
+	struct netdev_queue *txq;
 	unsigned int first;
 	unsigned int tx_flags = 0;
 	u8 hdr_len = 0;
@@ -5345,6 +5344,9 @@ static netdev_tx_t ixgbe_xmit_frame(struct sk_buff *skb,
 				tx_ring->atr_count = 0;
 			}
 		}
+		txq = netdev_get_tx_queue(netdev, r_idx);
+		txq->tx_bytes += skb->len;
+		txq->tx_packets++;
 		ixgbe_tx_queue(adapter, tx_ring, tx_flags, count, skb->len,
 		               hdr_len);
 		ixgbe_maybe_stop_tx(netdev, tx_ring, DESC_NEEDED);
@@ -5359,19 +5361,6 @@ static netdev_tx_t ixgbe_xmit_frame(struct sk_buff *skb,
 }
 
 /**
- * ixgbe_get_stats - Get System Network Statistics
- * @netdev: network interface device structure
- *
- * Returns the address of the device statistics structure.
- * The statistics are actually updated from the timer callback.
- **/
-static struct net_device_stats *ixgbe_get_stats(struct net_device *netdev)
-{
-	/* only return the current stats */
-	return &netdev->stats;
-}
-
-/**
  * ixgbe_set_mac - Change the Ethernet Address of the NIC
  * @netdev: network interface device structure
  * @p: pointer to an address structure
@@ -5501,7 +5490,6 @@ static const struct net_device_ops ixgbe_netdev_ops = {
 	.ndo_stop		= ixgbe_close,
 	.ndo_start_xmit		= ixgbe_xmit_frame,
 	.ndo_select_queue	= ixgbe_select_queue,
-	.ndo_get_stats		= ixgbe_get_stats,
 	.ndo_set_rx_mode        = ixgbe_set_rx_mode,
 	.ndo_set_multicast_list	= ixgbe_set_rx_mode,
 	.ndo_validate_addr	= eth_validate_addr,

^ permalink raw reply related	[flat|nested] 67+ messages in thread

* Re: ixgbe question
  2009-11-24 10:06                       ` Peter P Waskiewicz Jr
  2009-11-24 11:37                         ` [PATCH net-next-2.6] ixgbe: Fix TX stats accounting Eric Dumazet
@ 2009-11-24 13:14                         ` John Fastabend
  2009-11-29  8:18                           ` David Miller
  1 sibling, 1 reply; 67+ messages in thread
From: John Fastabend @ 2009-11-24 13:14 UTC (permalink / raw)
  To: Peter P Waskiewicz Jr
  Cc: Eric Dumazet, robert, Jesper Dangaard Brouer, Linux Netdev List

Peter P Waskiewicz Jr wrote:
> On Tue, 2009-11-24 at 02:55 -0700, Eric Dumazet wrote:
>   
>> Peter P Waskiewicz Jr wrote:
>>
>>     
>>> You might have this elsewhere, but it sounds like you're connecting back
>>> to back with another 82599 NIC.  Our optics in that NIC are dual-rate,
>>> and the software mechanism that tries to "autoneg" link speed gets out
>>> of sync easily in back-to-back setups.
>>>
>>> If it's really annoying, and you're willing to run with a local patch to
>>> disable the autotry mechanism, try this:
>>>
>>> diff --git a/drivers/net/ixgbe/ixgbe_main.c
>>> b/drivers/net/ixgbe/ixgbe_main.c
>>> index a5036f7..62c0915 100644
>>> --- a/drivers/net/ixgbe/ixgbe_main.c
>>> +++ b/drivers/net/ixgbe/ixgbe_main.c
>>> @@ -4670,6 +4670,10 @@ static void ixgbe_multispeed_fiber_task(struct
>>> work_struct *work)
>>>         autoneg = hw->phy.autoneg_advertised;
>>>         if ((!autoneg) && (hw->mac.ops.get_link_capabilities))
>>>                 hw->mac.ops.get_link_capabilities(hw, &autoneg,
>>> &negotiation);
>>> +
>>> +       /* force 10G only */
>>> +       autoneg = IXGBE_LINK_SPEED_10GB_FULL;
>>> +
>>>         if (hw->mac.ops.setup_link)
>>>                 hw->mac.ops.setup_link(hw, autoneg, negotiation, true);
>>>         adapter->flags |= IXGBE_FLAG_NEED_LINK_UPDATE;
>>>       
>> Thanks ! This did the trick :)
>>
>> If I am not mistaken, number of TX queues should be capped by number of possible cpus ?
>>
>> Its currently a fixed 128 value, allocating 128*128 = 16384 bytes,
>> and polluting "tc -s -d class show dev fiber0" output.
>>
>>     
>
> Yes, this is a stupid issue we haven't gotten around to fixing yet.
> This looks fine to me.  Thanks for putting it together.
>
>   
I believe the patch below will break DCB and FCoE though; both features
have the potential to set real_num_tx_queues to greater than the number
of CPUs.  This could result in real_num_tx_queues > num_tx_queues.

The current solution isn't that great either; maybe we should set the
count to the minimum of MAX_TX_QUEUES and num_possible_cpus() * 2 + 8.

That should cover the maximum possible queues for DCB, FCoE and their
combinations (a sketch follows the list below).

general multiq = num_possible_cpus()
DCB = 8 tx queues
FCoE = 2 * num_possible_cpus()
FCoE + DCB = 8 tx queues + num_possible_cpus()
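
Plugging those numbers in, the cap would look roughly like this
(MAX_TX_QUEUES is the driver's existing define; illustrative only,
not a tested patch):

	/* enough for FCoE (2 * CPUs) plus the 8 DCB traffic classes, capped at HW max */
	unsigned int indices = min_t(unsigned int, MAX_TX_QUEUES,
				     num_possible_cpus() * 2 + 8);

	netdev = alloc_etherdev_mq(sizeof(struct ixgbe_adapter), indices);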

thanks,
john.



>> [PATCH net-next-2.6] ixgbe: Do not allocate too many netdev txqueues
>>
>> Instead of allocating 128 struct netdev_queue per device, use the minimum
>> value between 128 and number of possible cpus, to reduce ram usage and
>> "tc -s -d class show dev ..." output
>>
>> diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
>> index ebcec30..ec2508d 100644
>> --- a/drivers/net/ixgbe/ixgbe_main.c
>> +++ b/drivers/net/ixgbe/ixgbe_main.c
>> @@ -5582,7 +5583,10 @@ static int __devinit ixgbe_probe(struct pci_dev *pdev,
>>  	pci_set_master(pdev);
>>  	pci_save_state(pdev);
>>  
>> -	netdev = alloc_etherdev_mq(sizeof(struct ixgbe_adapter), MAX_TX_QUEUES);
>> +	netdev = alloc_etherdev_mq(sizeof(struct ixgbe_adapter),
>> +				   min_t(unsigned int,
>> +					 MAX_TX_QUEUES,
>> +					 num_possible_cpus()));
>>  	if (!netdev) {
>>  		err = -ENOMEM;
>>  		goto err_alloc_etherdev;
>>     
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>   



^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCH net-next-2.6] ixgbe: Fix TX stats accounting
  2009-11-24 11:37                         ` [PATCH net-next-2.6] ixgbe: Fix TX stats accounting Eric Dumazet
@ 2009-11-24 13:23                           ` Eric Dumazet
  2009-11-25  7:38                             ` Jeff Kirsher
  0 siblings, 1 reply; 67+ messages in thread
From: Eric Dumazet @ 2009-11-24 13:23 UTC (permalink / raw)
  To: Peter P Waskiewicz Jr, Jeff Kirsher
  Cc: robert, Jesper Dangaard Brouer, Linux Netdev List, David S. Miller

Here is an updated version, because ixgbe_get_ethtool_stats()
needs to call dev_get_stats() or "ethtool -S" won't give
correct tx_bytes/tx_packets values.

[PATCH net-next-2.6] ixgbe: Fix TX stats accounting

Several CPUs can update netdev->stats.tx_bytes & netdev->stats.tx_packets
in parallel. In this case, TX stats are underestimated and false sharing
takes place.

After a pktgen session sending exactly 200000000 packets:
# ifconfig fiber0 | grep TX
          TX packets:198501982 errors:0 dropped:0 overruns:0 carrier:0


Multi-queue devices should instead use txq->tx_bytes & txq->tx_packets
in their xmit() method (the appropriate txq lock is already held by the
caller, so there is no cache line miss), or use appropriate locking.

After the patch, the same pktgen session gives:

# ifconfig fiber0 | grep TX
          TX packets:200000000 errors:0 dropped:0 overruns:0 carrier:0

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
 drivers/net/ixgbe/ixgbe_ethtool.c |    1 +
 drivers/net/ixgbe/ixgbe_main.c    |   20 ++++----------------
 2 files changed, 5 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ixgbe/ixgbe_ethtool.c b/drivers/net/ixgbe/ixgbe_ethtool.c
index 74f04e1..7b7f8f6 100644
--- a/drivers/net/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ixgbe/ixgbe_ethtool.c
@@ -944,6 +944,7 @@ static void ixgbe_get_ethtool_stats(struct net_device *netdev,
 	char *p = NULL;
 
 	ixgbe_update_stats(adapter);
+	dev_get_stats(netdev);
 	for (i = 0; i < IXGBE_GLOBAL_STATS_LEN; i++) {
 		switch (ixgbe_gstrings_stats[i].type) {
 		case NETDEV_STATS:
diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index ebcec30..1cea120 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -425,8 +425,6 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector,
 	tx_ring->total_packets += total_packets;
 	tx_ring->stats.packets += total_packets;
 	tx_ring->stats.bytes += total_bytes;
-	netdev->stats.tx_bytes += total_bytes;
-	netdev->stats.tx_packets += total_packets;
 	return (count < tx_ring->work_limit);
 }
 
@@ -5249,6 +5247,7 @@ static netdev_tx_t ixgbe_xmit_frame(struct sk_buff *skb,
 {
 	struct ixgbe_adapter *adapter = netdev_priv(netdev);
 	struct ixgbe_ring *tx_ring;
+	struct netdev_queue *txq;
 	unsigned int first;
 	unsigned int tx_flags = 0;
 	u8 hdr_len = 0;
@@ -5345,6 +5344,9 @@ static netdev_tx_t ixgbe_xmit_frame(struct sk_buff *skb,
 				tx_ring->atr_count = 0;
 			}
 		}
+		txq = netdev_get_tx_queue(netdev, r_idx);
+		txq->tx_bytes += skb->len;
+		txq->tx_packets++;
 		ixgbe_tx_queue(adapter, tx_ring, tx_flags, count, skb->len,
 		               hdr_len);
 		ixgbe_maybe_stop_tx(netdev, tx_ring, DESC_NEEDED);
@@ -5359,19 +5361,6 @@ static netdev_tx_t ixgbe_xmit_frame(struct sk_buff *skb,
 }
 
 /**
- * ixgbe_get_stats - Get System Network Statistics
- * @netdev: network interface device structure
- *
- * Returns the address of the device statistics structure.
- * The statistics are actually updated from the timer callback.
- **/
-static struct net_device_stats *ixgbe_get_stats(struct net_device *netdev)
-{
-	/* only return the current stats */
-	return &netdev->stats;
-}
-
-/**
  * ixgbe_set_mac - Change the Ethernet Address of the NIC
  * @netdev: network interface device structure
  * @p: pointer to an address structure
@@ -5501,7 +5490,6 @@ static const struct net_device_ops ixgbe_netdev_ops = {
 	.ndo_stop		= ixgbe_close,
 	.ndo_start_xmit		= ixgbe_xmit_frame,
 	.ndo_select_queue	= ixgbe_select_queue,
-	.ndo_get_stats		= ixgbe_get_stats,
 	.ndo_set_rx_mode        = ixgbe_set_rx_mode,
 	.ndo_set_multicast_list	= ixgbe_set_rx_mode,
 	.ndo_validate_addr	= eth_validate_addr,


^ permalink raw reply related	[flat|nested] 67+ messages in thread

* Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints
  2009-11-24  8:39         ` Peter Zijlstra
@ 2009-11-24 14:42           ` Arjan van de Ven
  2009-11-24 17:39           ` David Miller
  1 sibling, 0 replies; 67+ messages in thread
From: Arjan van de Ven @ 2009-11-24 14:42 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Peter P Waskiewicz Jr, Yong Zhang, linux-kernel, arjan, davem, netdev

Peter Zijlstra wrote:
>>> Same for userspace.
>> the problem is that there is no way currently that the driver can communicate
>> "I allocated all my metadata on THIS numa node". irqbalance and sysadmins need
>> that to not make really stupid decisions.....
> 
> And what exactly is struct device::numa_node good for then?
> 

And that is exported to userspace... how?

... that has nothing to do with an irq, and it also falls flat for a driver
that supports multiple irqs and assigns one to each NUMA node.



^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints
  2009-11-24  9:08             ` Peter Zijlstra
  2009-11-24  9:15               ` Peter P Waskiewicz Jr
@ 2009-11-24 14:43               ` Arjan van de Ven
  1 sibling, 0 replies; 67+ messages in thread
From: Arjan van de Ven @ 2009-11-24 14:43 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Peter P Waskiewicz Jr, Yong Zhang, linux-kernel, arjan, davem,
	netdev, Thomas Gleixner

Peter Zijlstra wrote:
> On Tue, 2009-11-24 at 00:59 -0800, Peter P Waskiewicz Jr wrote:
>>> This all sounds backwards.. we've got a perfectly functional interface
>>> for affinity -- which people object to being used for some reason. So
>>> you add another interface on top, and that is ok?
>>>
>> But it's not functional.  If I set the affinity in smp_affinity, then
>> irqbalance will override it 10 seconds later. 
> 
> And here I was thinking the kernel round-robins IRQ delivery on the mask
> specified there. 

The kernel does no such thing, nor does it have code to do so.

 > Are you talking about some daft userspace thing that
> writes into the irq smp_affinity to effect irq balancing?

thanks ;)




^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints
  2009-11-24  8:39         ` Peter Zijlstra
  2009-11-24 14:42           ` Arjan van de Ven
@ 2009-11-24 17:39           ` David Miller
  2009-11-24 17:56             ` Peter P Waskiewicz Jr
  1 sibling, 1 reply; 67+ messages in thread
From: David Miller @ 2009-11-24 17:39 UTC (permalink / raw)
  To: peterz
  Cc: arjan, peter.p.waskiewicz.jr, yong.zhang0, linux-kernel, arjan, netdev

From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 24 Nov 2009 09:39:46 +0100

> On Mon, 2009-11-23 at 22:07 -0800, Arjan van de Ven wrote:
>> the problem is that there is no way currently that the driver can communicate
>> "I allocated all my metadata on THIS numa node". irqbalance and sysadmins need
>> that to not make really stupid decisions.....
> 
> And what exactly is struct device::numa_node good for then?

device->numa_node just says where the device is.

For better performance, it can make sense to, for example, allocate the ring
buffers for different device queues on other NUMA nodes.

That's the kind of thing PJ is trying to make available.

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints
  2009-11-24 10:07             ` Thomas Gleixner
@ 2009-11-24 17:55               ` Peter P Waskiewicz Jr
  2009-11-25 11:18               ` Peter Zijlstra
  1 sibling, 0 replies; 67+ messages in thread
From: Peter P Waskiewicz Jr @ 2009-11-24 17:55 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Peter Zijlstra, Yong Zhang, linux-kernel, arjan, davem, netdev,
	Jesse Barnes

On Tue, 2009-11-24 at 03:07 -0700, Thomas Gleixner wrote:
> On Tue, 24 Nov 2009, Peter P Waskiewicz Jr wrote:
> > On Tue, 2009-11-24 at 01:38 -0700, Peter Zijlstra wrote:
> > > On Mon, 2009-11-23 at 15:32 -0800, Waskiewicz Jr, Peter P wrote:
> > > 
> > > > Unfortunately, a driver can't.  The irq_set_affinity() function isn't 
> > > > exported.  I proposed a patch on netdev to export it, and then to tie down 
> > > > an interrupt using IRQF_NOBALANCING, so irqbalance won't touch it.  That 
> > > > was rejected, since the driver is enforcing policy of the interrupt 
> > > > balancing, not irqbalance.
> > > 
> > > Why would a patch touching the irq subsystem go to netdev?
> > 
> > The only change to the IRQ subsystem was:
> > 
> > EXPORT_SYMBOL(irq_set_affinity);
> 
> Which is still touching the generic irq subsystem and needs the ack of
> the relevant maintainer. If there is a need to expose such an
> interface to drivers then the maintainer wants to know exactly why and
> needs to be part of the discussion of alternative solutions. Otherwise
> you waste time on implementing stuff like the current patch which is
> definitely not going anywhere near the irq subsystem.
> 

Understood, and duly noted.

> > > If all you want is to expose policy to userspace then you don't need any
> > > of this, simply expose the NICs home node through a sysfs device thingy
> > > (I was under the impression its already there somewhere, but I can't
> > > ever find anything in /sys).
> > > 
> > > No need what so ever to poke at the IRQ subsystem.
> > 
> > The point is we need something common that the kernel side (whether a
> > driver or /proc can modify) that irqbalance can use.
> 
> /sys/class/net/ethX/device/numa_node 
> 
> perhaps ?

What I'm trying to do, though, is one-to-many NUMA node assignment.  See
below for a better overview of the issue we're trying to solve.

>  
> > > > Also, if you use the /proc interface to change smp_affinity on an 
> > > > interrupt without any of these changes, irqbalance will override it on its 
> > > > next poll interval.  This also is not desirable.
> > > 
> > > This all sounds backwards.. we've got a perfectly functional interface
> > > for affinity -- which people object to being used for some reason. So
> > > you add another interface on top, and that is ok?
> > > 
> > 
> > But it's not functional.  If I set the affinity in smp_affinity, then
> > irqbalance will override it 10 seconds later.
> 
> And to work around the brain wreckage of irqbalanced you want to
> fiddle in the irq code instead of teaching irqbalanced to handle node
> affinities ?
> 
> The only thing which is worth to investigate is whether the irq core
> code should honour the dev->numa_node setting and restrict the
> possible irq affinity settings to that node. If a device is tied to a
> node it makes a certain amount of sense to do that.
> 
> But such a change would not need a new interface in the irq core and
> definitely not a new cpumask_t member in the irq_desc structure to
> store a node affinity which can be expressed with a simple
> integer.
> 
> But this needs more thoughts and I want to know more about the
> background and the reasoning for such a change.
> 

I'll use the ixgbe driver as my example, since that is where my
immediate problems are.  This is our 10GbE device; it supports 128 Rx
queues and 128 Tx queues, and has a maximum of 64 MSI-X vectors.  In a
typical case, let's say an 8-core machine (Nehalem-EP with
hyperthreading off) brings one port online.  We'll allocate 8 Rx and 8
Tx queues.  When these allocations occur, we want to allocate the memory
for our descriptor rings, buffer structs, and DMA areas onto the
various NUMA nodes.  This will promote spreading of the load not just
across the CPUs, but also across the memory controllers.
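
As an illustration only (the array and field names here are invented,
not the actual ixgbe code), the spreading looks conceptually like this:

	/* one queue per CPU; put each queue's software state on that CPU's node */
	for (i = 0; i < num_queues; i++) {
		int node = cpu_to_node(i % num_online_cpus());

		ring[i] = kzalloc_node(sizeof(*ring[i]), GFP_KERNEL, node);
		if (!ring[i])
			return -ENOMEM;
		ring[i]->numa_node = node;	/* remembered later when setting the irq hint */
	}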

If we were to just run like that and have irqbalance move our vectors to
a single node, then we'd have half of our network resources creating
cross-node traffic, which is undesirable, since the OS may have to take
locks across nodes to get the memory it's looking for.

The bottom line is we need some mechanism that allows a driver/user to
deterministically assign the underlying interrupt resources to the
correct NUMA node for each interrupt.  And in the example above, we may
have more than one NUMA node we need to balance into.

Please let me know if I've explained this well enough.  I appreciate the
time.

Cheers,
-PJ Waskiewicz


^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints
  2009-11-24 17:39           ` David Miller
@ 2009-11-24 17:56             ` Peter P Waskiewicz Jr
  2009-11-24 18:26               ` Eric Dumazet
  0 siblings, 1 reply; 67+ messages in thread
From: Peter P Waskiewicz Jr @ 2009-11-24 17:56 UTC (permalink / raw)
  To: David Miller; +Cc: peterz, arjan, yong.zhang0, linux-kernel, arjan, netdev

On Tue, 2009-11-24 at 09:39 -0800, David Miller wrote:
> From: Peter Zijlstra <peterz@infradead.org>
> Date: Tue, 24 Nov 2009 09:39:46 +0100
> 
> > On Mon, 2009-11-23 at 22:07 -0800, Arjan van de Ven wrote:
> >> the problem is that there is no way currently that the driver can communicate
> >> "I allocated all my metadata on THIS numa node". irqbalance and sysadmins need
> >> that to not make really stupid decisions.....
> > 
> > And what exactly is struct device::numa_node good for then?
> 
> device->numa_node just says where the device is.
> 
> For better performance, it can make sense to, for example, allocate the ring
> buffers for different device queues on other NUMA nodes.
> 
> That's the kind of thing PJ is trying to make available.

Yes, that's exactly what I'm trying to do.  Going even further, we want to
allocate the ring SW struct itself and the descriptor structures on other
NUMA nodes, and make sure each interrupt lines up with those allocations.

Cheers,
-PJ


^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints
  2009-11-24 17:56             ` Peter P Waskiewicz Jr
@ 2009-11-24 18:26               ` Eric Dumazet
  2009-11-24 18:33                 ` Peter P Waskiewicz Jr
  2009-11-24 18:54                 ` David Miller
  0 siblings, 2 replies; 67+ messages in thread
From: Eric Dumazet @ 2009-11-24 18:26 UTC (permalink / raw)
  To: Peter P Waskiewicz Jr
  Cc: David Miller, peterz, arjan, yong.zhang0, linux-kernel, arjan, netdev

Peter P Waskiewicz Jr wrote:
>> That's the kind of thing PJ is trying to make available.
> 
> Yes, that's exactly what I'm trying to do.  Even further, we want to
> allocate the ring SW struct itself and descriptor structures on other
> NUMA nodes, and make sure the interrupt lines up with those allocations.
> 

Say you allocate ring buffers on the NUMA node of the CPU handling the
interrupt for a particular queue.

If irqbalance or an admin changes /proc/irq/{number}/smp_affinity,
do you want to reallocate the ring buffer on another NUMA node?

It seems complex to me; maybe the optimal thing would be to use a NUMA policy
to spread vmalloc() allocations across all nodes to get good bandwidth...


^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints
  2009-11-24 18:26               ` Eric Dumazet
@ 2009-11-24 18:33                 ` Peter P Waskiewicz Jr
  2009-11-24 19:01                   ` Eric Dumazet
  2009-11-24 18:54                 ` David Miller
  1 sibling, 1 reply; 67+ messages in thread
From: Peter P Waskiewicz Jr @ 2009-11-24 18:33 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, peterz, arjan, yong.zhang0, linux-kernel, arjan, netdev

On Tue, 2009-11-24 at 10:26 -0800, Eric Dumazet wrote:
> Peter P Waskiewicz Jr wrote:
>  That's the kind of thing PJ is trying to make available.
> > 
> > Yes, that's exactly what I'm trying to do.  Even further, we want to
> > allocate the ring SW struct itself and descriptor structures on other
> > NUMA nodes, and make sure the interrupt lines up with those allocations.
> > 
> 
> Say you allocate ring buffers on NUMA node of the CPU handling interrupt
> on a particular queue.
> 
> If irqbalance or an admin changes /proc/irq/{number}/smp_affinities,
> do you want to realloc ring buffer to another NUMA node ?
> 

That's why I'm trying to add the node_affinity mechanism that irqbalance
can use to prevent the interrupt being moved to another node.

> It seems complex to me, maybe optimal thing would be to use a NUMA policy to
> spread vmalloc() allocations to all nodes to get a good bandwidth...

That's exactly what we're doing in our 10GbE driver right now (it isn't
pushed upstream yet; we're still finalizing our testing).  We spread across
all NUMA nodes in a semi-intelligent fashion when allocating our rings and
buffers.  The last piece is ensuring the interrupts tied to the various
queues all route to the NUMA nodes those CPUs belong to.  irqbalance
needs some kind of hint to make sure it does the right thing, which
today it does not.

I don't see how this is complex though.  The driver loads, allocates across
the NUMA nodes for optimal throughput, then writes the CPU masks for the
NUMA nodes each interrupt belongs to.  irqbalance comes along, looks
at the new mask "hint," and then balances that interrupt within that
hinted mask.
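
For illustration, the driver side of that flow would be roughly this,
using the node_affinity setter proposed in this patch (a sketch only;
the msix_entries table is the adapter's MSI-X vector list, and the
per-ring numa_node field is invented here):

	/* hint irqbalance: keep each vector on the node its ring lives on */
	for (i = 0; i < num_queues; i++)
		irq_set_node_affinity(adapter->msix_entries[i].vector,
				      cpumask_of_node(ring[i]->numa_node));

irqbalance then reads that hint and only balances each of those interrupts
within its hinted mask.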

Cheers,
-PJ


^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints
  2009-11-24 18:26               ` Eric Dumazet
  2009-11-24 18:33                 ` Peter P Waskiewicz Jr
@ 2009-11-24 18:54                 ` David Miller
  2009-11-24 18:58                   ` Eric Dumazet
  1 sibling, 1 reply; 67+ messages in thread
From: David Miller @ 2009-11-24 18:54 UTC (permalink / raw)
  To: eric.dumazet
  Cc: peter.p.waskiewicz.jr, peterz, arjan, yong.zhang0, linux-kernel,
	arjan, netdev

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 24 Nov 2009 19:26:15 +0100

> It seems complex to me, maybe optimal thing would be to use a NUMA policy to
> spread vmalloc() allocations to all nodes to get a good bandwidth...

vmalloc() and sk_buff's don't currently mix and I really don't see us
every allowing them to :-)

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints
  2009-11-24 18:54                 ` David Miller
@ 2009-11-24 18:58                   ` Eric Dumazet
  2009-11-24 20:35                     ` Andi Kleen
  0 siblings, 1 reply; 67+ messages in thread
From: Eric Dumazet @ 2009-11-24 18:58 UTC (permalink / raw)
  To: David Miller
  Cc: peter.p.waskiewicz.jr, peterz, arjan, yong.zhang0, linux-kernel,
	arjan, netdev

David Miller wrote:
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Tue, 24 Nov 2009 19:26:15 +0100
> 
>> It seems complex to me, maybe optimal thing would be to use a NUMA policy to
>> spread vmalloc() allocations to all nodes to get a good bandwidth...
> 
> vmalloc() and sk_buff's don't currently mix and I really don't see us
> every allowing them to :-)

I think Peter was referring to tx/rx ring buffers, not sk_buffs.

They (ring buffers) are allocated with vmalloc() at driver init time.

And Tom pointed out that our rx sk_buff allocation should be using the node
of the requester; there is no need to hardcode a node number per rx queue (or
per device, as is done today).



^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints
  2009-11-24 18:33                 ` Peter P Waskiewicz Jr
@ 2009-11-24 19:01                   ` Eric Dumazet
  2009-11-24 19:53                     ` Peter P Waskiewicz Jr
  0 siblings, 1 reply; 67+ messages in thread
From: Eric Dumazet @ 2009-11-24 19:01 UTC (permalink / raw)
  To: Peter P Waskiewicz Jr
  Cc: David Miller, peterz, arjan, yong.zhang0, linux-kernel, arjan, netdev

Peter P Waskiewicz Jr wrote:

> That's exactly what we're doing in our 10GbE driver right now (isn't
> pushed upstream yet, still finalizing our testing).  We spread to all
> NUMA nodes in a semi-intelligent fashion when allocating our rings and
> buffers.  The last piece is ensuring the interrupts tied to the various
> queues all route to the NUMA nodes those CPUs belong to.  irqbalance
> needs some kind of hint to make sure it does the right thing, which
> today it does not.

sk_buff allocations should be done on the node of the CPU handling rx interrupts.

For rings, I am OK with irqbalance and driver cooperation, in case the admin
doesn't want to change the defaults.

> 
> I don't see how this is complex though.  Driver loads, allocates across
> the NUMA nodes for optimal throughput, then writes CPU masks for the
> NUMA nodes each interrupt belongs to.  irqbalance comes along and looks
> at the new mask "hint," and then balances that interrupt within that
> hinted mask.

So the NUMA policy is given by the driver at load time?

An admin might choose to direct all NIC traffic to a given node because
the machine has a mixed workload: 3 nodes out of 4 for the database workload,
one node for network IO...

So if an admin changes smp_affinity, is your driver able to reconfigure itself
and re-allocate all its rings on the NUMA node chosen by the admin?  This is
what I would call complex.

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints
  2009-11-24 19:01                   ` Eric Dumazet
@ 2009-11-24 19:53                     ` Peter P Waskiewicz Jr
  0 siblings, 0 replies; 67+ messages in thread
From: Peter P Waskiewicz Jr @ 2009-11-24 19:53 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, peterz, arjan, yong.zhang0, linux-kernel, arjan, netdev

On Tue, 2009-11-24 at 11:01 -0800, Eric Dumazet wrote:
> Peter P Waskiewicz Jr wrote:
> 
> > That's exactly what we're doing in our 10GbE driver right now (isn't
> > pushed upstream yet, still finalizing our testing).  We spread to all
> > NUMA nodes in a semi-intelligent fashion when allocating our rings and
> > buffers.  The last piece is ensuring the interrupts tied to the various
> > queues all route to the NUMA nodes those CPUs belong to.  irqbalance
> > needs some kind of hint to make sure it does the right thing, which
> > today it does not.
> 
> sk_buff allocations should be done on the node of the cpu handling rx interrupts.

Yes, but we preallocate the buffers to minimize overhead when running
our interrupt routines.  Regardless, whatever queue we're filling with
those sk_buffs has an interrupt vector attached.  So wherever the
descriptor ring/queue and its associated buffers were allocated is
where the interrupt's affinity needs to point.

> For rings, I am ok for irqbalance and driver cooperation, in case admin
>  doesnt want to change the defaults.
> 
> > 
> > I don't see how this is complex though.  Driver loads, allocates across
> > the NUMA nodes for optimal throughput, then writes CPU masks for the
> > NUMA nodes each interrupt belongs to.  irqbalance comes along and looks
> > at the new mask "hint," and then balances that interrupt within that
> > hinted mask.
> 
> So NUMA policy is given by the driver at load time ?

I think it would have to be.  Nobody else has insight into how the driver
allocated its resources.  So either the driver can be told where to allocate
(see below), or the driver needs to indicate upwards how it allocated its
resources.

> An admin might chose to direct all NIC trafic to a given node, because
> its machine has mixed workload. 3 nodes out of 4 for database workload,
> one node for network IO...
> 
> So if an admin changes smp_affinity, is your driver able to reconfigure itself
> and re-allocate all its rings to be on NUMA node chosen by admin ? This is
> what I qualify as complex.

No, we don't want to go this route of reallocation.  This, I agree, is
very complex, and can be very devastating.  We'd basically be resetting
the driver whenever an interrupt moved, so this could be a terrible DoS
vulnerability.

Jesse Brandeburg has a set of patches he's working on that will allow us
to bind an interface to a single node.  So in your example of 3 nodes
for DB workload and 1 for network I/O, the driver can be loaded and
directly bound to that 4th node.  Then the node_affinity mask would be
set by the driver to the CPU mask of that single node.  But in these
deployments, a sysadmin changing the affinity in a way that flies directly
in the face of how resources are laid out is poor system administration.
I know it will happen, but I don't know how far we need to protect
sysadmins from shooting themselves in the foot in terms of performance
tuning.

Cheers,
-PJ


^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints
  2009-11-24 18:58                   ` Eric Dumazet
@ 2009-11-24 20:35                     ` Andi Kleen
  2009-11-24 20:46                       ` Eric Dumazet
  0 siblings, 1 reply; 67+ messages in thread
From: Andi Kleen @ 2009-11-24 20:35 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, peter.p.waskiewicz.jr, peterz, arjan, yong.zhang0,
	linux-kernel, arjan, netdev

Eric Dumazet <eric.dumazet@gmail.com> writes:

> David Miller wrote:
>> From: Eric Dumazet <eric.dumazet@gmail.com>
>> Date: Tue, 24 Nov 2009 19:26:15 +0100
>> 
>>> It seems complex to me, maybe optimal thing would be to use a NUMA policy to
>>> spread vmalloc() allocations to all nodes to get a good bandwidth...
>> 
>> vmalloc() and sk_buff's don't currently mix and I really don't see us
>> every allowing them to :-)
>
> I think Peter was referring to tx/rx rings buffers, not sk_buffs.
>
> They (ring buffers) are allocated with vmalloc() at driver init time.

They are typically allocated with dma_alloc_coherent(), which does
allocate a contiguous area.  In theory you could do interleaving
with IOMMUs, but just putting it on the same node as the device
is probably better.

-Andi
-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints
  2009-11-24 20:35                     ` Andi Kleen
@ 2009-11-24 20:46                       ` Eric Dumazet
  2009-11-25 10:30                         ` Eric Dumazet
  0 siblings, 1 reply; 67+ messages in thread
From: Eric Dumazet @ 2009-11-24 20:46 UTC (permalink / raw)
  To: Andi Kleen
  Cc: David Miller, peter.p.waskiewicz.jr, peterz, arjan, yong.zhang0,
	linux-kernel, arjan, netdev

Andi Kleen wrote:
> They are typically allocated with dma_alloc_coherent(), which does
> allocate a continuous area.  In theory you could do interleaving
> with IOMMus, but just putting it on the same node as the device
> is probably better.

There are two parts: the biggest one is allocated with vmalloc()
(to hold the struct ixgbe_rx_buffer array, 32 bytes or more per entry)
and is only used by the driver (not the adapter),

and one is allocated with pci_alloc_consistent()
(to hold the ixgbe_adv_tx_desc array, 16 bytes per entry).

The vmalloc() one could be spread across many nodes.
I am not speaking about the pci_alloc_consistent() one :)
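
To illustrate the split (this is a sketch, not the actual driver code;
the field names are the ixgbe ones, vmalloc_node() is the hypothetical
per-node variant, and ring_node stands for whichever node the queue gets
assigned to):

	/* driver-private bookkeeping: could be placed per-queue on a chosen node */
	rx_ring->rx_buffer_info = vmalloc_node(count * sizeof(struct ixgbe_rx_buffer),
					       ring_node);

	/* descriptor ring shared with the adapter: coherent DMA, near the device */
	rx_ring->desc = pci_alloc_consistent(pdev,
					     count * sizeof(union ixgbe_adv_rx_desc),
					     &rx_ring->dma);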



^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCH net-next-2.6] ixgbe: Fix TX stats accounting
  2009-11-24 13:23                           ` Eric Dumazet
@ 2009-11-25  7:38                             ` Jeff Kirsher
  2009-11-25  9:31                               ` Eric Dumazet
  0 siblings, 1 reply; 67+ messages in thread
From: Jeff Kirsher @ 2009-11-25  7:38 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Peter P Waskiewicz Jr, robert, Jesper Dangaard Brouer,
	Linux Netdev List, David S. Miller

On Tue, Nov 24, 2009 at 05:23, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Here is an updated version, because ixgbe_get_ethtool_stats()
> needs to call dev_get_stats() or "ethtool -S" wont give
> correct tx_bytes/tx_packets values.
>
> [PATCH net-next-2.6] ixgbe: Fix TX stats accounting
>
> Several cpus can update netdev->stats.tx_bytes & netdev->stats.tx_packets
> in parallel. In this case, TX stats are under estimated and false sharing
> takes place.
>
> After a pktgen session sending exactly 200000000 packets :
> # ifconfig fiber0 | grep TX
>          TX packets:198501982 errors:0 dropped:0 overruns:0 carrier:0
>
>
> Multi queue devices should instead use txq->tx_bytes & txq->tx_packets
> in their xmit() method (appropriate txq lock already held by caller, no
> cache line miss), or use appropriate locking.
>
> After patch, same pktgen session gives :
>
> # ifconfig fiber0 | grep TX
>          TX packets:200000000 errors:0 dropped:0 overruns:0 carrier:0
>
> Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
> ---
>  drivers/net/ixgbe/ixgbe_ethtool.c |    1 +
>  drivers/net/ixgbe/ixgbe_main.c    |   20 ++++----------------
>  2 files changed, 5 insertions(+), 16 deletions(-)
>

Thanks Eric.  I have added the patch to my tree for testing and review.

-- 
Cheers,
Jeff

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCH net-next-2.6] ixgbe: Fix TX stats accounting
  2009-11-25  7:38                             ` Jeff Kirsher
@ 2009-11-25  9:31                               ` Eric Dumazet
  2009-11-25  9:38                                 ` Jeff Kirsher
  0 siblings, 1 reply; 67+ messages in thread
From: Eric Dumazet @ 2009-11-25  9:31 UTC (permalink / raw)
  To: Jeff Kirsher
  Cc: Peter P Waskiewicz Jr, robert, Jesper Dangaard Brouer,
	Linux Netdev List, David S. Miller

Jeff Kirsher wrote:
> 
> Thanks Eric.  I have added the patch to my tree for testing and review.
> 

Thanks Jeff; a similar problem with the RX stats should also be addressed.

Is your tree public?


^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCH net-next-2.6] ixgbe: Fix TX stats accounting
  2009-11-25  9:31                               ` Eric Dumazet
@ 2009-11-25  9:38                                 ` Jeff Kirsher
  0 siblings, 0 replies; 67+ messages in thread
From: Jeff Kirsher @ 2009-11-25  9:38 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Peter P Waskiewicz Jr, robert, Jesper Dangaard Brouer,
	Linux Netdev List, David S. Miller

On Wed, Nov 25, 2009 at 01:31, Eric Dumazet <eric.dumazet@gmail.com> wrote:
> Jeff Kirsher wrote:
>>
>> Thanks Eric.  I have added the patch to my tree for testing and review.
>>
>
> Thanks Jeff, a similar problem on RX stats should also be addressed.
>
> Is your tree public ?
>

Unfortunately we do not have a public tree at this time.  I do welcome
any patches you want to submit.

-- 
Cheers,
Jeff

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints
  2009-11-24 20:46                       ` Eric Dumazet
@ 2009-11-25 10:30                         ` Eric Dumazet
  2009-11-25 10:37                           ` Andi Kleen
  0 siblings, 1 reply; 67+ messages in thread
From: Eric Dumazet @ 2009-11-25 10:30 UTC (permalink / raw)
  To: Andi Kleen
  Cc: David Miller, peter.p.waskiewicz.jr, peterz, arjan, yong.zhang0,
	linux-kernel, arjan, netdev

Eric Dumazet wrote:
> Andi Kleen wrote:
>> They are typically allocated with dma_alloc_coherent(), which does
>> allocate a continuous area.  In theory you could do interleaving
>> with IOMMus, but just putting it on the same node as the device
>> is probably better.
> 
> There are two parts, biggest one allocated with vmalloc()
> (to hold struct ixgbe_rx_buffer array, 32 bytes or more per entry),
> only used by driver (not adapter)
> 
> and one allocated with pci_alloc_consistent() 
> (to hold ixgbe_adv_tx_desc array, 16 bytes per entry)
> 
> vmalloc() one could be spreaded on many nodes.
> I am not speaking about the pci_alloc_consistent() one :)
> 

BTW, I found my Nehalem dev machine behaves strangely, defeating all
my NUMA tweaks. (This is an HP DL380 G6.)

It has two sockets, populated with two E5530s @ 2.4GHz.

Each CPU has 2x4GB RAM modules.

It claims to have two memory nodes, but all CPUs are on Node 0:

dmesg | grep -i node
[    0.000000] SRAT: PXM 0 -> APIC 0 -> Node 0
[    0.000000] SRAT: PXM 0 -> APIC 1 -> Node 0
[    0.000000] SRAT: PXM 0 -> APIC 2 -> Node 0
[    0.000000] SRAT: PXM 0 -> APIC 3 -> Node 0
[    0.000000] SRAT: PXM 0 -> APIC 4 -> Node 0
[    0.000000] SRAT: PXM 0 -> APIC 5 -> Node 0
[    0.000000] SRAT: PXM 0 -> APIC 6 -> Node 0
[    0.000000] SRAT: PXM 0 -> APIC 7 -> Node 0
[    0.000000] SRAT: Node 0 PXM 0 0-e0000000
[    0.000000] SRAT: Node 0 PXM 0 100000000-220000000
[    0.000000] SRAT: Node 1 PXM 1 220000000-420000000
[    0.000000] Bootmem setup node 0 0000000000000000-0000000220000000
[    0.000000]   NODE_DATA [0000000000001000 - 0000000000004fff]
[    0.000000] Bootmem setup node 1 0000000220000000-000000041ffff000
[    0.000000]   NODE_DATA [0000000220000000 - 0000000220003fff]
[    0.000000]  [ffffea0000000000-ffffea00087fffff] PMD -> [ffff880028600000-ffff8800305fffff] on node 0
[    0.000000]  [ffffea0008800000-ffffea00107fffff] PMD -> [ffff880220200000-ffff8802281fffff] on node 1
[    0.000000] Movable zone start PFN for each node
[    0.000000] early_node_map[5] active PFN ranges
[    0.000000] On node 0 totalpages: 2094543
[    0.000000] On node 1 totalpages: 2097151
[    0.000000] NR_CPUS:16 nr_cpumask_bits:16 nr_cpu_ids:16 nr_node_ids:2
[    0.000000] SLUB: Genslabs=14, HWalign=64, Order=0-3, MinObjects=0, CPUs=16, Nodes=2
[    0.004756] Inode-cache hash table entries: 1048576 (order: 11, 8388608 bytes)
[    0.007213] CPU 0/0x0 -> Node 0
[    0.398104] CPU 1/0x10 -> Node 0
[    0.557854] CPU 2/0x4 -> Node 0
[    0.717606] CPU 3/0x14 -> Node 0
[    0.877357] CPU 4/0x2 -> Node 0
[    1.037109] CPU 5/0x12 -> Node 0
[    1.196860] CPU 6/0x6 -> Node 0
[    1.356611] CPU 7/0x16 -> Node 0
[    1.516365] CPU 8/0x1 -> Node 0
[    1.676114] CPU 9/0x11 -> Node 0
[    1.835865] CPU 10/0x5 -> Node 0
[    1.995616] CPU 11/0x15 -> Node 0
[    2.155367] CPU 12/0x3 -> Node 0
[    2.315119] CPU 13/0x13 -> Node 0
[    2.474870] CPU 14/0x7 -> Node 0
[    2.634621] CPU 15/0x17 -> Node 0

# cat /proc/buddyinfo 
Node 0, zone      DMA      2      2      2      1      1      1      1      0      1      1      3 
Node 0, zone    DMA32      5     11      4      5      4     12      1      4      4      5    834 
Node 0, zone   Normal   4109    120     98    153     67     35     21     15     11     10    109 
Node 1, zone   Normal      7     17     10     12      7     14      5      7      6      5   2004 


This is with net-next-2.6, I'll try linux-2.6

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints
  2009-11-25 10:30                         ` Eric Dumazet
@ 2009-11-25 10:37                           ` Andi Kleen
  2009-11-25 11:35                             ` Eric Dumazet
  0 siblings, 1 reply; 67+ messages in thread
From: Andi Kleen @ 2009-11-25 10:37 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Andi Kleen, David Miller, peter.p.waskiewicz.jr, peterz, arjan,
	yong.zhang0, linux-kernel, arjan, netdev

Works here
> 
> dmesg | grep -i node
> [    0.000000] SRAT: PXM 0 -> APIC 0 -> Node 0
> [    0.000000] SRAT: PXM 0 -> APIC 1 -> Node 0
> [    0.000000] SRAT: PXM 0 -> APIC 2 -> Node 0
> [    0.000000] SRAT: PXM 0 -> APIC 3 -> Node 0
> [    0.000000] SRAT: PXM 0 -> APIC 4 -> Node 0
> [    0.000000] SRAT: PXM 0 -> APIC 5 -> Node 0
> [    0.000000] SRAT: PXM 0 -> APIC 6 -> Node 0
> [    0.000000] SRAT: PXM 0 -> APIC 7 -> Node 0

You seem to only have 8 CPUs (one socket).  Normally a dual-socket Nehalem
should have 16 with HyperThreading enabled.

For some reason the BIOS is not reporting the other CPU.

You could double-check with acpidump / iasl -d whether that's
what the BIOS really reports, but normally it should work.

> [    0.000000] SRAT: Node 0 PXM 0 0-e0000000
> [    0.000000] SRAT: Node 0 PXM 0 100000000-220000000
> [    0.000000] SRAT: Node 1 PXM 1 220000000-420000000

-Andi


^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints
  2009-11-24 10:07             ` Thomas Gleixner
  2009-11-24 17:55               ` Peter P Waskiewicz Jr
@ 2009-11-25 11:18               ` Peter Zijlstra
  1 sibling, 0 replies; 67+ messages in thread
From: Peter Zijlstra @ 2009-11-25 11:18 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: Peter P Waskiewicz Jr, Yong Zhang, linux-kernel, arjan, davem,
	netdev, Jesse Barnes


While we're dicking about with irq affinities, maybe we ought to
consider cpuset integration as well.  There are various folks who want to
use exclusive cpusets to isolate workloads.



^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints
  2009-11-25 10:37                           ` Andi Kleen
@ 2009-11-25 11:35                             ` Eric Dumazet
  2009-11-25 11:50                               ` Andi Kleen
  0 siblings, 1 reply; 67+ messages in thread
From: Eric Dumazet @ 2009-11-25 11:35 UTC (permalink / raw)
  To: Andi Kleen
  Cc: David Miller, peter.p.waskiewicz.jr, peterz, arjan, yong.zhang0,
	linux-kernel, arjan, netdev

Andi Kleen wrote:
> Works here
>> dmesg | grep -i node
>> [    0.000000] SRAT: PXM 0 -> APIC 0 -> Node 0
>> [    0.000000] SRAT: PXM 0 -> APIC 1 -> Node 0
>> [    0.000000] SRAT: PXM 0 -> APIC 2 -> Node 0
>> [    0.000000] SRAT: PXM 0 -> APIC 3 -> Node 0
>> [    0.000000] SRAT: PXM 0 -> APIC 4 -> Node 0
>> [    0.000000] SRAT: PXM 0 -> APIC 5 -> Node 0
>> [    0.000000] SRAT: PXM 0 -> APIC 6 -> Node 0
>> [    0.000000] SRAT: PXM 0 -> APIC 7 -> Node 0
> 
> You seem to only have 8 CPUs (one socket) Normally a dual socket nehalem
> should have 16 with HyperThreading enabled.
> 
> For some reason the BIOS is not reporting the other CPU.
> 
> You could double check with acpidmp / iasl -d if that's
> what the BIOS really reports, but normally it should work.
> 

Good Lord, I had CONFIG_NR_CPUS=16 in my .config.

Changing it to 32 or 64 seems better :)

# dmesg | grep -i node
[    0.000000] SRAT: PXM 0 -> APIC 0 -> Node 0
[    0.000000] SRAT: PXM 0 -> APIC 1 -> Node 0
[    0.000000] SRAT: PXM 0 -> APIC 2 -> Node 0
[    0.000000] SRAT: PXM 0 -> APIC 3 -> Node 0
[    0.000000] SRAT: PXM 0 -> APIC 4 -> Node 0
[    0.000000] SRAT: PXM 0 -> APIC 5 -> Node 0
[    0.000000] SRAT: PXM 0 -> APIC 6 -> Node 0
[    0.000000] SRAT: PXM 0 -> APIC 7 -> Node 0
[    0.000000] SRAT: PXM 1 -> APIC 16 -> Node 1
[    0.000000] SRAT: PXM 1 -> APIC 17 -> Node 1
[    0.000000] SRAT: PXM 1 -> APIC 18 -> Node 1
[    0.000000] SRAT: PXM 1 -> APIC 19 -> Node 1
[    0.000000] SRAT: PXM 1 -> APIC 20 -> Node 1
[    0.000000] SRAT: PXM 1 -> APIC 21 -> Node 1
[    0.000000] SRAT: PXM 1 -> APIC 22 -> Node 1
[    0.000000] SRAT: PXM 1 -> APIC 23 -> Node 1
[    0.000000] SRAT: Node 0 PXM 0 0-e0000000
[    0.000000] SRAT: Node 0 PXM 0 100000000-220000000
[    0.000000] SRAT: Node 1 PXM 1 220000000-420000000
[    0.000000] Bootmem setup node 0 0000000000000000-0000000220000000
[    0.000000]   NODE_DATA [0000000000001000 - 0000000000005fff]
[    0.000000] Bootmem setup node 1 0000000220000000-000000041ffff000
[    0.000000]   NODE_DATA [0000000220000000 - 0000000220004fff]
[    0.000000]  [ffffea0000000000-ffffea00087fffff] PMD -> [ffff880028600000-ffff8800305fffff] on node 0
[    0.000000]  [ffffea0008800000-ffffea00107fffff] PMD -> [ffff880220200000-ffff8802281fffff] on node 1
[    0.000000] Movable zone start PFN for each node
[    0.000000] early_node_map[5] active PFN ranges
[    0.000000] On node 0 totalpages: 2094543
[    0.000000] On node 1 totalpages: 2097151
[    0.000000] NR_CPUS:32 nr_cpumask_bits:32 nr_cpu_ids:32 nr_node_ids:2
[    0.000000] SLUB: Genslabs=14, HWalign=64, Order=0-3, MinObjects=0, CPUs=32, Nodes=2
[    0.004830] Inode-cache hash table entries: 1048576 (order: 11, 8388608 bytes)
[    0.007291] CPU 0/0x0 -> Node 0
[    0.398106] CPU 1/0x10 -> Node 1
[    0.557857] CPU 2/0x4 -> Node 0
[    0.717609] CPU 3/0x14 -> Node 1
[    0.877359] CPU 4/0x2 -> Node 0
[    1.037112] CPU 5/0x12 -> Node 1
[    1.196862] CPU 6/0x6 -> Node 0
[    1.356614] CPU 7/0x16 -> Node 1
[    1.516368] CPU 8/0x1 -> Node 0
[    1.676117] CPU 9/0x11 -> Node 1
[    1.835867] CPU 10/0x5 -> Node 0
[    1.995619] CPU 11/0x15 -> Node 1
[    2.155370] CPU 12/0x3 -> Node 0
[    2.315122] CPU 13/0x13 -> Node 1
[    2.474873] CPU 14/0x7 -> Node 0
[    2.634624] CPU 15/0x17 -> Node 1


Thanks Andi

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints
  2009-11-25 11:35                             ` Eric Dumazet
@ 2009-11-25 11:50                               ` Andi Kleen
  2009-11-26 11:43                                 ` Eric Dumazet
  0 siblings, 1 reply; 67+ messages in thread
From: Andi Kleen @ 2009-11-25 11:50 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Andi Kleen, David Miller, peter.p.waskiewicz.jr, peterz, arjan,
	yong.zhang0, linux-kernel, arjan, netdev

On Wed, Nov 25, 2009 at 12:35:03PM +0100, Eric Dumazet wrote:
> > You seem to only have 8 CPUs (one socket) Normally a dual socket nehalem
> > should have 16 with HyperThreading enabled.
> > 
> > For some reason the BIOS is not reporting the other CPU.
> > 
> > You could double check with acpidmp / iasl -d if that's
> > what the BIOS really reports, but normally it should work.
> > 
> 
> Good Lord, I had a CONFIG_NR_CPUS=16 in my .config.

That should be enough for a two-socket system (2S x 4C x 2T) today,
but of course that will eventually change too.

> Changing to to 32 or 64 seems better :)

That looks weird. It should have worked with CONFIG_NR_CPUS==16 too,
because you only have 16 CPUs and the NR_CPUS should affect APIC
ranges etc.

Something is still fishy.  I would report it properly.

BTW, the kernel should give some error message in any case when
there are not enough CPUs, I guess.

-Andi

-- 
ak@linux.intel.com -- Speaking for myself only.

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints
  2009-11-25 11:50                               ` Andi Kleen
@ 2009-11-26 11:43                                 ` Eric Dumazet
  0 siblings, 0 replies; 67+ messages in thread
From: Eric Dumazet @ 2009-11-26 11:43 UTC (permalink / raw)
  To: Andi Kleen
  Cc: David Miller, peter.p.waskiewicz.jr, peterz, arjan, yong.zhang0,
	linux-kernel, arjan, netdev

Andi Kleen wrote:

> That looks weird. It should have worked with CONFIG_NR_CPUS==16 too,
> because you only have 16 CPUs and the NR_CPUS should affect APIC
> ranges etc.
> 
> Something still fishy. I would properly report it.
> 
> BTW kernel should give some error message in any case when
> there are not enough CPUs I guess. 

The problem comes from acpi_numa_init() calling

acpi_table_parse_srat(ACPI_SRAT_TYPE_CPU_AFFINITY, acpi_parse_processor_affinity, NR_CPUS);

But on this machine the SRAT table contains 64 processor affinity entries,
some of them enabled and the rest disabled:

[8 ENABLED], [8 disabled], [8 ENABLED], [40 disabled]

So if NR_CPUS = 16, we 'see' only 8 enabled entries and 8 disabled entries.
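
To see the effect in isolation, here is a small user-space C sketch (an
illustration only, not the kernel's SRAT parser) that models this layout
and a parser that stops after NR_CPUS subtables, the way passing NR_CPUS
as the max-entries limit caps acpi_table_parse_srat():

#include <stdio.h>

#define NR_CPUS		16	/* the .config value that broke */
#define SRAT_ENTRIES	64	/* processor_affinity subtables in this SRAT */

int main(void)
{
	/* BIOS layout: [8 ENABLED][8 disabled][8 ENABLED][40 disabled] */
	int enabled[SRAT_ENTRIES] = { 0 };
	int i, seen = 0;

	for (i = 0; i < 8; i++)
		enabled[i] = 1;		/* APIC IDs 0x00-0x07, node 0 */
	for (i = 16; i < 24; i++)
		enabled[i] = 1;		/* APIC IDs 0x10-0x17, node 1 */

	/* stop after NR_CPUS subtables, like the max-entries cap */
	for (i = 0; i < SRAT_ENTRIES && i < NR_CPUS; i++)
		if (enabled[i])
			seen++;

	printf("enabled CPUs seen with NR_CPUS=%d: %d (hardware has 16)\n",
	       NR_CPUS, seen);
	return 0;
}

With NR_CPUS raised to 32 or 64 the loop reaches the second enabled block,
so all 16 CPUs (and both nodes) are seen.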

# acpidump  -t SRAT >SRAT.dump
# acpixtract -a SRAT.dump
# iasl -d SRAT.dat
# cat SRAT.dsl
/*
 * Intel ACPI Component Architecture
 * AML Disassembler version 20090521
 *
 * Disassembly of SRAT.dat, Thu Nov 26 12:29:34 2009
 *
 * ACPI Data Table [SRAT]
 *
 * Format: [HexOffset DecimalOffset ByteLength]  FieldName : FieldValue
 */

[000h 0000  4]                    Signature : "SRAT"    /* System Resource Affinity Table */
[004h 0004  4]                 Table Length : 00000570
[008h 0008  1]                     Revision : 01
[009h 0009  1]                     Checksum : D9
[00Ah 0010  6]                       Oem ID : "HP    "
[010h 0016  8]                 Oem Table ID : "Proliant"
[018h 0024  4]                 Oem Revision : 00000001
[01Ch 0028  4]              Asl Compiler ID : "    "
[020h 0032  4]        Asl Compiler Revision : 0000162E

[024h 0036  4]               Table Revision : 00000001
[028h 0040  8]                     Reserved : 0000000000000000

[030h 0048  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[031h 0049  1]                       Length : 10

[032h 0050  1]      Proximity Domain Low(8) : 00
[033h 0051  1]                      Apic ID : 00
[034h 0052  4]        Flags (decoded below) : 00000001
                                    Enabled : 1
[038h 0056  1]              Local Sapic EID : 00
[039h 0057  3]    Proximity Domain High(24) : 000000
[03Ch 0060  4]                     Reserved : 00000000

[040h 0064  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[041h 0065  1]                       Length : 10

[042h 0066  1]      Proximity Domain Low(8) : 00
[043h 0067  1]                      Apic ID : 01
[044h 0068  4]        Flags (decoded below) : 00000001
                                    Enabled : 1
[048h 0072  1]              Local Sapic EID : 00
[049h 0073  3]    Proximity Domain High(24) : 000000
[04Ch 0076  4]                     Reserved : 00000000

[050h 0080  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[051h 0081  1]                       Length : 10

[052h 0082  1]      Proximity Domain Low(8) : 00
[053h 0083  1]                      Apic ID : 02
[054h 0084  4]        Flags (decoded below) : 00000001
                                    Enabled : 1
[058h 0088  1]              Local Sapic EID : 00
[059h 0089  3]    Proximity Domain High(24) : 000000
[05Ch 0092  4]                     Reserved : 00000000

[060h 0096  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[061h 0097  1]                       Length : 10

[062h 0098  1]      Proximity Domain Low(8) : 00
[063h 0099  1]                      Apic ID : 03
[064h 0100  4]        Flags (decoded below) : 00000001
                                    Enabled : 1
[068h 0104  1]              Local Sapic EID : 00
[069h 0105  3]    Proximity Domain High(24) : 000000
[06Ch 0108  4]                     Reserved : 00000000

[070h 0112  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[071h 0113  1]                       Length : 10

[072h 0114  1]      Proximity Domain Low(8) : 00
[073h 0115  1]                      Apic ID : 04
[074h 0116  4]        Flags (decoded below) : 00000001
                                    Enabled : 1
[078h 0120  1]              Local Sapic EID : 00
[079h 0121  3]    Proximity Domain High(24) : 000000
[07Ch 0124  4]                     Reserved : 00000000

[080h 0128  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[081h 0129  1]                       Length : 10

[082h 0130  1]      Proximity Domain Low(8) : 00
[083h 0131  1]                      Apic ID : 05
[084h 0132  4]        Flags (decoded below) : 00000001
                                    Enabled : 1
[088h 0136  1]              Local Sapic EID : 00
[089h 0137  3]    Proximity Domain High(24) : 000000
[08Ch 0140  4]                     Reserved : 00000000

[090h 0144  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[091h 0145  1]                       Length : 10

[092h 0146  1]      Proximity Domain Low(8) : 00
[093h 0147  1]                      Apic ID : 06
[094h 0148  4]        Flags (decoded below) : 00000001
                                    Enabled : 1
[098h 0152  1]              Local Sapic EID : 00
[099h 0153  3]    Proximity Domain High(24) : 000000
[09Ch 0156  4]                     Reserved : 00000000

[0A0h 0160  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[0A1h 0161  1]                       Length : 10

[0A2h 0162  1]      Proximity Domain Low(8) : 00
[0A3h 0163  1]                      Apic ID : 07
[0A4h 0164  4]        Flags (decoded below) : 00000001
                                    Enabled : 1
[0A8h 0168  1]              Local Sapic EID : 00
[0A9h 0169  3]    Proximity Domain High(24) : 000000
[0ACh 0172  4]                     Reserved : 00000000

[0B0h 0176  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[0B1h 0177  1]                       Length : 10

[0B2h 0178  1]      Proximity Domain Low(8) : 00
[0B3h 0179  1]                      Apic ID : 08
[0B4h 0180  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[0B8h 0184  1]              Local Sapic EID : 00
[0B9h 0185  3]    Proximity Domain High(24) : 000000
[0BCh 0188  4]                     Reserved : 00000000

[0C0h 0192  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[0C1h 0193  1]                       Length : 10

[0C2h 0194  1]      Proximity Domain Low(8) : 00
[0C3h 0195  1]                      Apic ID : 09
[0C4h 0196  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[0C8h 0200  1]              Local Sapic EID : 00
[0C9h 0201  3]    Proximity Domain High(24) : 000000
[0CCh 0204  4]                     Reserved : 00000000

[0D0h 0208  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[0D1h 0209  1]                       Length : 10

[0D2h 0210  1]      Proximity Domain Low(8) : 00
[0D3h 0211  1]                      Apic ID : 0A
[0D4h 0212  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[0D8h 0216  1]              Local Sapic EID : 00
[0D9h 0217  3]    Proximity Domain High(24) : 000000
[0DCh 0220  4]                     Reserved : 00000000

[0E0h 0224  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[0E1h 0225  1]                       Length : 10

[0E2h 0226  1]      Proximity Domain Low(8) : 00
[0E3h 0227  1]                      Apic ID : 0B
[0E4h 0228  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[0E8h 0232  1]              Local Sapic EID : 00
[0E9h 0233  3]    Proximity Domain High(24) : 000000
[0ECh 0236  4]                     Reserved : 00000000

[0F0h 0240  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[0F1h 0241  1]                       Length : 10

[0F2h 0242  1]      Proximity Domain Low(8) : 00
[0F3h 0243  1]                      Apic ID : 0C
[0F4h 0244  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[0F8h 0248  1]              Local Sapic EID : 00
[0F9h 0249  3]    Proximity Domain High(24) : 000000
[0FCh 0252  4]                     Reserved : 00000000

[100h 0256  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[101h 0257  1]                       Length : 10

[102h 0258  1]      Proximity Domain Low(8) : 00
[103h 0259  1]                      Apic ID : 0D
[104h 0260  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[108h 0264  1]              Local Sapic EID : 00
[109h 0265  3]    Proximity Domain High(24) : 000000
[10Ch 0268  4]                     Reserved : 00000000

[110h 0272  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[111h 0273  1]                       Length : 10

[112h 0274  1]      Proximity Domain Low(8) : 00
[113h 0275  1]                      Apic ID : 0E
[114h 0276  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[118h 0280  1]              Local Sapic EID : 00
[119h 0281  3]    Proximity Domain High(24) : 000000
[11Ch 0284  4]                     Reserved : 00000000

[120h 0288  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[121h 0289  1]                       Length : 10

[122h 0290  1]      Proximity Domain Low(8) : 00
[123h 0291  1]                      Apic ID : 0F
[124h 0292  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[128h 0296  1]              Local Sapic EID : 00
[129h 0297  3]    Proximity Domain High(24) : 000000
[12Ch 0300  4]                     Reserved : 00000000

[130h 0304  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[131h 0305  1]                       Length : 10

[132h 0306  1]      Proximity Domain Low(8) : 01
[133h 0307  1]                      Apic ID : 10
[134h 0308  4]        Flags (decoded below) : 00000001
                                    Enabled : 1
[138h 0312  1]              Local Sapic EID : 00
[139h 0313  3]    Proximity Domain High(24) : 000000
[13Ch 0316  4]                     Reserved : 00000000

[140h 0320  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[141h 0321  1]                       Length : 10

[142h 0322  1]      Proximity Domain Low(8) : 01
[143h 0323  1]                      Apic ID : 11
[144h 0324  4]        Flags (decoded below) : 00000001
                                    Enabled : 1
[148h 0328  1]              Local Sapic EID : 00
[149h 0329  3]    Proximity Domain High(24) : 000000
[14Ch 0332  4]                     Reserved : 00000000

[150h 0336  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[151h 0337  1]                       Length : 10

[152h 0338  1]      Proximity Domain Low(8) : 01
[153h 0339  1]                      Apic ID : 12
[154h 0340  4]        Flags (decoded below) : 00000001
                                    Enabled : 1
[158h 0344  1]              Local Sapic EID : 00
[159h 0345  3]    Proximity Domain High(24) : 000000
[15Ch 0348  4]                     Reserved : 00000000

[160h 0352  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[161h 0353  1]                       Length : 10

[162h 0354  1]      Proximity Domain Low(8) : 01
[163h 0355  1]                      Apic ID : 13
[164h 0356  4]        Flags (decoded below) : 00000001
                                    Enabled : 1
[168h 0360  1]              Local Sapic EID : 00
[169h 0361  3]    Proximity Domain High(24) : 000000
[16Ch 0364  4]                     Reserved : 00000000

[170h 0368  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[171h 0369  1]                       Length : 10

[172h 0370  1]      Proximity Domain Low(8) : 01
[173h 0371  1]                      Apic ID : 14
[174h 0372  4]        Flags (decoded below) : 00000001
                                    Enabled : 1
[178h 0376  1]              Local Sapic EID : 00
[179h 0377  3]    Proximity Domain High(24) : 000000
[17Ch 0380  4]                     Reserved : 00000000

[180h 0384  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[181h 0385  1]                       Length : 10

[182h 0386  1]      Proximity Domain Low(8) : 01
[183h 0387  1]                      Apic ID : 15
[184h 0388  4]        Flags (decoded below) : 00000001
                                    Enabled : 1
[188h 0392  1]              Local Sapic EID : 00
[189h 0393  3]    Proximity Domain High(24) : 000000
[18Ch 0396  4]                     Reserved : 00000000

[190h 0400  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[191h 0401  1]                       Length : 10

[192h 0402  1]      Proximity Domain Low(8) : 01
[193h 0403  1]                      Apic ID : 16
[194h 0404  4]        Flags (decoded below) : 00000001
                                    Enabled : 1
[198h 0408  1]              Local Sapic EID : 00
[199h 0409  3]    Proximity Domain High(24) : 000000
[19Ch 0412  4]                     Reserved : 00000000

[1A0h 0416  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[1A1h 0417  1]                       Length : 10

[1A2h 0418  1]      Proximity Domain Low(8) : 01
[1A3h 0419  1]                      Apic ID : 17
[1A4h 0420  4]        Flags (decoded below) : 00000001
                                    Enabled : 1
[1A8h 0424  1]              Local Sapic EID : 00
[1A9h 0425  3]    Proximity Domain High(24) : 000000
[1ACh 0428  4]                     Reserved : 00000000

[1B0h 0432  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[1B1h 0433  1]                       Length : 10

[1B2h 0434  1]      Proximity Domain Low(8) : 00
[1B3h 0435  1]                      Apic ID : 18
[1B4h 0436  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[1B8h 0440  1]              Local Sapic EID : 00
[1B9h 0441  3]    Proximity Domain High(24) : 000000
[1BCh 0444  4]                     Reserved : 00000000

[1C0h 0448  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[1C1h 0449  1]                       Length : 10

[1C2h 0450  1]      Proximity Domain Low(8) : 00
[1C3h 0451  1]                      Apic ID : 19
[1C4h 0452  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[1C8h 0456  1]              Local Sapic EID : 00
[1C9h 0457  3]    Proximity Domain High(24) : 000000
[1CCh 0460  4]                     Reserved : 00000000

[1D0h 0464  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[1D1h 0465  1]                       Length : 10

[1D2h 0466  1]      Proximity Domain Low(8) : 00
[1D3h 0467  1]                      Apic ID : 1A
[1D4h 0468  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[1D8h 0472  1]              Local Sapic EID : 00
[1D9h 0473  3]    Proximity Domain High(24) : 000000
[1DCh 0476  4]                     Reserved : 00000000

[1E0h 0480  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[1E1h 0481  1]                       Length : 10

[1E2h 0482  1]      Proximity Domain Low(8) : 00
[1E3h 0483  1]                      Apic ID : 1B
[1E4h 0484  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[1E8h 0488  1]              Local Sapic EID : 00
[1E9h 0489  3]    Proximity Domain High(24) : 000000
[1ECh 0492  4]                     Reserved : 00000000

[1F0h 0496  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[1F1h 0497  1]                       Length : 10

[1F2h 0498  1]      Proximity Domain Low(8) : 00
[1F3h 0499  1]                      Apic ID : 1C
[1F4h 0500  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[1F8h 0504  1]              Local Sapic EID : 00
[1F9h 0505  3]    Proximity Domain High(24) : 000000
[1FCh 0508  4]                     Reserved : 00000000

[200h 0512  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[201h 0513  1]                       Length : 10

[202h 0514  1]      Proximity Domain Low(8) : 00
[203h 0515  1]                      Apic ID : 1D
[204h 0516  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[208h 0520  1]              Local Sapic EID : 00
[209h 0521  3]    Proximity Domain High(24) : 000000
[20Ch 0524  4]                     Reserved : 00000000

[210h 0528  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[211h 0529  1]                       Length : 10

[212h 0530  1]      Proximity Domain Low(8) : 00
[213h 0531  1]                      Apic ID : 1E
[214h 0532  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[218h 0536  1]              Local Sapic EID : 00
[219h 0537  3]    Proximity Domain High(24) : 000000
[21Ch 0540  4]                     Reserved : 00000000

[220h 0544  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[221h 0545  1]                       Length : 10

[222h 0546  1]      Proximity Domain Low(8) : 00
[223h 0547  1]                      Apic ID : 1F
[224h 0548  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[228h 0552  1]              Local Sapic EID : 00
[229h 0553  3]    Proximity Domain High(24) : 000000
[22Ch 0556  4]                     Reserved : 00000000

[230h 0560  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[231h 0561  1]                       Length : 10

[232h 0562  1]      Proximity Domain Low(8) : 00
[233h 0563  1]                      Apic ID : 20
[234h 0564  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[238h 0568  1]              Local Sapic EID : 00
[239h 0569  3]    Proximity Domain High(24) : 000000
[23Ch 0572  4]                     Reserved : 00000000

[240h 0576  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[241h 0577  1]                       Length : 10

[242h 0578  1]      Proximity Domain Low(8) : 00
[243h 0579  1]                      Apic ID : 21
[244h 0580  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[248h 0584  1]              Local Sapic EID : 00
[249h 0585  3]    Proximity Domain High(24) : 000000
[24Ch 0588  4]                     Reserved : 00000000

[250h 0592  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[251h 0593  1]                       Length : 10

[252h 0594  1]      Proximity Domain Low(8) : 00
[253h 0595  1]                      Apic ID : 22
[254h 0596  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[258h 0600  1]              Local Sapic EID : 00
[259h 0601  3]    Proximity Domain High(24) : 000000
[25Ch 0604  4]                     Reserved : 00000000

[260h 0608  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[261h 0609  1]                       Length : 10

[262h 0610  1]      Proximity Domain Low(8) : 00
[263h 0611  1]                      Apic ID : 23
[264h 0612  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[268h 0616  1]              Local Sapic EID : 00
[269h 0617  3]    Proximity Domain High(24) : 000000
[26Ch 0620  4]                     Reserved : 00000000

[270h 0624  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[271h 0625  1]                       Length : 10

[272h 0626  1]      Proximity Domain Low(8) : 00
[273h 0627  1]                      Apic ID : 24
[274h 0628  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[278h 0632  1]              Local Sapic EID : 00
[279h 0633  3]    Proximity Domain High(24) : 000000
[27Ch 0636  4]                     Reserved : 00000000

[280h 0640  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[281h 0641  1]                       Length : 10

[282h 0642  1]      Proximity Domain Low(8) : 00
[283h 0643  1]                      Apic ID : 25
[284h 0644  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[288h 0648  1]              Local Sapic EID : 00
[289h 0649  3]    Proximity Domain High(24) : 000000
[28Ch 0652  4]                     Reserved : 00000000

[290h 0656  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[291h 0657  1]                       Length : 10

[292h 0658  1]      Proximity Domain Low(8) : 00
[293h 0659  1]                      Apic ID : 26
[294h 0660  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[298h 0664  1]              Local Sapic EID : 00
[299h 0665  3]    Proximity Domain High(24) : 000000
[29Ch 0668  4]                     Reserved : 00000000

[2A0h 0672  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[2A1h 0673  1]                       Length : 10

[2A2h 0674  1]      Proximity Domain Low(8) : 00
[2A3h 0675  1]                      Apic ID : 27
[2A4h 0676  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[2A8h 0680  1]              Local Sapic EID : 00
[2A9h 0681  3]    Proximity Domain High(24) : 000000
[2ACh 0684  4]                     Reserved : 00000000

[2B0h 0688  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[2B1h 0689  1]                       Length : 10

[2B2h 0690  1]      Proximity Domain Low(8) : 00
[2B3h 0691  1]                      Apic ID : 28
[2B4h 0692  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[2B8h 0696  1]              Local Sapic EID : 00
[2B9h 0697  3]    Proximity Domain High(24) : 000000
[2BCh 0700  4]                     Reserved : 00000000

[2C0h 0704  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[2C1h 0705  1]                       Length : 10

[2C2h 0706  1]      Proximity Domain Low(8) : 00
[2C3h 0707  1]                      Apic ID : 29
[2C4h 0708  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[2C8h 0712  1]              Local Sapic EID : 00
[2C9h 0713  3]    Proximity Domain High(24) : 000000
[2CCh 0716  4]                     Reserved : 00000000

[2D0h 0720  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[2D1h 0721  1]                       Length : 10

[2D2h 0722  1]      Proximity Domain Low(8) : 00
[2D3h 0723  1]                      Apic ID : 2A
[2D4h 0724  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[2D8h 0728  1]              Local Sapic EID : 00
[2D9h 0729  3]    Proximity Domain High(24) : 000000
[2DCh 0732  4]                     Reserved : 00000000

[2E0h 0736  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[2E1h 0737  1]                       Length : 10

[2E2h 0738  1]      Proximity Domain Low(8) : 00
[2E3h 0739  1]                      Apic ID : 2B
[2E4h 0740  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[2E8h 0744  1]              Local Sapic EID : 00
[2E9h 0745  3]    Proximity Domain High(24) : 000000
[2ECh 0748  4]                     Reserved : 00000000

[2F0h 0752  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[2F1h 0753  1]                       Length : 10

[2F2h 0754  1]      Proximity Domain Low(8) : 00
[2F3h 0755  1]                      Apic ID : 2C
[2F4h 0756  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[2F8h 0760  1]              Local Sapic EID : 00
[2F9h 0761  3]    Proximity Domain High(24) : 000000
[2FCh 0764  4]                     Reserved : 00000000

[300h 0768  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[301h 0769  1]                       Length : 10

[302h 0770  1]      Proximity Domain Low(8) : 00
[303h 0771  1]                      Apic ID : 2D
[304h 0772  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[308h 0776  1]              Local Sapic EID : 00
[309h 0777  3]    Proximity Domain High(24) : 000000
[30Ch 0780  4]                     Reserved : 00000000

[310h 0784  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[311h 0785  1]                       Length : 10

[312h 0786  1]      Proximity Domain Low(8) : 00
[313h 0787  1]                      Apic ID : 2E
[314h 0788  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[318h 0792  1]              Local Sapic EID : 00
[319h 0793  3]    Proximity Domain High(24) : 000000
[31Ch 0796  4]                     Reserved : 00000000

[320h 0800  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[321h 0801  1]                       Length : 10

[322h 0802  1]      Proximity Domain Low(8) : 00
[323h 0803  1]                      Apic ID : 2F
[324h 0804  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[328h 0808  1]              Local Sapic EID : 00
[329h 0809  3]    Proximity Domain High(24) : 000000
[32Ch 0812  4]                     Reserved : 00000000

[330h 0816  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[331h 0817  1]                       Length : 10

[332h 0818  1]      Proximity Domain Low(8) : 00
[333h 0819  1]                      Apic ID : 30
[334h 0820  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[338h 0824  1]              Local Sapic EID : 00
[339h 0825  3]    Proximity Domain High(24) : 000000
[33Ch 0828  4]                     Reserved : 00000000

[340h 0832  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[341h 0833  1]                       Length : 10

[342h 0834  1]      Proximity Domain Low(8) : 00
[343h 0835  1]                      Apic ID : 31
[344h 0836  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[348h 0840  1]              Local Sapic EID : 00
[349h 0841  3]    Proximity Domain High(24) : 000000
[34Ch 0844  4]                     Reserved : 00000000

[350h 0848  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[351h 0849  1]                       Length : 10

[352h 0850  1]      Proximity Domain Low(8) : 00
[353h 0851  1]                      Apic ID : 32
[354h 0852  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[358h 0856  1]              Local Sapic EID : 00
[359h 0857  3]    Proximity Domain High(24) : 000000
[35Ch 0860  4]                     Reserved : 00000000

[360h 0864  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[361h 0865  1]                       Length : 10

[362h 0866  1]      Proximity Domain Low(8) : 00
[363h 0867  1]                      Apic ID : 33
[364h 0868  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[368h 0872  1]              Local Sapic EID : 00
[369h 0873  3]    Proximity Domain High(24) : 000000
[36Ch 0876  4]                     Reserved : 00000000

[370h 0880  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[371h 0881  1]                       Length : 10

[372h 0882  1]      Proximity Domain Low(8) : 00
[373h 0883  1]                      Apic ID : 34
[374h 0884  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[378h 0888  1]              Local Sapic EID : 00
[379h 0889  3]    Proximity Domain High(24) : 000000
[37Ch 0892  4]                     Reserved : 00000000

[380h 0896  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[381h 0897  1]                       Length : 10

[382h 0898  1]      Proximity Domain Low(8) : 00
[383h 0899  1]                      Apic ID : 35
[384h 0900  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[388h 0904  1]              Local Sapic EID : 00
[389h 0905  3]    Proximity Domain High(24) : 000000
[38Ch 0908  4]                     Reserved : 00000000

[390h 0912  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[391h 0913  1]                       Length : 10

[392h 0914  1]      Proximity Domain Low(8) : 00
[393h 0915  1]                      Apic ID : 36
[394h 0916  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[398h 0920  1]              Local Sapic EID : 00
[399h 0921  3]    Proximity Domain High(24) : 000000
[39Ch 0924  4]                     Reserved : 00000000

[3A0h 0928  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[3A1h 0929  1]                       Length : 10

[3A2h 0930  1]      Proximity Domain Low(8) : 00
[3A3h 0931  1]                      Apic ID : 37
[3A4h 0932  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[3A8h 0936  1]              Local Sapic EID : 00
[3A9h 0937  3]    Proximity Domain High(24) : 000000
[3ACh 0940  4]                     Reserved : 00000000

[3B0h 0944  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[3B1h 0945  1]                       Length : 10

[3B2h 0946  1]      Proximity Domain Low(8) : 00
[3B3h 0947  1]                      Apic ID : 38
[3B4h 0948  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[3B8h 0952  1]              Local Sapic EID : 00
[3B9h 0953  3]    Proximity Domain High(24) : 000000
[3BCh 0956  4]                     Reserved : 00000000

[3C0h 0960  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[3C1h 0961  1]                       Length : 10

[3C2h 0962  1]      Proximity Domain Low(8) : 00
[3C3h 0963  1]                      Apic ID : 39
[3C4h 0964  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[3C8h 0968  1]              Local Sapic EID : 00
[3C9h 0969  3]    Proximity Domain High(24) : 000000
[3CCh 0972  4]                     Reserved : 00000000

[3D0h 0976  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[3D1h 0977  1]                       Length : 10

[3D2h 0978  1]      Proximity Domain Low(8) : 00
[3D3h 0979  1]                      Apic ID : 3A
[3D4h 0980  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[3D8h 0984  1]              Local Sapic EID : 00
[3D9h 0985  3]    Proximity Domain High(24) : 000000
[3DCh 0988  4]                     Reserved : 00000000

[3E0h 0992  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[3E1h 0993  1]                       Length : 10

[3E2h 0994  1]      Proximity Domain Low(8) : 00
[3E3h 0995  1]                      Apic ID : 3B
[3E4h 0996  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[3E8h 1000  1]              Local Sapic EID : 00
[3E9h 1001  3]    Proximity Domain High(24) : 000000
[3ECh 1004  4]                     Reserved : 00000000

[3F0h 1008  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[3F1h 1009  1]                       Length : 10

[3F2h 1010  1]      Proximity Domain Low(8) : 00
[3F3h 1011  1]                      Apic ID : 3C
[3F4h 1012  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[3F8h 1016  1]              Local Sapic EID : 00
[3F9h 1017  3]    Proximity Domain High(24) : 000000
[3FCh 1020  4]                     Reserved : 00000000

[400h 1024  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[401h 1025  1]                       Length : 10

[402h 1026  1]      Proximity Domain Low(8) : 00
[403h 1027  1]                      Apic ID : 3D
[404h 1028  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[408h 1032  1]              Local Sapic EID : 00
[409h 1033  3]    Proximity Domain High(24) : 000000
[40Ch 1036  4]                     Reserved : 00000000

[410h 1040  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[411h 1041  1]                       Length : 10

[412h 1042  1]      Proximity Domain Low(8) : 00
[413h 1043  1]                      Apic ID : 3E
[414h 1044  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[418h 1048  1]              Local Sapic EID : 00
[419h 1049  3]    Proximity Domain High(24) : 000000
[41Ch 1052  4]                     Reserved : 00000000

[420h 1056  1]                Subtable Type : 00 <Processor Local APIC/SAPIC Affinity>
[421h 1057  1]                       Length : 10

[422h 1058  1]      Proximity Domain Low(8) : 00
[423h 1059  1]                      Apic ID : 3F
[424h 1060  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
[428h 1064  1]              Local Sapic EID : 00
[429h 1065  3]    Proximity Domain High(24) : 000000
[42Ch 1068  4]                     Reserved : 00000000

[430h 1072  1]                Subtable Type : 01 <Memory Affinity>
[431h 1073  1]                       Length : 28

[432h 1074  4]             Proximity Domain : 00000000
[436h 1078  2]                     Reserved : 0000
[438h 1080  8]                 Base Address : 0000000000000000
[440h 1088  8]               Address Length : 00000000E0000000
[448h 1096  4]                     Reserved : 00000000
[44Ch 1100  4]        Flags (decoded below) : 00000001
                                    Enabled : 1
                              Hot Pluggable : 0
                               Non-Volatile : 0
[450h 1104  8]                     Reserved : 0000000000000000

[458h 1112  1]                Subtable Type : 01 <Memory Affinity>
[459h 1113  1]                       Length : 28

[45Ah 1114  4]             Proximity Domain : 00000000
[45Eh 1118  2]                     Reserved : 0000
[460h 1120  8]                 Base Address : 00000000E0000000
[468h 1128  8]               Address Length : 0000000020000000
[470h 1136  4]                     Reserved : 00000000
[474h 1140  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
                              Hot Pluggable : 0
                               Non-Volatile : 0
[478h 1144  8]                     Reserved : 0000000000000000

[480h 1152  1]                Subtable Type : 01 <Memory Affinity>
[481h 1153  1]                       Length : 28

[482h 1154  4]             Proximity Domain : 00000000
[486h 1158  2]                     Reserved : 0000
[488h 1160  8]                 Base Address : 0000000100000000
[490h 1168  8]               Address Length : 0000000120000000
[498h 1176  4]                     Reserved : 00000000
[49Ch 1180  4]        Flags (decoded below) : 00000001
                                    Enabled : 1
                              Hot Pluggable : 0
                               Non-Volatile : 0
[4A0h 1184  8]                     Reserved : 0000000000000000

[4A8h 1192  1]                Subtable Type : 01 <Memory Affinity>
[4A9h 1193  1]                       Length : 28

[4AAh 1194  4]             Proximity Domain : 00000001
[4AEh 1198  2]                     Reserved : 0000
[4B0h 1200  8]                 Base Address : 0000000220000000
[4B8h 1208  8]               Address Length : 0000000200000000
[4C0h 1216  4]                     Reserved : 00000000
[4C4h 1220  4]        Flags (decoded below) : 00000001
                                    Enabled : 1
                              Hot Pluggable : 0
                               Non-Volatile : 0
[4C8h 1224  8]                     Reserved : 0000000000000000

[4D0h 1232  1]                Subtable Type : 01 <Memory Affinity>
[4D1h 1233  1]                       Length : 28

[4D2h 1234  4]             Proximity Domain : 00000000
[4D6h 1238  2]                     Reserved : 0000
[4D8h 1240  8]                 Base Address : 0000000420000000
[4E0h 1248  8]               Address Length : 0000000000000000
[4E8h 1256  4]                     Reserved : 00000000
[4ECh 1260  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
                              Hot Pluggable : 0
                               Non-Volatile : 0
[4F0h 1264  8]                     Reserved : 0000000000000000

[4F8h 1272  1]                Subtable Type : 01 <Memory Affinity>
[4F9h 1273  1]                       Length : 28

[4FAh 1274  4]             Proximity Domain : 00000000
[4FEh 1278  2]                     Reserved : 0000
[500h 1280  8]                 Base Address : 0000000420000000
[508h 1288  8]               Address Length : 0000000000000000
[510h 1296  4]                     Reserved : 00000000
[514h 1300  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
                              Hot Pluggable : 0
                               Non-Volatile : 0
[518h 1304  8]                     Reserved : 0000000000000000

[520h 1312  1]                Subtable Type : 01 <Memory Affinity>
[521h 1313  1]                       Length : 28

[522h 1314  4]             Proximity Domain : 00000000
[526h 1318  2]                     Reserved : 0000
[528h 1320  8]                 Base Address : 0000000420000000
[530h 1328  8]               Address Length : 0000000000000000
[538h 1336  4]                     Reserved : 00000000
[53Ch 1340  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
                              Hot Pluggable : 0
                               Non-Volatile : 0
[540h 1344  8]                     Reserved : 0000000000000000

[548h 1352  1]                Subtable Type : 01 <Memory Affinity>
[549h 1353  1]                       Length : 28

[54Ah 1354  4]             Proximity Domain : 00000000
[54Eh 1358  2]                     Reserved : 0000
[550h 1360  8]                 Base Address : 0000000420000000
[558h 1368  8]               Address Length : 0000000000000000
[560h 1376  4]                     Reserved : 00000000
[564h 1380  4]        Flags (decoded below) : 00000000
                                    Enabled : 0
                              Hot Pluggable : 0
                               Non-Volatile : 0
[568h 1384  8]                     Reserved : 0000000000000000

Raw Table Data

  0000: 53 52 41 54 70 05 00 00 01 D9 48 50 20 20 20 20  SRATp.....HP    
  0010: 50 72 6F 6C 69 61 6E 74 01 00 00 00 20 20 20 20  Proliant....    
  0020: 2E 16 00 00 01 00 00 00 00 00 00 00 00 00 00 00  ................
  0030: 00 10 00 00 01 00 00 00 00 00 00 00 00 00 00 00  ................
  0040: 00 10 00 01 01 00 00 00 00 00 00 00 00 00 00 00  ................
  0050: 00 10 00 02 01 00 00 00 00 00 00 00 00 00 00 00  ................
  0060: 00 10 00 03 01 00 00 00 00 00 00 00 00 00 00 00  ................
  0070: 00 10 00 04 01 00 00 00 00 00 00 00 00 00 00 00  ................
  0080: 00 10 00 05 01 00 00 00 00 00 00 00 00 00 00 00  ................
  0090: 00 10 00 06 01 00 00 00 00 00 00 00 00 00 00 00  ................
  00A0: 00 10 00 07 01 00 00 00 00 00 00 00 00 00 00 00  ................
  00B0: 00 10 00 08 00 00 00 00 00 00 00 00 00 00 00 00  ................
  00C0: 00 10 00 09 00 00 00 00 00 00 00 00 00 00 00 00  ................
  00D0: 00 10 00 0A 00 00 00 00 00 00 00 00 00 00 00 00  ................
  00E0: 00 10 00 0B 00 00 00 00 00 00 00 00 00 00 00 00  ................
  00F0: 00 10 00 0C 00 00 00 00 00 00 00 00 00 00 00 00  ................
  0100: 00 10 00 0D 00 00 00 00 00 00 00 00 00 00 00 00  ................
  0110: 00 10 00 0E 00 00 00 00 00 00 00 00 00 00 00 00  ................
  0120: 00 10 00 0F 00 00 00 00 00 00 00 00 00 00 00 00  ................
  0130: 00 10 01 10 01 00 00 00 00 00 00 00 00 00 00 00  ................
  0140: 00 10 01 11 01 00 00 00 00 00 00 00 00 00 00 00  ................
  0150: 00 10 01 12 01 00 00 00 00 00 00 00 00 00 00 00  ................
  0160: 00 10 01 13 01 00 00 00 00 00 00 00 00 00 00 00  ................
  0170: 00 10 01 14 01 00 00 00 00 00 00 00 00 00 00 00  ................
  0180: 00 10 01 15 01 00 00 00 00 00 00 00 00 00 00 00  ................
  0190: 00 10 01 16 01 00 00 00 00 00 00 00 00 00 00 00  ................
  01A0: 00 10 01 17 01 00 00 00 00 00 00 00 00 00 00 00  ................
  01B0: 00 10 00 18 00 00 00 00 00 00 00 00 00 00 00 00  ................
  01C0: 00 10 00 19 00 00 00 00 00 00 00 00 00 00 00 00  ................
  01D0: 00 10 00 1A 00 00 00 00 00 00 00 00 00 00 00 00  ................
  01E0: 00 10 00 1B 00 00 00 00 00 00 00 00 00 00 00 00  ................
  01F0: 00 10 00 1C 00 00 00 00 00 00 00 00 00 00 00 00  ................
  0200: 00 10 00 1D 00 00 00 00 00 00 00 00 00 00 00 00  ................
  0210: 00 10 00 1E 00 00 00 00 00 00 00 00 00 00 00 00  ................
  0220: 00 10 00 1F 00 00 00 00 00 00 00 00 00 00 00 00  ................
  0230: 00 10 00 20 00 00 00 00 00 00 00 00 00 00 00 00  ... ............
  0240: 00 10 00 21 00 00 00 00 00 00 00 00 00 00 00 00  ...!............
  0250: 00 10 00 22 00 00 00 00 00 00 00 00 00 00 00 00  ..."............
  0260: 00 10 00 23 00 00 00 00 00 00 00 00 00 00 00 00  ...#............
  0270: 00 10 00 24 00 00 00 00 00 00 00 00 00 00 00 00  ...$............
  0280: 00 10 00 25 00 00 00 00 00 00 00 00 00 00 00 00  ...%............
  0290: 00 10 00 26 00 00 00 00 00 00 00 00 00 00 00 00  ...&............
  02A0: 00 10 00 27 00 00 00 00 00 00 00 00 00 00 00 00  ...'............
  02B0: 00 10 00 28 00 00 00 00 00 00 00 00 00 00 00 00  ...(............
  02C0: 00 10 00 29 00 00 00 00 00 00 00 00 00 00 00 00  ...)............
  02D0: 00 10 00 2A 00 00 00 00 00 00 00 00 00 00 00 00  ...*............
  02E0: 00 10 00 2B 00 00 00 00 00 00 00 00 00 00 00 00  ...+............
  02F0: 00 10 00 2C 00 00 00 00 00 00 00 00 00 00 00 00  ...,............
  0300: 00 10 00 2D 00 00 00 00 00 00 00 00 00 00 00 00  ...-............
  0310: 00 10 00 2E 00 00 00 00 00 00 00 00 00 00 00 00  ................
  0320: 00 10 00 2F 00 00 00 00 00 00 00 00 00 00 00 00  .../............
  0330: 00 10 00 30 00 00 00 00 00 00 00 00 00 00 00 00  ...0............
  0340: 00 10 00 31 00 00 00 00 00 00 00 00 00 00 00 00  ...1............
  0350: 00 10 00 32 00 00 00 00 00 00 00 00 00 00 00 00  ...2............
  0360: 00 10 00 33 00 00 00 00 00 00 00 00 00 00 00 00  ...3............
  0370: 00 10 00 34 00 00 00 00 00 00 00 00 00 00 00 00  ...4............
  0380: 00 10 00 35 00 00 00 00 00 00 00 00 00 00 00 00  ...5............
  0390: 00 10 00 36 00 00 00 00 00 00 00 00 00 00 00 00  ...6............
  03A0: 00 10 00 37 00 00 00 00 00 00 00 00 00 00 00 00  ...7............
  03B0: 00 10 00 38 00 00 00 00 00 00 00 00 00 00 00 00  ...8............
  03C0: 00 10 00 39 00 00 00 00 00 00 00 00 00 00 00 00  ...9............
  03D0: 00 10 00 3A 00 00 00 00 00 00 00 00 00 00 00 00  ...:............
  03E0: 00 10 00 3B 00 00 00 00 00 00 00 00 00 00 00 00  ...;............
  03F0: 00 10 00 3C 00 00 00 00 00 00 00 00 00 00 00 00  ...<............
  0400: 00 10 00 3D 00 00 00 00 00 00 00 00 00 00 00 00  ...=............
  0410: 00 10 00 3E 00 00 00 00 00 00 00 00 00 00 00 00  ...>............
  0420: 00 10 00 3F 00 00 00 00 00 00 00 00 00 00 00 00  ...?............
  0430: 01 28 00 00 00 00 00 00 00 00 00 00 00 00 00 00  .(..............
  0440: 00 00 00 E0 00 00 00 00 00 00 00 00 01 00 00 00  ................
  0450: 00 00 00 00 00 00 00 00 01 28 00 00 00 00 00 00  .........(......
  0460: 00 00 00 E0 00 00 00 00 00 00 00 20 00 00 00 00  ........... ....
  0470: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  0480: 01 28 00 00 00 00 00 00 00 00 00 00 01 00 00 00  .(..............
  0490: 00 00 00 20 01 00 00 00 00 00 00 00 01 00 00 00  ... ............
  04A0: 00 00 00 00 00 00 00 00 01 28 01 00 00 00 00 00  .........(......
  04B0: 00 00 00 20 02 00 00 00 00 00 00 00 02 00 00 00  ... ............
  04C0: 00 00 00 00 01 00 00 00 00 00 00 00 00 00 00 00  ................
  04D0: 01 28 00 00 00 00 00 00 00 00 00 20 04 00 00 00  .(......... ....
  04E0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  04F0: 00 00 00 00 00 00 00 00 01 28 00 00 00 00 00 00  .........(......
  0500: 00 00 00 20 04 00 00 00 00 00 00 00 00 00 00 00  ... ............
  0510: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  0520: 01 28 00 00 00 00 00 00 00 00 00 20 04 00 00 00  .(......... ....
  0530: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  0540: 00 00 00 00 00 00 00 00 01 28 00 00 00 00 00 00  .........(......
  0550: 00 00 00 20 04 00 00 00 00 00 00 00 00 00 00 00  ... ............
  0560: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................



^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: ixgbe question
  2009-11-24  9:55                     ` Eric Dumazet
  2009-11-24 10:06                       ` Peter P Waskiewicz Jr
@ 2009-11-26 14:10                       ` Badalian Vyacheslav
  1 sibling, 0 replies; 67+ messages in thread
From: Badalian Vyacheslav @ 2009-11-26 14:10 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Peter P Waskiewicz Jr, robert, Jesper Dangaard Brouer, Linux Netdev List

Eric Dumazet wrote:
> Peter P Waskiewicz Jr wrote:
> 
>> You might have this elsewhere, but it sounds like you're connecting back
>> to back with another 82599 NIC.  Our optics in that NIC are dual-rate,
>> and the software mechanism that tries to "autoneg" link speed gets out
>> of sync easily in back-to-back setups.
>>
>> If it's really annoying, and you're willing to run with a local patch to
>> disable the autotry mechanism, try this:
>>
>> diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
>> index a5036f7..62c0915 100644
>> --- a/drivers/net/ixgbe/ixgbe_main.c
>> +++ b/drivers/net/ixgbe/ixgbe_main.c
>> @@ -4670,6 +4670,10 @@ static void ixgbe_multispeed_fiber_task(struct work_struct *work)
>>         autoneg = hw->phy.autoneg_advertised;
>>         if ((!autoneg) && (hw->mac.ops.get_link_capabilities))
>>                 hw->mac.ops.get_link_capabilities(hw, &autoneg, &negotiation);
>> +
>> +       /* force 10G only */
>> +       autoneg = IXGBE_LINK_SPEED_10GB_FULL;
>> +
>>         if (hw->mac.ops.setup_link)
>>                 hw->mac.ops.setup_link(hw, autoneg, negotiation, true);
>>         adapter->flags |= IXGBE_FLAG_NEED_LINK_UPDATE;
> 
> Thanks ! This did the trick :)
> 
> If I am not mistaken, the number of TX queues should be capped by the number of possible cpus?
> 
> It's currently a fixed 128 value, allocating 128*128 = 16384 bytes,
> and polluting "tc -s -d class show dev fiber0" output.
> 
> [PATCH net-next-2.6] ixgbe: Do not allocate too many netdev txqueues
> 
> Instead of allocating 128 struct netdev_queue structures per device, use the minimum
> value between 128 and the number of possible cpus, to reduce RAM usage and
> "tc -s -d class show dev ..." output
> 
> diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
> index ebcec30..ec2508d 100644
> --- a/drivers/net/ixgbe/ixgbe_main.c
> +++ b/drivers/net/ixgbe/ixgbe_main.c
> @@ -5582,7 +5583,10 @@ static int __devinit ixgbe_probe(struct pci_dev *pdev,
>  	pci_set_master(pdev);
>  	pci_save_state(pdev);
>  
> -	netdev = alloc_etherdev_mq(sizeof(struct ixgbe_adapter), MAX_TX_QUEUES);
> +	netdev = alloc_etherdev_mq(sizeof(struct ixgbe_adapter),
> +				   min_t(unsigned int,
> +					 MAX_TX_QUEUES,
> +					 num_possible_cpus()));
>  	if (!netdev) {
>  		err = -ENOMEM;
>  		goto err_alloc_etherdev;
> --
> 
> 

This also fixes the long TC rule loading time for me.

Tested-by: Badalian Vyacheslav <slavon.net@gmail.com>

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: ixgbe question
  2009-11-24 13:14                         ` ixgbe question John Fastabend
@ 2009-11-29  8:18                           ` David Miller
  2009-11-30 13:02                             ` Eric Dumazet
  0 siblings, 1 reply; 67+ messages in thread
From: David Miller @ 2009-11-29  8:18 UTC (permalink / raw)
  To: john.r.fastabend
  Cc: peter.p.waskiewicz.jr, eric.dumazet, robert, hawk, netdev

From: John Fastabend <john.r.fastabend@intel.com>
Date: Tue, 24 Nov 2009 13:14:12 +0000

> Believe the below patch will break DCB and FCoE though, both features
> have the potential to set real_num_tx_queues to greater then the
> number of CPUs.  This could result in real_num_tx_queues >
> num_tx_queues.
> 
> The current solution isn't that great though, maybe we should set to
> the minimum of MAX_TX_QUEUES and num_possible_cpus() * 2 + 8.
> 
> That should cover the maximum possible queues for DCB, FCoE and their
> combinations.
> 
> general multiq = num_possible_cpus()
> DCB = 8 tx queues
> FCoE = 2*num_possible_cpus()
> FCoE + DCB = 8 tx queues + num_possible_cpus

Eric, I'm tossing your patch because of this problem, just FYI.
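
For reference, the cap John describes above works out to
min(MAX_TX_QUEUES, num_possible_cpus() * 2 + 8).  A stand-alone C sketch of
the arithmetic, with 128 assumed for the driver's MAX_TX_QUEUES and a
hypothetical possible-CPU count swept in a loop instead of num_possible_cpus():

#include <stdio.h>

#define MAX_TX_QUEUES	128	/* 82599 hardware queue count assumed here */

static unsigned int min_u(unsigned int a, unsigned int b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned int cpus;

	for (cpus = 4; cpus <= 128; cpus *= 2) {
		/* 2*cpus + 8 covers multiq, DCB, FCoE and FCoE+DCB above */
		unsigned int cap = min_u(MAX_TX_QUEUES, cpus * 2 + 8);

		printf("possible cpus %3u -> netdev tx queues %u\n",
		       cpus, cap);
	}
	return 0;
}

So the allocation stays bounded by the hardware's 128 queues while still
covering the DCB/FCoE combinations.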

^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: ixgbe question
  2009-11-29  8:18                           ` David Miller
@ 2009-11-30 13:02                             ` Eric Dumazet
  2009-11-30 20:20                               ` John Fastabend
  0 siblings, 1 reply; 67+ messages in thread
From: Eric Dumazet @ 2009-11-30 13:02 UTC (permalink / raw)
  To: David Miller
  Cc: john.r.fastabend, peter.p.waskiewicz.jr, robert, hawk, netdev

David Miller wrote:
> From: John Fastabend <john.r.fastabend@intel.com>
> Date: Tue, 24 Nov 2009 13:14:12 +0000
> 
>> I believe the below patch will break DCB and FCoE though, as both features
>> have the potential to set real_num_tx_queues to greater than the
>> number of CPUs.  This could result in real_num_tx_queues >
>> num_tx_queues.
>>
>> The current solution isn't that great though, maybe we should set to
>> the minimum of MAX_TX_QUEUES and num_possible_cpus() * 2 + 8.
>>
>> That should cover the maximum possible queues for DCB, FCoE and their
>> combinations.
>>
>> general multiq = num_possible_cpus()
>> DCB = 8 tx queues
>> FCoE = 2*num_possible_cpus()
>> FCoE + DCB = 8 tx queues + num_possible_cpus
> 
> Eric, I'm tossing your patch because of this problem, just FYI.

Sure, I guess we need a more generic way to handle this.


^ permalink raw reply	[flat|nested] 67+ messages in thread

* Re: ixgbe question
  2009-11-30 13:02                             ` Eric Dumazet
@ 2009-11-30 20:20                               ` John Fastabend
  0 siblings, 0 replies; 67+ messages in thread
From: John Fastabend @ 2009-11-30 20:20 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, Waskiewicz Jr, Peter P, robert, hawk, netdev

Eric Dumazet wrote:
> David Miller wrote:
>   
>> From: John Fastabend <john.r.fastabend@intel.com>
>> Date: Tue, 24 Nov 2009 13:14:12 +0000
>>
>>     
>>> I believe the below patch will break DCB and FCoE though, as both features
>>> have the potential to set real_num_tx_queues to greater than the
>>> number of CPUs.  This could result in real_num_tx_queues >
>>> num_tx_queues.
>>>
>>> The current solution isn't that great though, maybe we should set to
>>> the minimum of MAX_TX_QUEUES and num_possible_cpus() * 2 + 8.
>>>
>>> That should cover the maximum possible queues for DCB, FCoE and their
>>> combinations.
>>>
>>> general multiq = num_possible_cpus()
>>> DCB = 8 tx queues
>>> FCoE = 2*num_possible_cpus()
>>> FCoE + DCB = 8 tx queues + num_possible_cpus
>>>       
>> Eric, I'm tossing your patch because of this problem, just FYI.
>>     
>
> Sure, I guess we need a more generic way to handle this.
>
>   
Eric,

I'll resubmit your patch with a small update to fix my concerns soon.

thanks,
john.

^ permalink raw reply	[flat|nested] 67+ messages in thread

* [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints
@ 2009-11-23  7:12 Peter P Waskiewicz Jr
  0 siblings, 0 replies; 67+ messages in thread
From: Peter P Waskiewicz Jr @ 2009-11-23  7:12 UTC (permalink / raw)
  To: linux-kernel, arjan; +Cc: davem, netdev

This patchset adds a new CPU mask for SMP systems to the irq_desc
struct.  It also exposes an API for underlying device drivers to
assist irqbalance in making smarter decisions when balancing, especially
in a NUMA environment.  For example, an ethernet driver with MSI-X may
wish to limit the CPUs that an interrupt can be balanced within to
stay on a single NUMA node.  Current irqbalance operation can move the
interrupt off the node, resulting in cross-node memory accesses and
locks.

The API is a get/set API within the kernel, along with a /proc entry
for the interrupt.

Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
---

 include/linux/interrupt.h |    8 ++++++
 include/linux/irq.h       |    2 ++
 kernel/irq/manage.c       |   32 +++++++++++++++++++++++++
 kernel/irq/proc.c         |   57 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 99 insertions(+), 0 deletions(-)

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 75f3f00..9fd08aa 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -208,6 +208,8 @@ extern cpumask_var_t irq_default_affinity;
 extern int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask);
 extern int irq_can_set_affinity(unsigned int irq);
 extern int irq_select_affinity(unsigned int irq);
+extern int irq_set_node_affinity(unsigned int irq,
+                                 const struct cpumask *cpumask);
 
 #else /* CONFIG_SMP */
 
@@ -223,6 +225,12 @@ static inline int irq_can_set_affinity(unsigned int irq)
 
 static inline int irq_select_affinity(unsigned int irq)  { return 0; }
 
+static inline int irq_set_node_affinity(unsigned int irq,
+                                        const struct cpumask *m)
+{
+	return -EINVAL;
+}
+
 #endif /* CONFIG_SMP && CONFIG_GENERIC_HARDIRQS */
 
 #ifdef CONFIG_GENERIC_HARDIRQS
diff --git a/include/linux/irq.h b/include/linux/irq.h
index ae9653d..26d7d07 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -166,6 +166,7 @@ struct irq_2_iommu;
  * @lock:		locking for SMP
  * @affinity:		IRQ affinity on SMP
  * @node:		node index useful for balancing
+ * @node_affinity:	irq mask hints for irqbalance
  * @pending_mask:	pending rebalanced interrupts
  * @threads_active:	number of irqaction threads currently running
  * @wait_for_threads:	wait queue for sync_irq to wait for threaded handlers
@@ -196,6 +197,7 @@ struct irq_desc {
 #ifdef CONFIG_SMP
 	cpumask_var_t		affinity;
 	unsigned int		node;
+	cpumask_var_t		node_affinity;
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	cpumask_var_t		pending_mask;
 #endif
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 7305b29..9e80783 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -138,6 +138,38 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 	return 0;
 }
 
+/**
+ *	irq_set_node_affinity - Set the CPU mask this interrupt can run on
+ *	@irq:		Interrupt to modify
+ *	@cpumask:	CPU mask to assign to the interrupt
+ *
+ */
+int irq_set_node_affinity(unsigned int irq, const struct cpumask *cpumask)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+	unsigned long flags;
+
+	spin_lock_irqsave(&desc->lock, flags);
+	cpumask_copy(desc->node_affinity, cpumask);
+	spin_unlock_irqrestore(&desc->lock, flags);
+
+	return 0;
+}
+EXPORT_SYMBOL(irq_set_node_affinity);
+
+/**
+ *	irq_get_node_affinity - Get the CPU mask this interrupt can run on
+ *	@irq:		Interrupt to get information
+ *
+ */
+struct cpumask *irq_get_node_affinity(unsigned int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	return desc->node_affinity;
+}
+EXPORT_SYMBOL(irq_get_node_affinity);
+
 #ifndef CONFIG_AUTO_IRQ_AFFINITY
 /*
  * Generic version of the affinity autoselector.
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 0832145..192e3fb 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -31,6 +31,16 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
+static int irq_node_affinity_proc_show(struct seq_file *m, void *v)
+{
+	struct irq_desc *desc = irq_to_desc((long)m->private);
+	const struct cpumask *mask = desc->node_affinity;
+
+	seq_cpumask(m, mask);
+	seq_putc(m, '\n');
+	return 0;
+}
+
 #ifndef is_affinity_mask_valid
 #define is_affinity_mask_valid(val) 1
 #endif
@@ -78,11 +88,46 @@ free_cpumask:
 	return err;
 }
 
+static ssize_t irq_node_affinity_proc_write(struct file *file,
+		const char __user *buffer, size_t count, loff_t *pos)
+{
+	unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data;
+	cpumask_var_t new_value;
+	int err;
+
+	if (no_irq_affinity || irq_balancing_disabled(irq))
+		return -EIO;
+
+	if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
+		return -ENOMEM;
+
+	err = cpumask_parse_user(buffer, count, new_value);
+	if (err)
+		goto free_cpumask;
+
+	if (!is_affinity_mask_valid(new_value)) {
+		err = -EINVAL;
+		goto free_cpumask;
+	}
+
+	irq_set_node_affinity(irq, new_value);
+	err = count;
+
+free_cpumask:
+	free_cpumask_var(new_value);
+	return err;
+}
+
 static int irq_affinity_proc_open(struct inode *inode, struct file *file)
 {
 	return single_open(file, irq_affinity_proc_show, PDE(inode)->data);
 }
 
+static int irq_node_affinity_proc_open(struct inode *inode, struct file *f)
+{
+	return single_open(f, irq_node_affinity_proc_show, PDE(inode)->data);
+}
+
 static const struct file_operations irq_affinity_proc_fops = {
 	.open		= irq_affinity_proc_open,
 	.read		= seq_read,
@@ -91,6 +136,14 @@ static const struct file_operations irq_affinity_proc_fops = {
 	.write		= irq_affinity_proc_write,
 };
 
+static const struct file_operations irq_node_affinity_proc_fops = {
+	.open		= irq_node_affinity_proc_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.write		= irq_node_affinity_proc_write,
+};
+
 static int default_affinity_show(struct seq_file *m, void *v)
 {
 	seq_cpumask(m, irq_default_affinity);
@@ -230,6 +283,10 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
 	/* create /proc/irq/<irq>/smp_affinity */
 	proc_create_data("smp_affinity", 0600, desc->dir,
 			 &irq_affinity_proc_fops, (void *)(long)irq);
+
+	/* create /proc/irq/<irq>/node_affinity */
+	proc_create_data("node_affinity", 0600, desc->dir,
+	                 &irq_node_affinity_proc_fops, (void *)(long)irq);
 #endif
 
 	proc_create_data("spurious", 0444, desc->dir,

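As a rough illustration of how a driver would consume the API above, the sketch
below hints irqbalance to keep every MSI-X vector of a device on its home NUMA
node.  The adapter structure and helper name are hypothetical; only
irq_set_node_affinity() comes from this patch, while dev_to_node() and
cpumask_of_node() are existing kernel helpers.

	/*
	 * Hypothetical driver init path: store the device's local node
	 * mask as the irqbalance hint for each MSI-X vector.
	 */
	#include <linux/interrupt.h>
	#include <linux/pci.h>
	#include <linux/topology.h>

	struct my_adapter {
		struct pci_dev *pdev;
		int num_vectors;
		struct msix_entry *msix_entries;
	};

	static void my_adapter_set_irq_hints(struct my_adapter *adapter)
	{
		int node = dev_to_node(&adapter->pdev->dev);
		const struct cpumask *mask;
		int i;

		/* Skip the hint if the device has no known home node. */
		if (node < 0)
			return;

		mask = cpumask_of_node(node);
		for (i = 0; i < adapter->num_vectors; i++)
			irq_set_node_affinity(adapter->msix_entries[i].vector,
					      mask);
	}

Using cpumask_of_node() keeps the hint in step with the node's CPU set, so the
driver never has to track individual CPU numbers itself.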

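On the userspace side, a balancer such as irqbalance could read the new
/proc/irq/<irq>/node_affinity file and intersect the hint with whatever mask
it intends to write to smp_affinity.  A minimal reader sketch, assuming only
that the file holds a hex CPU mask in the same format as smp_affinity:

	#include <stdio.h>

	int main(int argc, char **argv)
	{
		char path[64], mask[256];
		FILE *f;

		if (argc != 2) {
			fprintf(stderr, "usage: %s <irq>\n", argv[0]);
			return 1;
		}

		/* The file is created by register_irq_proc() in this patch. */
		snprintf(path, sizeof(path), "/proc/irq/%s/node_affinity",
			 argv[1]);
		f = fopen(path, "r");
		if (!f) {
			perror(path);
			return 1;
		}
		if (fgets(mask, sizeof(mask), f))
			printf("irq %s node_affinity hint: %s", argv[1], mask);
		fclose(f);
		return 0;
	}
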
Thread overview: 67+ messages
2009-11-23  6:46 [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints Peter P Waskiewicz Jr
2009-11-23  7:32 ` Yong Zhang
2009-11-23  7:32   ` Yong Zhang
2009-11-23  9:36   ` Peter P Waskiewicz Jr
2009-11-23 10:21     ` ixgbe question Eric Dumazet
2009-11-23 10:30       ` Badalian Vyacheslav
2009-11-23 10:34       ` Waskiewicz Jr, Peter P
2009-11-23 10:37         ` Eric Dumazet
2009-11-23 14:05           ` Eric Dumazet
2009-11-23 21:26           ` David Miller
2009-11-23 14:10       ` Jesper Dangaard Brouer
2009-11-23 14:38         ` Eric Dumazet
2009-11-23 18:30           ` robert
2009-11-23 16:59             ` Eric Dumazet
2009-11-23 20:54               ` robert
2009-11-23 21:28                 ` David Miller
2009-11-23 22:14                   ` Robert Olsson
2009-11-23 23:28               ` Waskiewicz Jr, Peter P
2009-11-23 23:44                 ` David Miller
2009-11-24  7:46                 ` Eric Dumazet
2009-11-24  8:46                   ` Badalian Vyacheslav
2009-11-24  9:07                   ` Peter P Waskiewicz Jr
2009-11-24  9:55                     ` Eric Dumazet
2009-11-24 10:06                       ` Peter P Waskiewicz Jr
2009-11-24 11:37                         ` [PATCH net-next-2.6] ixgbe: Fix TX stats accounting Eric Dumazet
2009-11-24 13:23                           ` Eric Dumazet
2009-11-25  7:38                             ` Jeff Kirsher
2009-11-25  9:31                               ` Eric Dumazet
2009-11-25  9:38                                 ` Jeff Kirsher
2009-11-24 13:14                         ` ixgbe question John Fastabend
2009-11-29  8:18                           ` David Miller
2009-11-30 13:02                             ` Eric Dumazet
2009-11-30 20:20                               ` John Fastabend
2009-11-26 14:10                       ` Badalian Vyacheslav
2009-11-23 17:05     ` [PATCH] irq: Add node_affinity CPU masks for smarter irqbalance hints Peter Zijlstra
2009-11-23 23:32       ` Waskiewicz Jr, Peter P
2009-11-24  8:38         ` Peter Zijlstra
2009-11-24  8:59           ` Peter P Waskiewicz Jr
2009-11-24  9:08             ` Peter Zijlstra
2009-11-24  9:15               ` Peter P Waskiewicz Jr
2009-11-24 14:43               ` Arjan van de Ven
2009-11-24  9:15             ` Peter Zijlstra
2009-11-24 10:07             ` Thomas Gleixner
2009-11-24 17:55               ` Peter P Waskiewicz Jr
2009-11-25 11:18               ` Peter Zijlstra
2009-11-24  6:07       ` Arjan van de Ven
2009-11-24  8:39         ` Peter Zijlstra
2009-11-24 14:42           ` Arjan van de Ven
2009-11-24 17:39           ` David Miller
2009-11-24 17:56             ` Peter P Waskiewicz Jr
2009-11-24 18:26               ` Eric Dumazet
2009-11-24 18:33                 ` Peter P Waskiewicz Jr
2009-11-24 19:01                   ` Eric Dumazet
2009-11-24 19:53                     ` Peter P Waskiewicz Jr
2009-11-24 18:54                 ` David Miller
2009-11-24 18:58                   ` Eric Dumazet
2009-11-24 20:35                     ` Andi Kleen
2009-11-24 20:46                       ` Eric Dumazet
2009-11-25 10:30                         ` Eric Dumazet
2009-11-25 10:37                           ` Andi Kleen
2009-11-25 11:35                             ` Eric Dumazet
2009-11-25 11:50                               ` Andi Kleen
2009-11-26 11:43                                 ` Eric Dumazet
2009-11-24  5:17     ` Yong Zhang
2009-11-24  5:17       ` Yong Zhang
2009-11-24  8:39       ` Peter P Waskiewicz Jr
2009-11-23  7:12 Peter P Waskiewicz Jr
