linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC] clustered apic irq affinity fix for i386
@ 2003-04-30 23:07 Keith Mannthey
  2003-04-30 23:36 ` Andrew Morton
  0 siblings, 1 reply; 10+ messages in thread
From: Keith Mannthey @ 2003-04-30 23:07 UTC (permalink / raw)
  To: linux-kernel

Hello all,
	Machines with clustered apics are buggy when it comes to setting irq
affinity.  The function set_ioapic_affinity gladly writes the bottom 8
bits to the ioapic.  This workes great for small smp boxes that are in
flat apic mode but can cause a bit of trouble for clustered apic boxes
(including hanging the box).  
	What I propose is some minor checking for clustered apic sub arches. 
Before writing to the apic making sure the value came from kirqd (by
checking pending_irq_balance_apic[irq] against the mask we are about to
write). kirqd uses cpu_to_logical_apicid to setup the ioapic writes so
we trust those values.  If irqbalance_disabled is true we allow writing
of any masks, we trust the /proc interface user knows what they are
doing.
	This is a fairly minimal patch that should still allow both kirqd and
userspace irq balancing.  I would like to see a fix for this problem go
into the kernel. As you can tell this patch is against 2.5.68 stock. 
  
Thanks,
  Keith Mannthey 

diff -urN linux-2.5.68/arch/i386/kernel/io_apic.c linux-2.5.68-irqfix/arch/i386/kernel/io_apic.c
--- linux-2.5.68/arch/i386/kernel/io_apic.c	Sat Apr 19 19:49:09 2003
+++ linux-2.5.68-irqfix/arch/i386/kernel/io_apic.c	Thu May  1 19:09:43 2003
@@ -240,29 +240,6 @@
 			clear_IO_APIC_pin(apic, pin);
 }
 
-static void set_ioapic_affinity (unsigned int irq, unsigned long mask)
-{
-	unsigned long flags;
-	int pin;
-	struct irq_pin_list *entry = irq_2_pin + irq;
-
-	/*
-	 * Only the first 8 bits are valid.
-	 */
-	mask = mask << 24;
-	spin_lock_irqsave(&ioapic_lock, flags);
-	for (;;) {
-		pin = entry->pin;
-		if (pin == -1)
-			break;
-		io_apic_write(entry->apic, 0x10 + 1 + pin*2, mask);
-		if (!entry->next)
-			break;
-		entry = irq_2_pin + entry->next;
-	}
-	spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
 #if defined(CONFIG_SMP)
 # include <asm/processor.h>	/* kernel_thread() */
 # include <linux/kernel_stat.h>	/* kstat */
@@ -672,6 +649,26 @@
 #endif /* defined(CONFIG_SMP) */
 
 
+static void set_ioapic_affinity (unsigned int irq, unsigned long mask)
+{
+	unsigned long flags;
+	int pin;
+	struct irq_pin_list *entry = irq_2_pin + irq;
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	for (;;) {
+		pin = entry->pin;
+		if (pin == -1)
+			break;
+		io_apic_write_affinity(entry->apic, 0x10 + 1 + pin*2, mask,irq);
+		if (!entry->next)
+			break;
+		entry = irq_2_pin + entry->next;
+	}
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+
 /*
  * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
  * specific CPU-side IRQs.
@@ -822,6 +819,7 @@
 			if (irq_entry == -1)
 				continue;
 			irq = pin_2_irq(irq_entry, ioapic, pin);
+			pending_irq_balance_apicid[irq] = mask;
 			set_ioapic_affinity(irq, mask);
 		}
 
diff -urN linux-2.5.68/include/asm-i386/mach-bigsmp/mach_apic.h linux-2.5.68-irqfix/include/asm-i386/mach-bigsmp/mach_apic.h
--- linux-2.5.68/include/asm-i386/mach-bigsmp/mach_apic.h	Sat Apr 19 19:51:08 2003
+++ linux-2.5.68-irqfix/include/asm-i386/mach-bigsmp/mach_apic.h	Thu May  1 19:12:37 2003
@@ -115,4 +115,21 @@
 	return (1);
 }
 
+extern int __cacheline_aligned pending_irq_balance_apicid[];
+extern int irqbalance_disabled;
+/*we want to be careful what we write we are in clustered mode
+ *if the mask came from pending_irq_balance_apicid we are ok because
+ *it was generated with cpu_to_logical_apicid*/
+static inline void io_apic_write_affinity(unsigned int apic, unsigned int reg,  unsigned int mask, unsigned int irq)
+{
+       if ((pending_irq_balance_apicid[irq] == mask) || (irqbalance_disabled))
+       {
+               mask = mask << 24;
+               io_apic_write(apic,reg,mask);
+       }
+       else
+       {
+               printk ("Trying to write abartry affinity value to ioapic! Not allowed!");
+       }
+}
 #endif /* __ASM_MACH_APIC_H */
diff -urN linux-2.5.68/include/asm-i386/mach-default/mach_apic.h linux-2.5.68-irqfix/include/asm-i386/mach-default/mach_apic.h
--- linux-2.5.68/include/asm-i386/mach-default/mach_apic.h	Sat Apr 19 19:51:19 2003
+++ linux-2.5.68-irqfix/include/asm-i386/mach-default/mach_apic.h	Thu May  1 19:18:44 2003
@@ -99,4 +99,11 @@
 	return test_bit(boot_cpu_physical_apicid, &phys_cpu_present_map);
 }
 
+static inline void io_apic_write_affinity(unsigned int apic, unsigned int reg,  unsigned int mask,int irq)
+{
+	/*only the first 8 bits are valid*/
+	mask = mask << 24;
+	io_apic_write(apic,reg,mask);
+}
+
 #endif /* __ASM_MACH_APIC_H */
diff -urN linux-2.5.68/include/asm-i386/mach-numaq/mach_apic.h linux-2.5.68-irqfix/include/asm-i386/mach-numaq/mach_apic.h
--- linux-2.5.68/include/asm-i386/mach-numaq/mach_apic.h	Sat Apr 19 19:49:17 2003
+++ linux-2.5.68-irqfix/include/asm-i386/mach-numaq/mach_apic.h	Thu May  1 19:13:40 2003
@@ -103,4 +103,22 @@
 	return (1);
 }
 
+extern int __cacheline_aligned pending_irq_balance_apicid[];
+extern int irqbalance_disabled;
+/*we want to be careful what we write we are in clustered mode
+ *if the mask came from pending_irq_balance_apicid we are ok because
+ *it was generated with cpu_to_logical_apicid*/
+static inline void io_apic_write_affinity(unsigned int apic, unsigned int reg,  unsigned int mask, unsigned int irq)
+{
+       if ((pending_irq_balance_apicid[irq] == mask) || (irqbalance_disabled))
+       {
+               mask = mask << 24;
+               io_apic_write(apic,reg,mask);
+       }
+       else
+       {
+               printk ("Trying to write abartry affinity value to ioapic! Not allowed!");
+       }
+}
+
 #endif /* __ASM_MACH_APIC_H */
diff -urN linux-2.5.68/include/asm-i386/mach-summit/mach_apic.h linux-2.5.68-irqfix/include/asm-i386/mach-summit/mach_apic.h
--- linux-2.5.68/include/asm-i386/mach-summit/mach_apic.h	Sat Apr 19 19:50:06 2003
+++ linux-2.5.68-irqfix/include/asm-i386/mach-summit/mach_apic.h	Thu May  1 19:11:17 2003
@@ -113,4 +113,24 @@
 		return test_bit(boot_cpu_physical_apicid, &phys_cpu_present_map);
 }
 
+
+extern int __cacheline_aligned pending_irq_balance_apicid[];
+extern int irqbalance_disabled;
+/*we want to be careful what we write we are in clustered mode
+ *if the mask came from pending_irq_balance_apicid we are ok because
+ *it was generated with cpu_to_logical_apicid*/
+static inline void io_apic_write_affinity(unsigned int apic, unsigned int reg,  unsigned int mask, unsigned int irq)
+{
+       if ((pending_irq_balance_apicid[irq] == mask) || (irqbalance_disabled))
+       {
+               mask = mask << 24;
+               io_apic_write(apic,reg,mask);
+       }
+       else
+       {
+               printk ("Trying to write abartry affinity value to ioapic! Not allowed!");
+       }
+}
+						
+
 #endif /* __ASM_MACH_APIC_H */
diff -urN linux-2.5.68/include/asm-i386/mach-visws/mach_apic.h linux-2.5.68-irqfix/include/asm-i386/mach-visws/mach_apic.h
--- linux-2.5.68/include/asm-i386/mach-visws/mach_apic.h	Sat Apr 19 19:48:49 2003
+++ linux-2.5.68-irqfix/include/asm-i386/mach-visws/mach_apic.h	Thu May  1 19:18:32 2003
@@ -77,4 +77,11 @@
 	return test_bit(boot_cpu_physical_apicid, &phys_cpu_present_map);
 }
 
+static inline void io_apic_write_affinity(unsigned int apic, unsigned int reg,  unsigned int mask, int irq)
+{
+	/*only the first 8 bits are valid*/
+	mask = mask << 24;
+	io_apic_write(apic,reg,mask);
+}
+
 #endif /* __ASM_MACH_APIC_H */




^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC] clustered apic irq affinity fix for i386
  2003-04-30 23:07 [RFC] clustered apic irq affinity fix for i386 Keith Mannthey
@ 2003-04-30 23:36 ` Andrew Morton
  2003-05-01  1:05   ` Keith Mannthey
  2003-05-01  9:10   ` [RFC] clustered apic irq affinity fix for i386 Arjan van de Ven
  0 siblings, 2 replies; 10+ messages in thread
From: Andrew Morton @ 2003-04-30 23:36 UTC (permalink / raw)
  To: Keith Mannthey; +Cc: linux-kernel

Keith Mannthey <kmannth@us.ibm.com> wrote:
>
> Hello all,
> 	Machines with clustered apics are buggy when it comes to setting irq
> affinity.

You stand accused of crimes against whitespace.

- The patch uses eight-spaces everywhere.  Please use eight-stop tabs.

- Funny comment format:

/*we want to be careful what we write we are in clustered mode
 *if the mask came from pending_irq_balance_apicid we are ok because
 *it was generated with cpu_to_logical_apicid*/

 should be

/*
 * We want to be careful what we write we are in clustered mode if the mask
 * came from pending_irq_balance_apicid we are ok because it was generated
 * with cpu_to_logical_apicid.
 */

- 1e16-column xterms.  Please aim for 80-columns:

static inline void io_apic_write_affinity(unsigned int apic, unsigned int reg,  unsigned int mask, unsigned int irq)

  should be:

static inline void io_apic_write_affinity(unsigned int apic,
		unsigned int reg,  unsigned int mask, unsigned int irq)

  or

static inline void
io_apic_write_affinity(unsigned int apic, unsigned int reg,
			unsigned int mask, unsigned int irq)


- non-K&R braces:


       if ((pending_irq_balance_apicid[irq] == mask) || (irqbalance_disabled))
       {
               mask = mask << 24;
               io_apic_write(apic,reg,mask);
       }
       else
       {
               printk ("Trying to write abartry affinity value to ioapic! Not allowed!");
       }
}

  should be:

       if ((pending_irq_balance_apicid[irq] == mask) || irqbalance_disabled) {
               mask = mask << 24;
               io_apic_write(apic,reg,mask);
       } else {
               printk("Trying to write abartry affinity value to ioapic! Not allowed!");
       }

  (note: no space between "printk" and "(")

  (s/abartry/arbitrary/)

  (best replace that printk with a BUG() or a WARN_ON(1))

  (or split the string up so it fits in 80-cols)

- Funny comments:

	/*only the first 8 bits are valid*/

  should be:

	/* Only the first 8 bits are valid */


Could you please take a look at all that and resend?

Thanks.


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC] clustered apic irq affinity fix for i386
  2003-04-30 23:36 ` Andrew Morton
@ 2003-05-01  1:05   ` Keith Mannthey
  2003-05-01  2:22     ` Andrew Morton
  2003-05-01  9:10   ` [RFC] clustered apic irq affinity fix for i386 Arjan van de Ven
  1 sibling, 1 reply; 10+ messages in thread
From: Keith Mannthey @ 2003-05-01  1:05 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel


> You stand accused of crimes against whitespace.

Yea I guess a little :).

> Could you please take a look at all that and resend?

This should be better. Thanks for the comments. 

Keith 

diff -urN linux-2.5.68/arch/i386/kernel/io_apic.c linux-2.5.68-irqfix/arch/i386/kernel/io_apic.c
--- linux-2.5.68/arch/i386/kernel/io_apic.c	Sat Apr 19 19:49:09 2003
+++ linux-2.5.68-irqfix/arch/i386/kernel/io_apic.c	Thu May  1 21:40:32 2003
@@ -240,29 +240,6 @@
 			clear_IO_APIC_pin(apic, pin);
 }
 
-static void set_ioapic_affinity (unsigned int irq, unsigned long mask)
-{
-	unsigned long flags;
-	int pin;
-	struct irq_pin_list *entry = irq_2_pin + irq;
-
-	/*
-	 * Only the first 8 bits are valid.
-	 */
-	mask = mask << 24;
-	spin_lock_irqsave(&ioapic_lock, flags);
-	for (;;) {
-		pin = entry->pin;
-		if (pin == -1)
-			break;
-		io_apic_write(entry->apic, 0x10 + 1 + pin*2, mask);
-		if (!entry->next)
-			break;
-		entry = irq_2_pin + entry->next;
-	}
-	spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
 #if defined(CONFIG_SMP)
 # include <asm/processor.h>	/* kernel_thread() */
 # include <linux/kernel_stat.h>	/* kstat */
@@ -671,6 +648,25 @@
 static inline void move_irq(int irq) { }
 #endif /* defined(CONFIG_SMP) */
 
+static void set_ioapic_affinity (unsigned int irq, unsigned long mask)
+{
+	unsigned long flags;
+	int pin;
+	struct irq_pin_list *entry = irq_2_pin + irq;
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	for (;;) {
+		pin = entry->pin;
+		if (pin == -1)
+			break;
+		io_apic_write_affinity(entry->apic, 0x10 + 1 + pin*2, mask,irq);
+		if (!entry->next)
+			break;
+		entry = irq_2_pin + entry->next;
+	}
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
 
 /*
  * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
@@ -822,6 +818,7 @@
 			if (irq_entry == -1)
 				continue;
 			irq = pin_2_irq(irq_entry, ioapic, pin);
+			pending_irq_balance_apicid[irq] = mask;
 			set_ioapic_affinity(irq, mask);
 		}
 
diff -urN linux-2.5.68/include/asm-i386/mach-bigsmp/mach_apic.h linux-2.5.68-irqfix/include/asm-i386/mach-bigsmp/mach_apic.h
--- linux-2.5.68/include/asm-i386/mach-bigsmp/mach_apic.h	Sat Apr 19 19:51:08 2003
+++ linux-2.5.68-irqfix/include/asm-i386/mach-bigsmp/mach_apic.h	Thu May  1 21:22:35 2003
@@ -115,4 +115,26 @@
 	return (1);
 }
 
+extern int __cacheline_aligned pending_irq_balance_apicid[];
+extern int irqbalance_disabled;
+/*
+ * We want to be careful what we write we are in clustered mode
+ * if the mask came from pending_irq_balance_apicid we are ok because
+ * it was generated with cpu_to_logical_apicid
+ */
+static inline void io_apic_write_affinity(unsigned int apic, 
+		unsigned int reg,  unsigned int mask, unsigned int irq)
+{
+	if ((pending_irq_balance_apicid[irq] == mask) || irqbalance_disabled) {
+		mask = mask << 24;
+		io_apic_write(apic,reg,mask);
+	} else {
+		/*
+		 * You are in clustered apic mode don't write arbitrary affinity
+		 * values to the apic with irqbalancing enabled
+		 */
+		BUG();
+	}
+}
+
 #endif /* __ASM_MACH_APIC_H */
diff -urN linux-2.5.68/include/asm-i386/mach-default/mach_apic.h linux-2.5.68-irqfix/include/asm-i386/mach-default/mach_apic.h
--- linux-2.5.68/include/asm-i386/mach-default/mach_apic.h	Sat Apr 19 19:51:19 2003
+++ linux-2.5.68-irqfix/include/asm-i386/mach-default/mach_apic.h	Thu May  1 21:38:34 2003
@@ -99,4 +99,12 @@
 	return test_bit(boot_cpu_physical_apicid, &phys_cpu_present_map);
 }
 
+static inline void io_apic_write_affinity(unsigned int apic, 
+			unsigned int reg,  unsigned int mask,int irq)
+{
+	/* Only the first 8 bits are valid */
+	mask = mask << 24;
+	io_apic_write(apic,reg,mask);
+}
+
 #endif /* __ASM_MACH_APIC_H */
diff -urN linux-2.5.68/include/asm-i386/mach-numaq/mach_apic.h linux-2.5.68-irqfix/include/asm-i386/mach-numaq/mach_apic.h
--- linux-2.5.68/include/asm-i386/mach-numaq/mach_apic.h	Sat Apr 19 19:49:17 2003
+++ linux-2.5.68-irqfix/include/asm-i386/mach-numaq/mach_apic.h	Thu May  1 21:18:25 2003
@@ -103,4 +103,26 @@
 	return (1);
 }
 
+extern int __cacheline_aligned pending_irq_balance_apicid[];
+extern int irqbalance_disabled;
+/*
+ * We want to be careful what we write we are in clustered mode
+ * if the mask came from pending_irq_balance_apicid we are ok because
+ * it was generated with cpu_to_logical_apicid
+ */
+static inline void io_apic_write_affinity(unsigned int apic, 
+		unsigned int reg,  unsigned int mask, unsigned int irq)
+{
+	if ((pending_irq_balance_apicid[irq] == mask) || irqbalance_disabled) {
+		mask = mask << 24;
+		io_apic_write(apic,reg,mask);
+	} else {
+		/*
+		 * You are in clustered apic mode don't write arbitrary affinity
+		 * values to the apic with irqbalancing enabled
+		 */
+		BUG();
+	}
+}
+
 #endif /* __ASM_MACH_APIC_H */
diff -urN linux-2.5.68/include/asm-i386/mach-summit/mach_apic.h linux-2.5.68-irqfix/include/asm-i386/mach-summit/mach_apic.h
--- linux-2.5.68/include/asm-i386/mach-summit/mach_apic.h	Sat Apr 19 19:50:06 2003
+++ linux-2.5.68-irqfix/include/asm-i386/mach-summit/mach_apic.h	Thu May  1 21:19:53 2003
@@ -113,4 +113,27 @@
 		return test_bit(boot_cpu_physical_apicid, &phys_cpu_present_map);
 }
 
+
+extern int __cacheline_aligned pending_irq_balance_apicid[];
+extern int irqbalance_disabled;
+/*
+ * We want to be careful what we write we are in clustered mode
+ * if the mask came from pending_irq_balance_apicid we are ok because
+ * it was generated with cpu_to_logical_apicid
+ */
+static inline void io_apic_write_affinity(unsigned int apic, 
+		unsigned int reg,  unsigned int mask, unsigned int irq)
+{
+	if ((pending_irq_balance_apicid[irq] == mask) || irqbalance_disabled) {
+		mask = mask << 24;
+		io_apic_write(apic,reg,mask);
+	} else {
+		/*
+		 * You are in clustered apic mode don't write arbitrary affinity
+		 * values to the apic with irqbalancing enabled
+		 */
+		BUG();
+	}
+}
+
 #endif /* __ASM_MACH_APIC_H */
diff -urN linux-2.5.68/include/asm-i386/mach-visws/mach_apic.h linux-2.5.68-irqfix/include/asm-i386/mach-visws/mach_apic.h
--- linux-2.5.68/include/asm-i386/mach-visws/mach_apic.h	Sat Apr 19 19:48:49 2003
+++ linux-2.5.68-irqfix/include/asm-i386/mach-visws/mach_apic.h	Thu May  1 21:39:00 2003
@@ -77,4 +77,12 @@
 	return test_bit(boot_cpu_physical_apicid, &phys_cpu_present_map);
 }
 
+static inline void io_apic_write_affinity(unsigned int apic, 
+			unsigned int reg,  unsigned int mask, int irq)
+{
+	/* Only the first 8 bits are valid */
+	mask = mask << 24;
+	io_apic_write(apic,reg,mask);
+}
+
 #endif /* __ASM_MACH_APIC_H */


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC] clustered apic irq affinity fix for i386
  2003-05-01  1:05   ` Keith Mannthey
@ 2003-05-01  2:22     ` Andrew Morton
  2003-05-01  2:49       ` William Lee Irwin III
                         ` (3 more replies)
  0 siblings, 4 replies; 10+ messages in thread
From: Andrew Morton @ 2003-05-01  2:22 UTC (permalink / raw)
  To: Keith Mannthey; +Cc: linux-kernel

Keith Mannthey <kmannth@us.ibm.com> wrote:
>
> This should be better. Thanks for the comments. 

Remind me again what the patch actually does?  It seems to be purely adding
debug checks?

Won't it just go BUG if someone boots the kernel and then tries to manually
set affinity?

Seems a bit racy too. setup_ioapic_dest() does:

                        pending_irq_balance_apicid[irq] = mask;
        ==> window here
                        set_ioapic_affinity(irq, mask);

ioapic_lock is not held, so there is a window where
pending_irq_balance_apicid[irq] can be set to some other value and
io_apic_write_affinity() will accidentally go BUG.


Is it not possible to fix set_ioapic_affinity() for real for clustered APIC
mode?  What is involved in that?


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC] clustered apic irq affinity fix for i386
  2003-05-01  2:22     ` Andrew Morton
@ 2003-05-01  2:49       ` William Lee Irwin III
  2003-05-01  4:07       ` Martin J. Bligh
                         ` (2 subsequent siblings)
  3 siblings, 0 replies; 10+ messages in thread
From: William Lee Irwin III @ 2003-05-01  2:49 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Keith Mannthey, linux-kernel

On Wed, Apr 30, 2003 at 07:22:05PM -0700, Andrew Morton wrote:
> Is it not possible to fix set_ioapic_affinity() for real for clustered APIC
> mode?  What is involved in that?

The clustered hierarchical destination format formats destinations as

bits | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
--------------------------------------
     | cluster id no | local cpumask |
--------------------------------------

which can only describe 15*16 of the 2**60 nonempty subsets of 60 cpus.


-- wli

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC] clustered apic irq affinity fix for i386
  2003-05-01  2:22     ` Andrew Morton
  2003-05-01  2:49       ` William Lee Irwin III
@ 2003-05-01  4:07       ` Martin J. Bligh
  2003-05-01 17:51       ` Keith Mannthey
  2003-05-06  0:04       ` [RFC][PATCH] fix for clusterd io_apics Keith Mannthey
  3 siblings, 0 replies; 10+ messages in thread
From: Martin J. Bligh @ 2003-05-01  4:07 UTC (permalink / raw)
  To: Andrew Morton, Keith Mannthey; +Cc: linux-kernel

>> This should be better. Thanks for the comments. 
> 
> Remind me again what the patch actually does?  It seems to be purely
> adding debug checks?
> 
> Won't it just go BUG if someone boots the kernel and then tries to
> manually set affinity?

Just ignoring the idiot caller would seem better, IMHO. BUG is a little
extreme ;-) Personally I'm happy for clustered apic mode machines with
irqbalance *disabled* to just fail the call. With it enabled, they can just
fraggle the affinity for irqbalance, and be happy.

> Seems a bit racy too. setup_ioapic_dest() does:
> 
>                         pending_irq_balance_apicid[irq] = mask;
>         ==> window here
>                         set_ioapic_affinity(irq, mask);
> 
> ioapic_lock is not held, so there is a window where
> pending_irq_balance_apicid[irq] can be set to some other value and
> io_apic_write_affinity() will accidentally go BUG.
> 
> 
> Is it not possible to fix set_ioapic_affinity() for real for clustered
> APIC mode?  What is involved in that?

The hardware doesn't support arbitrary cpumasks. However, as long as
we're running irqbalance, it doesn't matter  - we only do one cpu at a time
anyway.

The existing code in the mainline kernel is wrong for platforms other than
clustered apic mode too. irqbalance is happily rotating us one cpu at a
time, and then we go along and put our big masky foot in the thing, and
splat it across multiple CPUs. If irqbalance is on, all we need to do is
set up the irqbalance mask, then force a rebalance immediately.

M.


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC] clustered apic irq affinity fix for i386
  2003-04-30 23:36 ` Andrew Morton
  2003-05-01  1:05   ` Keith Mannthey
@ 2003-05-01  9:10   ` Arjan van de Ven
  1 sibling, 0 replies; 10+ messages in thread
From: Arjan van de Ven @ 2003-05-01  9:10 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Keith Mannthey, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 390 bytes --]

On Thu, 2003-05-01 at 01:36, Andrew Morton wrote:
> Keith Mannthey <kmannth@us.ibm.com> wrote:
> >
> > Hello all,
> > 	Machines with clustered apics are buggy when it comes to setting irq
> > affinity.
> 
> You stand accused of crimes against whitespace.

The final verdict is: guilty. The punishment will consist of being
forced to read the sourcecode of indent for one hour.


[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC] clustered apic irq affinity fix for i386
  2003-05-01  2:22     ` Andrew Morton
  2003-05-01  2:49       ` William Lee Irwin III
  2003-05-01  4:07       ` Martin J. Bligh
@ 2003-05-01 17:51       ` Keith Mannthey
  2003-05-06  0:04       ` [RFC][PATCH] fix for clusterd io_apics Keith Mannthey
  3 siblings, 0 replies; 10+ messages in thread
From: Keith Mannthey @ 2003-05-01 17:51 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel

> Remind me again what the patch actually does?  It seems to be purely adding
> debug checks?



> Won't it just go BUG if someone boots the kernel and then tries to manually
> set affinity?

  I would assume if someone was setting the affinity from /proc they
would have disabled the kernel irqbalancer and the flag
irqbalance_disabled would be set.
  What I am trying to avoid is a branching for both kirqd and user space
setting affinity.  I see though I hadn't fully thought out user trying
to set affinity and have for following idea.  This would also make
Martin happy :)  

--- ../linux-2.5.68/arch/i386/kernel/irq.c	Sat Apr 19 19:48:50 2003
+++ arch/i386/kernel/irq.c	Fri May  2 14:01:12 2003
@@ -871,8 +871,11 @@
 		return -EINVAL;
 
 	irq_affinity[irq] = new_value;
-	irq_desc[irq].handler->set_affinity(irq, new_value);
-
+	if (irqbalance_disabled) {
+		irq_desc[irq].handler->set_affinity(irq, new_value);
+	} else {
+		do_irq_balance();
+	}	
 	return full_count;
 }
 

 This would make the /proc call tie into kirqd in cases where the
irq_balance has not been disabled. 

 
> Seems a bit racy too. setup_ioapic_dest() does:
> 
>                         pending_irq_balance_apicid[irq] = mask;
>         ==> window here
>                         set_ioapic_affinity(irq, mask);
> 
> ioapic_lock is not held, so there is a window where
> pending_irq_balance_apicid[irq] can be set to some other value and
> io_apic_write_affinity() will accidentally go BUG.
  
  It don't think it is the ioapic_lock we would need to hold it would be
the desc->lock.  This is also early in the boot sequence, but I could
grab the lock anyway.  

> Is it not possible to fix set_ioapic_affinity() for real for clustered APIC
> mode?  What is involved in that?
 
   The real issue is in clustered apic mode not all possible masks are
possible.  If flat mode you have 8 bits (00000000) where each bit
represents a cpu.  In clusterd mode you have 8 bits (0000-node id
0000-cpu id) also.  If the cpu mask you want to write has cpus on
different nodes it impossible (except in special cases) to write that
value the ioapic.  
  In clustered apic mode I can only mask an irq to a single cpu, cpus on
the same node, all cpus, or I beleive the Nth cpu on every node.  It
can't do everything flat apic mode can.  This makes it difficult to
decide what a real fix should be.  


Keith 


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [RFC][PATCH] fix for clusterd io_apics
  2003-05-01  2:22     ` Andrew Morton
                         ` (2 preceding siblings ...)
  2003-05-01 17:51       ` Keith Mannthey
@ 2003-05-06  0:04       ` Keith Mannthey
  2003-05-06 15:15         ` Martin J. Bligh
  3 siblings, 1 reply; 10+ messages in thread
From: Keith Mannthey @ 2003-05-06  0:04 UTC (permalink / raw)
  To: lkml; +Cc: Martin J. Bligh, Andrew Morton, James Cleverdon

Hello all,
  The following is a patch to fix inconsistent use of the function
set_ioapic_affinity. In the current kernel it is unclear as to weather
the value being passed to the function is a cpu mask or valid apic id. 
In irq_affinity_write_proc the kernel passes on a cpu mask but the kirqd
thread passes on logical apic ids.  In flat apic mode this is not an
issue because a cpu mask represents the apic value.  However in
clustered apic mode the cpu mask is very different from the logical apic
id.  
  This is an attempt to do the right thing for clustered apics.  I
clarify that the value being passed to set_ioapic_affinity is a cpu mask
not a apicid.  Set_ioapic_affinity will do the conversion to logical
apic ids.  Since many cpu masks don't map to valid apicids in clustered
apic mode TARGET_CPUS is used as a default value when such a situation
occurs.  I think this is a good step in making irq_affinity clustered
apic safe.
  
Keith 

diff -urN linux-2.5.68/arch/i386/kernel/io_apic.c linux-2.5.68-fix_irq_affinity/arch/i386/kernel/io_apic.c
--- linux-2.5.68/arch/i386/kernel/io_apic.c	Sat Apr 19 19:49:09 2003
+++ linux-2.5.68-fix_irq_affinity/arch/i386/kernel/io_apic.c	Tue May  6 23:10:18 2003
@@ -240,22 +240,22 @@
 			clear_IO_APIC_pin(apic, pin);
 }
 
-static void set_ioapic_affinity (unsigned int irq, unsigned long mask)
+static void set_ioapic_affinity (unsigned int irq, unsigned long cpu_mask)
 {
 	unsigned long flags;
 	int pin;
 	struct irq_pin_list *entry = irq_2_pin + irq;
-
-	/*
-	 * Only the first 8 bits are valid.
-	 */
-	mask = mask << 24;
+	unsigned int apicid_value;
+	
+	apicid_value = cpu_mask_to_apicid(cpu_mask);
+	/* Prepare to do the io_apic_write */
+	apicid_value = apicid_value << 24;
 	spin_lock_irqsave(&ioapic_lock, flags);
 	for (;;) {
 		pin = entry->pin;
 		if (pin == -1)
 			break;
-		io_apic_write(entry->apic, 0x10 + 1 + pin*2, mask);
+		io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value);
 		if (!entry->next)
 			break;
 		entry = irq_2_pin + entry->next;
@@ -279,7 +279,7 @@
 
 extern unsigned long irq_affinity[NR_IRQS];
 
-static int __cacheline_aligned pending_irq_balance_apicid[NR_IRQS];
+static int __cacheline_aligned pending_irq_balance_cpumask[NR_IRQS];
 static int irqbalance_disabled = NO_BALANCE_IRQ;
 static int physical_balance = 0;
 
@@ -352,7 +352,7 @@
 		unsigned long flags;
 
 		spin_lock_irqsave(&desc->lock, flags);
-		pending_irq_balance_apicid[irq]=cpu_to_logical_apicid(new_cpu);
+		pending_irq_balance_cpumask[irq] = 1 << new_cpu;
 		spin_unlock_irqrestore(&desc->lock, flags);
 	}
 }
@@ -549,8 +549,7 @@
 				selected_irq, min_loaded);
 		/* mark for change destination */
 		spin_lock_irqsave(&desc->lock, flags);
-		pending_irq_balance_apicid[selected_irq] =
-				cpu_to_logical_apicid(min_loaded);
+		pending_irq_balance_cpumask[selected_irq] = 1 << min_loaded;
 		spin_unlock_irqrestore(&desc->lock, flags);
 		/* Since we made a change, come back sooner to 
 		 * check for more variation.
@@ -582,7 +581,7 @@
 	
 	/* push everything to CPU 0 to give us a starting point.  */
 	for (i = 0 ; i < NR_IRQS ; i++)
-		pending_irq_balance_apicid[i] = cpu_to_logical_apicid(0);
+		pending_irq_balance_cpumask[i] = 1;
 
 repeat:
 	set_current_state(TASK_INTERRUPTIBLE);
@@ -659,9 +658,9 @@
 static inline void move_irq(int irq)
 {
 	/* note - we hold the desc->lock */
-	if (unlikely(pending_irq_balance_apicid[irq])) {
-		set_ioapic_affinity(irq, pending_irq_balance_apicid[irq]);
-		pending_irq_balance_apicid[irq] = 0;
+	if (unlikely(pending_irq_balance_cpumask[irq])) {
+		set_ioapic_affinity(irq, pending_irq_balance_cpumask[irq]);
+		pending_irq_balance_cpumask[irq] = 0;
 	}
 }
 
diff -urN linux-2.5.68/include/asm-i386/mach-bigsmp/mach_apic.h linux-2.5.68-fix_irq_affinity/include/asm-i386/mach-bigsmp/mach_apic.h
--- linux-2.5.68/include/asm-i386/mach-bigsmp/mach_apic.h	Sat Apr 19 19:51:08 2003
+++ linux-2.5.68-fix_irq_affinity/include/asm-i386/mach-bigsmp/mach_apic.h	Tue May  6 22:28:37 2003
@@ -27,7 +27,7 @@
 #define APIC_BROADCAST_ID     (0x0f)
 #define check_apicid_used(bitmap, apicid) (0)
 #define check_apicid_present(bit) (phys_cpu_present_map & (1 << bit))
-
+#define apicid_cluster(apicid) (apicid & 0xF0)
 static inline unsigned long calculate_ldr(unsigned long old)
 {
 	unsigned long id;
@@ -115,4 +115,37 @@
 	return (1);
 }
 
+static inline unsigned int cpu_mask_to_apicid (unsigned long cpumask)
+{
+	int num_bits_set;
+	int cpus_found = 0;
+	int cpu;
+	int apicid;	
+
+	num_bits_set = hweight32(cpumask); 
+	/* Return id to all */
+	if (num_bits_set == 32)
+		return (int) 0xFF;
+	/* 
+	 * The cpus in the mask must all be on the apic cluster.  If are not 
+	 * on the same apicid cluster return default value of TARGET_CPUS. 
+	 */
+	cpu = ffs(cpumask)-1;
+	apicid = cpu_to_logical_apicid(cpu);
+	while (cpus_found < num_bits_set) {
+		if (cpumask & (1 << cpu)) {
+			int new_apicid = cpu_to_logical_apicid(cpu);
+			if (apicid_cluster(apicid) != 
+					apicid_cluster(new_apicid)){
+				printk ("%s: Not a valid mask!\n",__FUNCTION__);
+				return TARGET_CPUS;
+			}
+			apicid = apicid | new_apicid;
+			cpus_found++;
+		}
+		cpu++;
+	}
+	return apicid;
+}
+
 #endif /* __ASM_MACH_APIC_H */
diff -urN linux-2.5.68/include/asm-i386/mach-default/mach_apic.h linux-2.5.68-fix_irq_affinity/include/asm-i386/mach-default/mach_apic.h
--- linux-2.5.68/include/asm-i386/mach-default/mach_apic.h	Sat Apr 19 19:51:19 2003
+++ linux-2.5.68-fix_irq_affinity/include/asm-i386/mach-default/mach_apic.h	Tue May  6 22:14:57 2003
@@ -99,4 +99,9 @@
 	return test_bit(boot_cpu_physical_apicid, &phys_cpu_present_map);
 }
 
+static inline unsigned int cpu_mask_to_apicid (unsigned long cpumask)
+{
+	return cpumask;
+}
+
 #endif /* __ASM_MACH_APIC_H */
diff -urN linux-2.5.68/include/asm-i386/mach-numaq/mach_apic.h linux-2.5.68-fix_irq_affinity/include/asm-i386/mach-numaq/mach_apic.h
--- linux-2.5.68/include/asm-i386/mach-numaq/mach_apic.h	Sat Apr 19 19:49:17 2003
+++ linux-2.5.68-fix_irq_affinity/include/asm-i386/mach-numaq/mach_apic.h	Tue May  6 22:51:02 2003
@@ -14,6 +14,7 @@
 #define APIC_BROADCAST_ID      0x0F
 #define check_apicid_used(bitmap, apicid) ((bitmap) & (1 << (apicid)))
 #define check_apicid_present(bit) (phys_cpu_present_map & (1 << bit))
+#define apicid_cluster(apicid) (apicid & 0xF0)
 
 static inline int apic_id_registered(void)
 {
@@ -103,4 +104,37 @@
 	return (1);
 }
 
+static inline unsigned int cpu_mask_to_apicid (unsigned long cpumask)
+{
+	int num_bits_set;
+	int cpus_found = 0;
+	int cpu;
+	int apicid;	
+
+	num_bits_set = hweight32(cpumask); 
+	/* Return id to all */
+	if (num_bits_set == 32)
+		return (int) 0xFF;
+	/* 
+	 * The cpus in the mask must all be on the apic cluster.  If are not 
+	 * on the same apicid cluster return default value of TARGET_CPUS. 
+	 */
+	cpu = ffs(cpumask)-1;
+	apicid = cpu_to_logical_apicid(cpu);
+	while (cpus_found < num_bits_set) {
+		if (cpumask & (1 << cpu)) {
+			int new_apicid = cpu_to_logical_apicid(cpu);
+			if (apicid_cluster(apicid) != 
+					apicid_cluster(new_apicid)){
+				printk ("%s: Not a valid mask!\n",__FUNCTION__);
+				return TARGET_CPUS;
+			}
+			apicid = apicid | new_apicid;
+			cpus_found++;
+		}
+		cpu++;
+	}
+	return apicid;
+}
+
 #endif /* __ASM_MACH_APIC_H */
diff -urN linux-2.5.68/include/asm-i386/mach-summit/mach_apic.h linux-2.5.68-fix_irq_affinity/include/asm-i386/mach-summit/mach_apic.h
--- linux-2.5.68/include/asm-i386/mach-summit/mach_apic.h	Sat Apr 19 19:50:06 2003
+++ linux-2.5.68-fix_irq_affinity/include/asm-i386/mach-summit/mach_apic.h	Tue May  6 22:12:04 2003
@@ -23,7 +23,7 @@
 
 /* we don't use the phys_cpu_present_map to indicate apicid presence */
 #define check_apicid_present(bit) (x86_summit ? 1 : (phys_cpu_present_map & (1 << bit))) 
-
+#define apicid_cluster(apicid) (apicid & 0xF0)
 extern u8 bios_cpu_apicid[];
 
 static inline void init_apic_ldr(void)
@@ -113,4 +113,37 @@
 		return test_bit(boot_cpu_physical_apicid, &phys_cpu_present_map);
 }
 
+static inline unsigned int cpu_mask_to_apicid (unsigned long cpumask)
+{
+	int num_bits_set;
+	int cpus_found = 0;
+	int cpu;
+	int apicid;	
+
+	num_bits_set = hweight32(cpumask); 
+	/* Return id to all */
+	if (num_bits_set == 32)
+		return (int) 0xFF;
+	/* 
+	 * The cpus in the mask must all be on the apic cluster.  If are not 
+	 * on the same apicid cluster return default value of TARGET_CPUS. 
+	 */
+	cpu = ffs(cpumask)-1;
+	apicid = cpu_to_logical_apicid(cpu);
+	while (cpus_found < num_bits_set) {
+		if (cpumask & (1 << cpu)) {
+			int new_apicid = cpu_to_logical_apicid(cpu);
+			if (apicid_cluster(apicid) != 
+					apicid_cluster(new_apicid)){
+				printk ("%s: Not a valid mask!\n",__FUNCTION__);
+				return TARGET_CPUS;
+			}
+			apicid = apicid | new_apicid;
+			cpus_found++;
+		}
+		cpu++;
+	}
+	return apicid;
+}
+
 #endif /* __ASM_MACH_APIC_H */
diff -urN linux-2.5.68/include/asm-i386/mach-visws/mach_apic.h linux-2.5.68-fix_irq_affinity/include/asm-i386/mach-visws/mach_apic.h
--- linux-2.5.68/include/asm-i386/mach-visws/mach_apic.h	Sat Apr 19 19:48:49 2003
+++ linux-2.5.68-fix_irq_affinity/include/asm-i386/mach-visws/mach_apic.h	Tue May  6 22:16:52 2003
@@ -77,4 +77,8 @@
 	return test_bit(boot_cpu_physical_apicid, &phys_cpu_present_map);
 }
 
+static inline unsigned int cpu_mask_to_apicid (unsigned long cpumask)
+{
+	return cpumask;
+}
 #endif /* __ASM_MACH_APIC_H */



^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC][PATCH] fix for clusterd io_apics
  2003-05-06  0:04       ` [RFC][PATCH] fix for clusterd io_apics Keith Mannthey
@ 2003-05-06 15:15         ` Martin J. Bligh
  0 siblings, 0 replies; 10+ messages in thread
From: Martin J. Bligh @ 2003-05-06 15:15 UTC (permalink / raw)
  To: Keith Mannthey, lkml; +Cc: Andrew Morton, James Cleverdon

>   The following is a patch to fix inconsistent use of the function
> set_ioapic_affinity. In the current kernel it is unclear as to weather
> the value being passed to the function is a cpu mask or valid apic id. 
> In irq_affinity_write_proc the kernel passes on a cpu mask but the kirqd
> thread passes on logical apic ids.  In flat apic mode this is not an
> issue because a cpu mask represents the apic value.  However in
> clustered apic mode the cpu mask is very different from the logical apic
> id.  
>   This is an attempt to do the right thing for clustered apics.  I
> clarify that the value being passed to set_ioapic_affinity is a cpu mask
> not a apicid.  Set_ioapic_affinity will do the conversion to logical
> apic ids.  Since many cpu masks don't map to valid apicids in clustered
> apic mode TARGET_CPUS is used as a default value when such a situation
> occurs.  I think this is a good step in making irq_affinity clustered
> apic safe.

Thanks Keith, looks great. The existing code in the main kernel is just
broken (and confusing as hell with it's mixed use of "mask").

M.


^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2003-05-06 15:03 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2003-04-30 23:07 [RFC] clustered apic irq affinity fix for i386 Keith Mannthey
2003-04-30 23:36 ` Andrew Morton
2003-05-01  1:05   ` Keith Mannthey
2003-05-01  2:22     ` Andrew Morton
2003-05-01  2:49       ` William Lee Irwin III
2003-05-01  4:07       ` Martin J. Bligh
2003-05-01 17:51       ` Keith Mannthey
2003-05-06  0:04       ` [RFC][PATCH] fix for clusterd io_apics Keith Mannthey
2003-05-06 15:15         ` Martin J. Bligh
2003-05-01  9:10   ` [RFC] clustered apic irq affinity fix for i386 Arjan van de Ven

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).