All of lore.kernel.org
 help / color / mirror / Atom feed
* [patch 0/6] x86: cleanups and fixes for irq migration code during cpu offline
@ 2009-10-26 22:24 Suresh Siddha
  2009-10-26 22:24 ` [patch 1/6] x86: unify fixup_irqs() for 32-bit and 64-bit kernels Suresh Siddha
                   ` (6 more replies)
  0 siblings, 7 replies; 25+ messages in thread
From: Suresh Siddha @ 2009-10-26 22:24 UTC (permalink / raw)
  To: hpa, mingo, tglx, ebiederm; +Cc: garyhade, suresh.b.siddha, linux-kernel

First four patches in the series unify the fixup_irqs() along with
couple of cleanups. It also fixes an issue where the interrupt subsystem can 
point the interrupt to the offlined cpu (for non interrupt-remapping case)
causing the device to not work. This was observed by Gary.

These four patches are ready for inclusion and while there was a debate
in the past that fixup_irqs() is kind of broken (because of migrating
interrupts in the process context for non intr-remapping platforms)
we think that these patches enhance the existing code and not introduce any
more new races.

The last two patches in the series are titled as RFC. These patches
address removing the local_irq_enable()/local_irq_disable() in
the fixup_irqs(). All the patches (including these last two RFC patches) are
tested on Intel platforms (by myself) and on AMD platforms(by Gary).

We labeled the last two patches as RFC mainly because they need more testing and
thoughts from Eric and others who did some unsuccessful experiments in this
area (migrating irq in the process context for non intr-remapping platforms)
before.

thanks,
suresh


^ permalink raw reply	[flat|nested] 25+ messages in thread

* [patch 1/6] x86: unify fixup_irqs() for 32-bit and 64-bit kernels
  2009-10-26 22:24 [patch 0/6] x86: cleanups and fixes for irq migration code during cpu offline Suresh Siddha
@ 2009-10-26 22:24 ` Suresh Siddha
  2009-11-02 16:16   ` [tip:x86/apic] x86: Unify " tip-bot for Suresh Siddha
  2009-10-26 22:24 ` [patch 2/6] x86, intr-remap: Avoid irq_chip mask/unmask in fixup_irqs() for intr-remapping Suresh Siddha
                   ` (5 subsequent siblings)
  6 siblings, 1 reply; 25+ messages in thread
From: Suresh Siddha @ 2009-10-26 22:24 UTC (permalink / raw)
  To: hpa, mingo, tglx, ebiederm; +Cc: garyhade, suresh.b.siddha, linux-kernel

[-- Attachment #1: merge_fixup_irqs.patch --]
[-- Type: text/plain, Size: 5187 bytes --]

There is no reason to have different fixup_irqs() for 32-bit and 64-bit kernels.
Unify by using the superior 64bit version for both the kernels.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Gary Hade <garyhade@us.ibm.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
---
 arch/x86/kernel/irq.c    |   59 +++++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/irq_32.c |   45 -----------------------------------
 arch/x86/kernel/irq_64.c |   58 ----------------------------------------------
 3 files changed, 59 insertions(+), 103 deletions(-)

Index: tree/arch/x86/kernel/irq.c
===================================================================
--- tree.orig/arch/x86/kernel/irq.c
+++ tree/arch/x86/kernel/irq.c
@@ -274,3 +274,62 @@ void smp_generic_interrupt(struct pt_reg
 }
 
 EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
+
+#ifdef CONFIG_HOTPLUG_CPU
+/* A cpu has been removed from cpu_online_mask.  Reset irq affinities. */
+void fixup_irqs(void)
+{
+	unsigned int irq;
+	static int warned;
+	struct irq_desc *desc;
+
+	for_each_irq_desc(irq, desc) {
+		int break_affinity = 0;
+		int set_affinity = 1;
+		const struct cpumask *affinity;
+
+		if (!desc)
+			continue;
+		if (irq == 2)
+			continue;
+
+		/* interrupt's are disabled at this point */
+		spin_lock(&desc->lock);
+
+		affinity = desc->affinity;
+		if (!irq_has_action(irq) ||
+		    cpumask_equal(affinity, cpu_online_mask)) {
+			spin_unlock(&desc->lock);
+			continue;
+		}
+
+		if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
+			break_affinity = 1;
+			affinity = cpu_all_mask;
+		}
+
+		if (desc->chip->mask)
+			desc->chip->mask(irq);
+
+		if (desc->chip->set_affinity)
+			desc->chip->set_affinity(irq, affinity);
+		else if (!(warned++))
+			set_affinity = 0;
+
+		if (desc->chip->unmask)
+			desc->chip->unmask(irq);
+
+		spin_unlock(&desc->lock);
+
+		if (break_affinity && set_affinity)
+			printk("Broke affinity for irq %i\n", irq);
+		else if (!set_affinity)
+			printk("Cannot set affinity for irq %i\n", irq);
+	}
+
+	/* That doesn't seem sufficient.  Give it 1ms. */
+	local_irq_enable();
+	mdelay(1);
+	local_irq_disable();
+}
+#endif
Index: tree/arch/x86/kernel/irq_32.c
===================================================================
--- tree.orig/arch/x86/kernel/irq_32.c
+++ tree/arch/x86/kernel/irq_32.c
@@ -211,48 +211,3 @@ bool handle_irq(unsigned irq, struct pt_
 
 	return true;
 }
-
-#ifdef CONFIG_HOTPLUG_CPU
-
-/* A cpu has been removed from cpu_online_mask.  Reset irq affinities. */
-void fixup_irqs(void)
-{
-	unsigned int irq;
-	struct irq_desc *desc;
-
-	for_each_irq_desc(irq, desc) {
-		const struct cpumask *affinity;
-
-		if (!desc)
-			continue;
-		if (irq == 2)
-			continue;
-
-		affinity = desc->affinity;
-		if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
-			printk("Breaking affinity for irq %i\n", irq);
-			affinity = cpu_all_mask;
-		}
-		if (desc->chip->set_affinity)
-			desc->chip->set_affinity(irq, affinity);
-		else if (desc->action)
-			printk_once("Cannot set affinity for irq %i\n", irq);
-	}
-
-#if 0
-	barrier();
-	/* Ingo Molnar says: "after the IO-APIC masks have been redirected
-	   [note the nop - the interrupt-enable boundary on x86 is two
-	   instructions from sti] - to flush out pending hardirqs and
-	   IPIs. After this point nothing is supposed to reach this CPU." */
-	__asm__ __volatile__("sti; nop; cli");
-	barrier();
-#else
-	/* That doesn't seem sufficient.  Give it 1ms. */
-	local_irq_enable();
-	mdelay(1);
-	local_irq_disable();
-#endif
-}
-#endif
-
Index: tree/arch/x86/kernel/irq_64.c
===================================================================
--- tree.orig/arch/x86/kernel/irq_64.c
+++ tree/arch/x86/kernel/irq_64.c
@@ -62,64 +62,6 @@ bool handle_irq(unsigned irq, struct pt_
 	return true;
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
-/* A cpu has been removed from cpu_online_mask.  Reset irq affinities. */
-void fixup_irqs(void)
-{
-	unsigned int irq;
-	static int warned;
-	struct irq_desc *desc;
-
-	for_each_irq_desc(irq, desc) {
-		int break_affinity = 0;
-		int set_affinity = 1;
-		const struct cpumask *affinity;
-
-		if (!desc)
-			continue;
-		if (irq == 2)
-			continue;
-
-		/* interrupt's are disabled at this point */
-		spin_lock(&desc->lock);
-
-		affinity = desc->affinity;
-		if (!irq_has_action(irq) ||
-		    cpumask_equal(affinity, cpu_online_mask)) {
-			spin_unlock(&desc->lock);
-			continue;
-		}
-
-		if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
-			break_affinity = 1;
-			affinity = cpu_all_mask;
-		}
-
-		if (desc->chip->mask)
-			desc->chip->mask(irq);
-
-		if (desc->chip->set_affinity)
-			desc->chip->set_affinity(irq, affinity);
-		else if (!(warned++))
-			set_affinity = 0;
-
-		if (desc->chip->unmask)
-			desc->chip->unmask(irq);
-
-		spin_unlock(&desc->lock);
-
-		if (break_affinity && set_affinity)
-			printk("Broke affinity for irq %i\n", irq);
-		else if (!set_affinity)
-			printk("Cannot set affinity for irq %i\n", irq);
-	}
-
-	/* That doesn't seem sufficient.  Give it 1ms. */
-	local_irq_enable();
-	mdelay(1);
-	local_irq_disable();
-}
-#endif
 
 extern void call_softirq(void);
 



^ permalink raw reply	[flat|nested] 25+ messages in thread

* [patch 2/6] x86, intr-remap: Avoid irq_chip mask/unmask in fixup_irqs() for intr-remapping
  2009-10-26 22:24 [patch 0/6] x86: cleanups and fixes for irq migration code during cpu offline Suresh Siddha
  2009-10-26 22:24 ` [patch 1/6] x86: unify fixup_irqs() for 32-bit and 64-bit kernels Suresh Siddha
@ 2009-10-26 22:24 ` Suresh Siddha
  2009-11-02 16:16   ` [tip:x86/apic] " tip-bot for Suresh Siddha
  2009-10-26 22:24 ` [patch 3/6] x86: remove move_cleanup_count from irq_cfg Suresh Siddha
                   ` (4 subsequent siblings)
  6 siblings, 1 reply; 25+ messages in thread
From: Suresh Siddha @ 2009-10-26 22:24 UTC (permalink / raw)
  To: hpa, mingo, tglx, ebiederm; +Cc: garyhade, suresh.b.siddha, linux-kernel

[-- Attachment #1: fix_fixup_irqs_intr_remap.patch --]
[-- Type: text/plain, Size: 1409 bytes --]

In the presence of interrupt-remapping, irqs will be migrated in the
process context and we don't do (and there is no need to) irq_chip mask/unmask
while migrating the interrupt.

Similarly fix the fixup_irqs() that get called during cpu offline and avoid
calling irq_chip mask/unmask for irqs that are ok to be migrated in the
process context.

While we didn't observe any race condition with the existing code,
this change takes complete advantage of interrupt-remapping in
the newer generation platforms and avoids any potential HW lockup's
(that often worry Eric :)

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Acked-by: Eric W. Biederman <ebiederm@xmission.com>
---
 arch/x86/kernel/irq.c |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

Index: tree/arch/x86/kernel/irq.c
===================================================================
--- tree.orig/arch/x86/kernel/irq.c
+++ tree/arch/x86/kernel/irq.c
@@ -308,7 +308,7 @@ void fixup_irqs(void)
 			affinity = cpu_all_mask;
 		}
 
-		if (desc->chip->mask)
+		if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->mask)
 			desc->chip->mask(irq);
 
 		if (desc->chip->set_affinity)
@@ -316,7 +316,7 @@ void fixup_irqs(void)
 		else if (!(warned++))
 			set_affinity = 0;
 
-		if (desc->chip->unmask)
+		if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->unmask)
 			desc->chip->unmask(irq);
 
 		spin_unlock(&desc->lock);



^ permalink raw reply	[flat|nested] 25+ messages in thread

* [patch 3/6] x86: remove move_cleanup_count from irq_cfg
  2009-10-26 22:24 [patch 0/6] x86: cleanups and fixes for irq migration code during cpu offline Suresh Siddha
  2009-10-26 22:24 ` [patch 1/6] x86: unify fixup_irqs() for 32-bit and 64-bit kernels Suresh Siddha
  2009-10-26 22:24 ` [patch 2/6] x86, intr-remap: Avoid irq_chip mask/unmask in fixup_irqs() for intr-remapping Suresh Siddha
@ 2009-10-26 22:24 ` Suresh Siddha
  2009-11-02 16:17   ` [tip:x86/apic] x86: Remove " tip-bot for Suresh Siddha
  2009-10-26 22:24 ` [patch 4/6] x86: force irq complete move during cpu offline Suresh Siddha
                   ` (3 subsequent siblings)
  6 siblings, 1 reply; 25+ messages in thread
From: Suresh Siddha @ 2009-10-26 22:24 UTC (permalink / raw)
  To: hpa, mingo, tglx, ebiederm; +Cc: garyhade, suresh.b.siddha, linux-kernel

[-- Attachment #1: remove_move_cleanup_count.patch --]
[-- Type: text/plain, Size: 2935 bytes --]

move_cleanup_count for each irq in irq_cfg is keeping track of the total number
of cpus that need to free the corresponding vectors associated with the irq
which has now been migrated to new destination. As long as this
move_cleanup_count is non-zero (i.e., as long as we have n't freed the vector
allocations on the old destinations) we were preventing the irq's further
migration.

This cleanup count is unnecessary and it is enough to not allow the irq
migration till we send the cleanup vector to the previous irq destination,
for which we already have irq_cfg's move_in_progress.  All we need to make
sure is that we free the vector at the old desintation but we don't need
to wait till that gets freed.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Gary Hade <garyhade@us.ibm.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
---
 arch/x86/include/asm/hw_irq.h  |    1 -
 arch/x86/kernel/apic/io_apic.c |    9 +--------
 2 files changed, 1 insertion(+), 9 deletions(-)

Index: tip/arch/x86/kernel/apic/io_apic.c
===================================================================
--- tip.orig/arch/x86/kernel/apic/io_apic.c
+++ tip/arch/x86/kernel/apic/io_apic.c
@@ -1161,7 +1161,7 @@ __assign_irq_vector(int irq, struct irq_
 	int cpu, err;
 	cpumask_var_t tmp_mask;
 
-	if ((cfg->move_in_progress) || cfg->move_cleanup_count)
+	if (cfg->move_in_progress)
 		return -EBUSY;
 
 	if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
@@ -2234,14 +2234,10 @@ void send_cleanup_vector(struct irq_cfg 
 
 	if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
 		unsigned int i;
-		cfg->move_cleanup_count = 0;
-		for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
-			cfg->move_cleanup_count++;
 		for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
 			apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
 	} else {
 		cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
-		cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
 		apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
 		free_cpumask_var(cleanup_mask);
 	}
@@ -2430,8 +2426,6 @@ asmlinkage void smp_irq_move_cleanup_int
 
 		cfg = irq_cfg(irq);
 		spin_lock(&desc->lock);
-		if (!cfg->move_cleanup_count)
-			goto unlock;
 
 		if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
 			goto unlock;
@@ -2449,7 +2443,6 @@ asmlinkage void smp_irq_move_cleanup_int
 			goto unlock;
 		}
 		__get_cpu_var(vector_irq)[vector] = -1;
-		cfg->move_cleanup_count--;
 unlock:
 		spin_unlock(&desc->lock);
 	}
Index: tip/arch/x86/include/asm/hw_irq.h
===================================================================
--- tip.orig/arch/x86/include/asm/hw_irq.h
+++ tip/arch/x86/include/asm/hw_irq.h
@@ -94,7 +94,6 @@ struct irq_cfg {
 	struct irq_pin_list	*irq_2_pin;
 	cpumask_var_t		domain;
 	cpumask_var_t		old_domain;
-	unsigned		move_cleanup_count;
 	u8			vector;
 	u8			move_in_progress : 1;
 };



^ permalink raw reply	[flat|nested] 25+ messages in thread

* [patch 4/6] x86: force irq complete move during cpu offline
  2009-10-26 22:24 [patch 0/6] x86: cleanups and fixes for irq migration code during cpu offline Suresh Siddha
                   ` (2 preceding siblings ...)
  2009-10-26 22:24 ` [patch 3/6] x86: remove move_cleanup_count from irq_cfg Suresh Siddha
@ 2009-10-26 22:24 ` Suresh Siddha
  2009-11-02 16:17   ` [tip:x86/apic] x86: Force " tip-bot for Suresh Siddha
  2009-10-26 22:24 ` [RFC patch 5/6] x86: Use EOI register in io-apic on intel platforms Suresh Siddha
                   ` (2 subsequent siblings)
  6 siblings, 1 reply; 25+ messages in thread
From: Suresh Siddha @ 2009-10-26 22:24 UTC (permalink / raw)
  To: hpa, mingo, tglx, ebiederm; +Cc: garyhade, suresh.b.siddha, linux-kernel

[-- Attachment #1: fix_assign_irq_vector_cpu_offline.patch --]
[-- Type: text/plain, Size: 3129 bytes --]

When a cpu goes offline, fixup_irqs() try to move irq's currently
destined to the offline cpu to a new cpu. But this attempt will fail
if the irq is recently moved to this cpu and the irq still hasn't
arrived at this cpu (for non intr-remapping platforms this is when we free the
vector allocation at the previous destination) that is about to go offline.

This will endup with the interrupt subsystem still pointing the irq to
the offline cpu, causing that irq to not work any more.

Fix this by forcing the irq to complete its move (its been a long time
we moved the irq to this cpu which we are offlining now) and then
move this irq to a new cpu before this cpu goes offline.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Gary Hade <garyhade@us.ibm.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
---
 arch/x86/include/asm/irq.h     |    1 +
 arch/x86/kernel/apic/io_apic.c |   18 +++++++++++++++---
 arch/x86/kernel/irq.c          |    7 +++++++
 3 files changed, 23 insertions(+), 3 deletions(-)

Index: tip/arch/x86/kernel/apic/io_apic.c
===================================================================
--- tip.orig/arch/x86/kernel/apic/io_apic.c
+++ tip/arch/x86/kernel/apic/io_apic.c
@@ -2450,21 +2450,33 @@ unlock:
 	irq_exit();
 }
 
-static void irq_complete_move(struct irq_desc **descp)
+static void __irq_complete_move(struct irq_desc **descp, unsigned vector)
 {
 	struct irq_desc *desc = *descp;
 	struct irq_cfg *cfg = desc->chip_data;
-	unsigned vector, me;
+	unsigned me;
 
 	if (likely(!cfg->move_in_progress))
 		return;
 
-	vector = ~get_irq_regs()->orig_ax;
 	me = smp_processor_id();
 
 	if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
 		send_cleanup_vector(cfg);
 }
+
+static void irq_complete_move(struct irq_desc **descp)
+{
+	__irq_complete_move(descp, ~get_irq_regs()->orig_ax);
+}
+
+void irq_force_complete_move(int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+	struct irq_cfg *cfg = desc->chip_data;
+
+	__irq_complete_move(&desc, cfg->vector);
+}
 #else
 static inline void irq_complete_move(struct irq_desc **descp) {}
 #endif
Index: tip/arch/x86/kernel/irq.c
===================================================================
--- tip.orig/arch/x86/kernel/irq.c
+++ tip/arch/x86/kernel/irq.c
@@ -303,6 +303,13 @@ void fixup_irqs(void)
 			continue;
 		}
 
+		/*
+		 * Complete the irq move. This cpu is going down and for
+		 * non intr-remapping case, we can't wait till this interrupt
+		 * arrives at this cpu before completing the irq move.
+		 */
+		irq_force_complete_move(irq);
+
 		if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
 			break_affinity = 1;
 			affinity = cpu_all_mask;
Index: tip/arch/x86/include/asm/irq.h
===================================================================
--- tip.orig/arch/x86/include/asm/irq.h
+++ tip/arch/x86/include/asm/irq.h
@@ -34,6 +34,7 @@ static inline int irq_canonicalize(int i
 #ifdef CONFIG_HOTPLUG_CPU
 #include <linux/cpumask.h>
 extern void fixup_irqs(void);
+extern void irq_force_complete_move(int);
 #endif
 
 extern void (*x86_platform_ipi_callback)(void);



^ permalink raw reply	[flat|nested] 25+ messages in thread

* [RFC patch 5/6] x86: Use EOI register in io-apic on intel platforms
  2009-10-26 22:24 [patch 0/6] x86: cleanups and fixes for irq migration code during cpu offline Suresh Siddha
                   ` (3 preceding siblings ...)
  2009-10-26 22:24 ` [patch 4/6] x86: force irq complete move during cpu offline Suresh Siddha
@ 2009-10-26 22:24 ` Suresh Siddha
  2009-11-02 16:17   ` [tip:x86/apic] " tip-bot for Suresh Siddha
  2009-10-26 22:24 ` [RFC patch 6/6] x86: remove local_irq_enable()/local_irq_disable() in fixup_irqs() Suresh Siddha
  2009-10-30 19:25 ` [patch 0/6] x86: cleanups and fixes for irq migration code during cpu offline Suresh Siddha
  6 siblings, 1 reply; 25+ messages in thread
From: Suresh Siddha @ 2009-10-26 22:24 UTC (permalink / raw)
  To: hpa, mingo, tglx, ebiederm; +Cc: garyhade, suresh.b.siddha, linux-kernel

[-- Attachment #1: use_eoi_for_intel_platforms.patch --]
[-- Type: text/plain, Size: 2916 bytes --]

IO-APIC's in intel chipsets support EOI register starting from IO-APIC version
2. Use that when ever we need to clear the IO-APIC RTE's RemoteIRR bit
explicitly.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Gary Hade <garyhade@us.ibm.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
---
 arch/x86/kernel/apic/io_apic.c |   80 +++++++++++++++++++++++++++--------------
 1 file changed, 53 insertions(+), 27 deletions(-)

Index: tip/arch/x86/kernel/apic/io_apic.c
===================================================================
--- tip.orig/arch/x86/kernel/apic/io_apic.c
+++ tip/arch/x86/kernel/apic/io_apic.c
@@ -2492,6 +2492,50 @@ static void ack_apic_edge(unsigned int i
 
 atomic_t irq_mis_count;
 
+static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
+{
+	struct irq_pin_list *entry;
+
+	for_each_irq_pin(entry, cfg->irq_2_pin)
+		if (irq_remapped(irq))
+			io_apic_eoi(entry->apic, entry->pin);
+		else
+			io_apic_eoi(entry->apic, cfg->vector);
+}
+
+static void
+eoi_ioapic_irq(struct irq_desc *desc)
+{
+	struct irq_cfg *cfg;
+	unsigned long flags;
+	unsigned int irq;
+
+	irq = desc->irq;
+	cfg = desc->chip_data;
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	__eoi_ioapic_irq(irq, cfg);
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+static int use_eoi_reg;
+static int ioapic_supports_eoi(void)
+{
+	struct pci_dev *root;
+
+	root = pci_get_bus_and_slot(0, PCI_DEVFN(0, 0));
+	if (root && root->vendor == PCI_VENDOR_ID_INTEL &&
+	    mp_ioapics[0].apicver >= 0x2) {
+		use_eoi_reg = 1;
+		printk("IO-APIC supports EOI register\n");
+	} else
+		printk("IO-APIC doesn't support EOI\n");
+
+	return 0;
+}
+
+fs_initcall(ioapic_supports_eoi);
+
 static void ack_apic_level(unsigned int irq)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
@@ -2575,37 +2619,19 @@ static void ack_apic_level(unsigned int 
 	/* Tail end of version 0x11 I/O APIC bug workaround */
 	if (!(v & (1 << (i & 0x1f)))) {
 		atomic_inc(&irq_mis_count);
-		spin_lock(&ioapic_lock);
-		__mask_and_edge_IO_APIC_irq(cfg);
-		__unmask_and_level_IO_APIC_irq(cfg);
-		spin_unlock(&ioapic_lock);
+
+		if (use_eoi_reg)
+			eoi_ioapic_irq(desc);
+		else {
+			spin_lock(&ioapic_lock);
+			__mask_and_edge_IO_APIC_irq(cfg);
+			__unmask_and_level_IO_APIC_irq(cfg);
+			spin_unlock(&ioapic_lock);
+		}
 	}
 }
 
 #ifdef CONFIG_INTR_REMAP
-static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
-{
-	struct irq_pin_list *entry;
-
-	for_each_irq_pin(entry, cfg->irq_2_pin)
-		io_apic_eoi(entry->apic, entry->pin);
-}
-
-static void
-eoi_ioapic_irq(struct irq_desc *desc)
-{
-	struct irq_cfg *cfg;
-	unsigned long flags;
-	unsigned int irq;
-
-	irq = desc->irq;
-	cfg = desc->chip_data;
-
-	spin_lock_irqsave(&ioapic_lock, flags);
-	__eoi_ioapic_irq(irq, cfg);
-	spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
 static void ir_ack_apic_edge(unsigned int irq)
 {
 	ack_APIC_irq();



^ permalink raw reply	[flat|nested] 25+ messages in thread

* [RFC patch 6/6] x86: remove local_irq_enable()/local_irq_disable() in fixup_irqs()
  2009-10-26 22:24 [patch 0/6] x86: cleanups and fixes for irq migration code during cpu offline Suresh Siddha
                   ` (4 preceding siblings ...)
  2009-10-26 22:24 ` [RFC patch 5/6] x86: Use EOI register in io-apic on intel platforms Suresh Siddha
@ 2009-10-26 22:24 ` Suresh Siddha
  2009-11-02 16:17   ` [tip:x86/apic] x86: Remove " tip-bot for Suresh Siddha
  2009-10-30 19:25 ` [patch 0/6] x86: cleanups and fixes for irq migration code during cpu offline Suresh Siddha
  6 siblings, 1 reply; 25+ messages in thread
From: Suresh Siddha @ 2009-10-26 22:24 UTC (permalink / raw)
  To: hpa, mingo, tglx, ebiederm; +Cc: garyhade, suresh.b.siddha, linux-kernel

[-- Attachment #1: fix_irq_migration_during_cpu_offline.patch --]
[-- Type: text/plain, Size: 2874 bytes --]

To ensure that we handle all the pending interrupts (destined for this cpu that
is going down) in the interrupt subsystem before the cpu goes offline,
fixup_irqs() does:

	local_irq_enable();
	mdelay(1);
	local_irq_disable();

Enabling interrupts is not a good thing as this cpu is already offline.
So this patch replaces that logic with,

	mdelay(1);
	check APIC_IRR bits
	Retrigger the irq at the new destination if any interrupt has arrived
	via IPI.

For IO-APIC level triggered interrupts, this retrigger IPI will appear as an
edge interrupt. ack_apic_level() will detect this condition and IO-APIC RTE's
remoteIRR is cleared using directed EOI(using IO-APIC EOI register) on Intel
platforms and for others it uses the existing mask+edge logic followed by
unmask+level.

We can also remove mdelay() and then send spuriuous interrupts to
new cpu targets for all the irqs that were handled previously by this cpu
that is going offline. While it works, I have seen spurious interrupt messages
(nothing wrong but still annoying messages during cpu offline, which
can be seen during suspend/resume etc)

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Gary Hade <garyhade@us.ibm.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
---
 arch/x86/kernel/irq.c |   32 ++++++++++++++++++++++++++++----
 1 file changed, 28 insertions(+), 4 deletions(-)

Index: tip/arch/x86/kernel/irq.c
===================================================================
--- tip.orig/arch/x86/kernel/irq.c
+++ tip/arch/x86/kernel/irq.c
@@ -279,7 +279,7 @@ EXPORT_SYMBOL_GPL(vector_used_by_percpu_
 /* A cpu has been removed from cpu_online_mask.  Reset irq affinities. */
 void fixup_irqs(void)
 {
-	unsigned int irq;
+	unsigned int irq, vector;
 	static int warned;
 	struct irq_desc *desc;
 
@@ -334,9 +334,33 @@ void fixup_irqs(void)
 			printk("Cannot set affinity for irq %i\n", irq);
 	}
 
-	/* That doesn't seem sufficient.  Give it 1ms. */
-	local_irq_enable();
+	/*
+	 * We can remove mdelay() and then send spuriuous interrupts to
+	 * new cpu targets for all the irqs that were handled previously by
+	 * this cpu. While it works, I have seen spurious interrupt messages
+	 * (nothing wrong but still...).
+	 *
+	 * So for now, retain mdelay(1) and check the IRR and then send those
+	 * interrupts to new targets as this cpu is already offlined...
+	 */
 	mdelay(1);
-	local_irq_disable();
+
+	for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
+		unsigned int irr;
+
+		if (__get_cpu_var(vector_irq)[vector] < 0)
+			continue;
+
+		irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
+		if (irr  & (1 << (vector % 32))) {
+			irq = __get_cpu_var(vector_irq)[vector];
+
+			desc = irq_to_desc(irq);
+			spin_lock(&desc->lock);
+			if (desc->chip->retrigger)
+				desc->chip->retrigger(irq);
+			spin_unlock(&desc->lock);
+		}
+	}
 }
 #endif



^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch 0/6] x86: cleanups and fixes for irq migration code during cpu offline
  2009-10-26 22:24 [patch 0/6] x86: cleanups and fixes for irq migration code during cpu offline Suresh Siddha
                   ` (5 preceding siblings ...)
  2009-10-26 22:24 ` [RFC patch 6/6] x86: remove local_irq_enable()/local_irq_disable() in fixup_irqs() Suresh Siddha
@ 2009-10-30 19:25 ` Suresh Siddha
  2009-11-02 14:59   ` Ingo Molnar
  6 siblings, 1 reply; 25+ messages in thread
From: Suresh Siddha @ 2009-10-30 19:25 UTC (permalink / raw)
  To: Ingo Molnar, ebiederm; +Cc: hpa, tglx, ebiederm, garyhade, linux-kernel

On Mon, 2009-10-26 at 15:24 -0700, Siddha, Suresh B wrote:
> First four patches in the series unify the fixup_irqs() along with
> couple of cleanups. It also fixes an issue where the interrupt subsystem can 
> point the interrupt to the offlined cpu (for non interrupt-remapping case)
> causing the device to not work. This was observed by Gary.
> 
> These four patches are ready for inclusion

Ingo, if no one has any objections, can you please consider the first
four patches in this patchset for -tip testing?

> and while there was a debate
> in the past that fixup_irqs() is kind of broken (because of migrating
> interrupts in the process context for non intr-remapping platforms)
> we think that these patches enhance the existing code and not introduce any
> more new races.
> 
> The last two patches in the series are titled as RFC. 

Eric, any thoughts on these two RFC patches or the other 4 patches that
are ready to go.

> These patches
> address removing the local_irq_enable()/local_irq_disable() in
> the fixup_irqs(). All the patches (including these last two RFC patches) are
> tested on Intel platforms (by myself) and on AMD platforms(by Gary).
> 
> We labeled the last two patches as RFC mainly because they need more testing and
> thoughts from Eric and others who did some unsuccessful experiments in this
> area (migrating irq in the process context for non intr-remapping platforms)
> before.

thanks,
suresh


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [patch 0/6] x86: cleanups and fixes for irq migration code during cpu offline
  2009-10-30 19:25 ` [patch 0/6] x86: cleanups and fixes for irq migration code during cpu offline Suresh Siddha
@ 2009-11-02 14:59   ` Ingo Molnar
  2009-11-02 17:35     ` Gary Hade
  0 siblings, 1 reply; 25+ messages in thread
From: Ingo Molnar @ 2009-11-02 14:59 UTC (permalink / raw)
  To: Suresh Siddha; +Cc: ebiederm, hpa, tglx, garyhade, linux-kernel


* Suresh Siddha <suresh.b.siddha@intel.com> wrote:

> On Mon, 2009-10-26 at 15:24 -0700, Siddha, Suresh B wrote:
> > First four patches in the series unify the fixup_irqs() along with
> > couple of cleanups. It also fixes an issue where the interrupt subsystem can 
> > point the interrupt to the offlined cpu (for non interrupt-remapping case)
> > causing the device to not work. This was observed by Gary.
> > 
> > These four patches are ready for inclusion
> 
> Ingo, if no one has any objections, can you please consider the first 
> four patches in this patchset for -tip testing?

Yep, queued them up. I went for the whole batch as it looks good and 
needs wider testing as well.

Note, i fixed a few small details in patch #5.

Note #2, the signoffs from Gary looked weird - they were put last while 
the patches did not come from him. I changed those to Acked-by - i 
suppose that was the intention, right?

	Ingo

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [tip:x86/apic] x86: Unify fixup_irqs() for 32-bit and 64-bit kernels
  2009-10-26 22:24 ` [patch 1/6] x86: unify fixup_irqs() for 32-bit and 64-bit kernels Suresh Siddha
@ 2009-11-02 16:16   ` tip-bot for Suresh Siddha
  0 siblings, 0 replies; 25+ messages in thread
From: tip-bot for Suresh Siddha @ 2009-11-02 16:16 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, hpa, mingo, ebiederm, garyhade, suresh.b.siddha,
	tglx, mingo

Commit-ID:  7a7732bc0f7c46f217dbec723f25366b6285cc42
Gitweb:     http://git.kernel.org/tip/7a7732bc0f7c46f217dbec723f25366b6285cc42
Author:     Suresh Siddha <suresh.b.siddha@intel.com>
AuthorDate: Mon, 26 Oct 2009 14:24:31 -0800
Committer:  Ingo Molnar <mingo@elte.hu>
CommitDate: Mon, 2 Nov 2009 15:56:34 +0100

x86: Unify fixup_irqs() for 32-bit and 64-bit kernels

There is no reason to have different fixup_irqs() for 32-bit and
64-bit kernels. Unify by using the superior 64-bit version for
both the kernels.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Gary Hade <garyhade@us.ibm.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <20091026230001.562512739@sbs-t61.sc.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/irq.c    |   59 ++++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/irq_32.c |   45 -----------------------------------
 arch/x86/kernel/irq_64.c |   58 ---------------------------------------------
 3 files changed, 59 insertions(+), 103 deletions(-)

diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 3912061..3ea6655 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -276,3 +276,62 @@ void smp_generic_interrupt(struct pt_regs *regs)
 }
 
 EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
+
+#ifdef CONFIG_HOTPLUG_CPU
+/* A cpu has been removed from cpu_online_mask.  Reset irq affinities. */
+void fixup_irqs(void)
+{
+	unsigned int irq;
+	static int warned;
+	struct irq_desc *desc;
+
+	for_each_irq_desc(irq, desc) {
+		int break_affinity = 0;
+		int set_affinity = 1;
+		const struct cpumask *affinity;
+
+		if (!desc)
+			continue;
+		if (irq == 2)
+			continue;
+
+		/* interrupt's are disabled at this point */
+		spin_lock(&desc->lock);
+
+		affinity = desc->affinity;
+		if (!irq_has_action(irq) ||
+		    cpumask_equal(affinity, cpu_online_mask)) {
+			spin_unlock(&desc->lock);
+			continue;
+		}
+
+		if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
+			break_affinity = 1;
+			affinity = cpu_all_mask;
+		}
+
+		if (desc->chip->mask)
+			desc->chip->mask(irq);
+
+		if (desc->chip->set_affinity)
+			desc->chip->set_affinity(irq, affinity);
+		else if (!(warned++))
+			set_affinity = 0;
+
+		if (desc->chip->unmask)
+			desc->chip->unmask(irq);
+
+		spin_unlock(&desc->lock);
+
+		if (break_affinity && set_affinity)
+			printk("Broke affinity for irq %i\n", irq);
+		else if (!set_affinity)
+			printk("Cannot set affinity for irq %i\n", irq);
+	}
+
+	/* That doesn't seem sufficient.  Give it 1ms. */
+	local_irq_enable();
+	mdelay(1);
+	local_irq_disable();
+}
+#endif
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 7d35d0f..10709f2 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -211,48 +211,3 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
 
 	return true;
 }
-
-#ifdef CONFIG_HOTPLUG_CPU
-
-/* A cpu has been removed from cpu_online_mask.  Reset irq affinities. */
-void fixup_irqs(void)
-{
-	unsigned int irq;
-	struct irq_desc *desc;
-
-	for_each_irq_desc(irq, desc) {
-		const struct cpumask *affinity;
-
-		if (!desc)
-			continue;
-		if (irq == 2)
-			continue;
-
-		affinity = desc->affinity;
-		if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
-			printk("Breaking affinity for irq %i\n", irq);
-			affinity = cpu_all_mask;
-		}
-		if (desc->chip->set_affinity)
-			desc->chip->set_affinity(irq, affinity);
-		else if (desc->action)
-			printk_once("Cannot set affinity for irq %i\n", irq);
-	}
-
-#if 0
-	barrier();
-	/* Ingo Molnar says: "after the IO-APIC masks have been redirected
-	   [note the nop - the interrupt-enable boundary on x86 is two
-	   instructions from sti] - to flush out pending hardirqs and
-	   IPIs. After this point nothing is supposed to reach this CPU." */
-	__asm__ __volatile__("sti; nop; cli");
-	barrier();
-#else
-	/* That doesn't seem sufficient.  Give it 1ms. */
-	local_irq_enable();
-	mdelay(1);
-	local_irq_disable();
-#endif
-}
-#endif
-
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 977d8b4..acf8fbf 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -62,64 +62,6 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
 	return true;
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
-/* A cpu has been removed from cpu_online_mask.  Reset irq affinities. */
-void fixup_irqs(void)
-{
-	unsigned int irq;
-	static int warned;
-	struct irq_desc *desc;
-
-	for_each_irq_desc(irq, desc) {
-		int break_affinity = 0;
-		int set_affinity = 1;
-		const struct cpumask *affinity;
-
-		if (!desc)
-			continue;
-		if (irq == 2)
-			continue;
-
-		/* interrupt's are disabled at this point */
-		spin_lock(&desc->lock);
-
-		affinity = desc->affinity;
-		if (!irq_has_action(irq) ||
-		    cpumask_equal(affinity, cpu_online_mask)) {
-			spin_unlock(&desc->lock);
-			continue;
-		}
-
-		if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
-			break_affinity = 1;
-			affinity = cpu_all_mask;
-		}
-
-		if (desc->chip->mask)
-			desc->chip->mask(irq);
-
-		if (desc->chip->set_affinity)
-			desc->chip->set_affinity(irq, affinity);
-		else if (!(warned++))
-			set_affinity = 0;
-
-		if (desc->chip->unmask)
-			desc->chip->unmask(irq);
-
-		spin_unlock(&desc->lock);
-
-		if (break_affinity && set_affinity)
-			printk("Broke affinity for irq %i\n", irq);
-		else if (!set_affinity)
-			printk("Cannot set affinity for irq %i\n", irq);
-	}
-
-	/* That doesn't seem sufficient.  Give it 1ms. */
-	local_irq_enable();
-	mdelay(1);
-	local_irq_disable();
-}
-#endif
 
 extern void call_softirq(void);
 

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [tip:x86/apic] x86, intr-remap: Avoid irq_chip mask/unmask in fixup_irqs() for intr-remapping
  2009-10-26 22:24 ` [patch 2/6] x86, intr-remap: Avoid irq_chip mask/unmask in fixup_irqs() for intr-remapping Suresh Siddha
@ 2009-11-02 16:16   ` tip-bot for Suresh Siddha
  0 siblings, 0 replies; 25+ messages in thread
From: tip-bot for Suresh Siddha @ 2009-11-02 16:16 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, hpa, mingo, suresh.b.siddha, tglx, mingo, ebiederm

Commit-ID:  84e21493a3b28c9fefe99fe827fc0c0c101a813d
Gitweb:     http://git.kernel.org/tip/84e21493a3b28c9fefe99fe827fc0c0c101a813d
Author:     Suresh Siddha <suresh.b.siddha@intel.com>
AuthorDate: Mon, 26 Oct 2009 14:24:32 -0800
Committer:  Ingo Molnar <mingo@elte.hu>
CommitDate: Mon, 2 Nov 2009 15:56:35 +0100

x86, intr-remap: Avoid irq_chip mask/unmask in fixup_irqs() for intr-remapping

In the presence of interrupt-remapping, irqs will be migrated in
the process context and we don't do (and there is no need to)
irq_chip mask/unmask while migrating the interrupt.

Similarly fix the fixup_irqs() that get called during cpu
offline and avoid calling irq_chip mask/unmask for irqs that are
ok to be migrated in the process context.

While we didn't observe any race condition with the existing
code, this change takes complete advantage of
interrupt-remapping in the newer generation platforms and avoids
any potential HW lockup's (that often worry Eric :)

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Acked-by: Eric W. Biederman <ebiederm@xmission.com>
Cc: garyhade@us.ibm.com
LKML-Reference: <20091026230001.661423939@sbs-t61.sc.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/irq.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 3ea6655..342bcbc 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -310,7 +310,7 @@ void fixup_irqs(void)
 			affinity = cpu_all_mask;
 		}
 
-		if (desc->chip->mask)
+		if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->mask)
 			desc->chip->mask(irq);
 
 		if (desc->chip->set_affinity)
@@ -318,7 +318,7 @@ void fixup_irqs(void)
 		else if (!(warned++))
 			set_affinity = 0;
 
-		if (desc->chip->unmask)
+		if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->unmask)
 			desc->chip->unmask(irq);
 
 		spin_unlock(&desc->lock);

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [tip:x86/apic] x86: Remove move_cleanup_count from irq_cfg
  2009-10-26 22:24 ` [patch 3/6] x86: remove move_cleanup_count from irq_cfg Suresh Siddha
@ 2009-11-02 16:17   ` tip-bot for Suresh Siddha
  0 siblings, 0 replies; 25+ messages in thread
From: tip-bot for Suresh Siddha @ 2009-11-02 16:17 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, hpa, mingo, ebiederm, garyhade, suresh.b.siddha,
	tglx, mingo

Commit-ID:  23359a88e7eca3c4f402562b102f23014db3c2aa
Gitweb:     http://git.kernel.org/tip/23359a88e7eca3c4f402562b102f23014db3c2aa
Author:     Suresh Siddha <suresh.b.siddha@intel.com>
AuthorDate: Mon, 26 Oct 2009 14:24:33 -0800
Committer:  Ingo Molnar <mingo@elte.hu>
CommitDate: Mon, 2 Nov 2009 15:56:35 +0100

x86: Remove move_cleanup_count from irq_cfg

move_cleanup_count for each irq in irq_cfg is keeping track of
the total number of cpus that need to free the corresponding
vectors associated with the irq which has now been migrated to
new destination. As long as this move_cleanup_count is non-zero
(i.e., as long as we have n't freed the vector allocations on
the old destinations) we were preventing the irq's further
migration.

This cleanup count is unnecessary and it is enough to not allow
the irq migration till we send the cleanup vector to the
previous irq destination, for which we already have irq_cfg's
move_in_progress.  All we need to make sure is that we free the
vector at the old desintation but we don't need to wait till
that gets freed.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Acked-by: Gary Hade <garyhade@us.ibm.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <20091026230001.752968906@sbs-t61.sc.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/hw_irq.h  |    1 -
 arch/x86/kernel/apic/io_apic.c |    9 +--------
 2 files changed, 1 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 1984ce9..6e12426 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -94,7 +94,6 @@ struct irq_cfg {
 	struct irq_pin_list	*irq_2_pin;
 	cpumask_var_t		domain;
 	cpumask_var_t		old_domain;
-	unsigned		move_cleanup_count;
 	u8			vector;
 	u8			move_in_progress : 1;
 };
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index ce16b65..e9e5b02 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1161,7 +1161,7 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
 	int cpu, err;
 	cpumask_var_t tmp_mask;
 
-	if ((cfg->move_in_progress) || cfg->move_cleanup_count)
+	if (cfg->move_in_progress)
 		return -EBUSY;
 
 	if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
@@ -2234,14 +2234,10 @@ void send_cleanup_vector(struct irq_cfg *cfg)
 
 	if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
 		unsigned int i;
-		cfg->move_cleanup_count = 0;
-		for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
-			cfg->move_cleanup_count++;
 		for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
 			apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
 	} else {
 		cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
-		cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
 		apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
 		free_cpumask_var(cleanup_mask);
 	}
@@ -2430,8 +2426,6 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
 
 		cfg = irq_cfg(irq);
 		spin_lock(&desc->lock);
-		if (!cfg->move_cleanup_count)
-			goto unlock;
 
 		if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
 			goto unlock;
@@ -2449,7 +2443,6 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
 			goto unlock;
 		}
 		__get_cpu_var(vector_irq)[vector] = -1;
-		cfg->move_cleanup_count--;
 unlock:
 		spin_unlock(&desc->lock);
 	}

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [tip:x86/apic] x86: Force irq complete move during cpu offline
  2009-10-26 22:24 ` [patch 4/6] x86: force irq complete move during cpu offline Suresh Siddha
@ 2009-11-02 16:17   ` tip-bot for Suresh Siddha
  0 siblings, 0 replies; 25+ messages in thread
From: tip-bot for Suresh Siddha @ 2009-11-02 16:17 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, hpa, mingo, ebiederm, garyhade, suresh.b.siddha,
	tglx, mingo

Commit-ID:  a5e74b841930bec78a4684ab9f208b2ddfe7c736
Gitweb:     http://git.kernel.org/tip/a5e74b841930bec78a4684ab9f208b2ddfe7c736
Author:     Suresh Siddha <suresh.b.siddha@intel.com>
AuthorDate: Mon, 26 Oct 2009 14:24:34 -0800
Committer:  Ingo Molnar <mingo@elte.hu>
CommitDate: Mon, 2 Nov 2009 15:56:36 +0100

x86: Force irq complete move during cpu offline

When a cpu goes offline, fixup_irqs() try to move irq's
currently destined to the offline cpu to a new cpu. But this
attempt will fail if the irq is recently moved to this cpu and
the irq still hasn't arrived at this cpu (for non intr-remapping
platforms this is when we free the vector allocation at the
previous destination) that is about to go offline.

This will endup with the interrupt subsystem still pointing the
irq to the offline cpu, causing that irq to not work any more.

Fix this by forcing the irq to complete its move (its been a
long time we moved the irq to this cpu which we are offlining
now) and then move this irq to a new cpu before this cpu goes
offline.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Acked-by: Gary Hade <garyhade@us.ibm.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <20091026230001.848830905@sbs-t61.sc.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/irq.h     |    1 +
 arch/x86/kernel/apic/io_apic.c |   18 +++++++++++++++---
 arch/x86/kernel/irq.c          |    7 +++++++
 3 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index ddda6cb..ffd700f 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -34,6 +34,7 @@ static inline int irq_canonicalize(int irq)
 #ifdef CONFIG_HOTPLUG_CPU
 #include <linux/cpumask.h>
 extern void fixup_irqs(void);
+extern void irq_force_complete_move(int);
 #endif
 
 extern void (*generic_interrupt_extension)(void);
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index e9e5b02..4e886ef 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2450,21 +2450,33 @@ unlock:
 	irq_exit();
 }
 
-static void irq_complete_move(struct irq_desc **descp)
+static void __irq_complete_move(struct irq_desc **descp, unsigned vector)
 {
 	struct irq_desc *desc = *descp;
 	struct irq_cfg *cfg = desc->chip_data;
-	unsigned vector, me;
+	unsigned me;
 
 	if (likely(!cfg->move_in_progress))
 		return;
 
-	vector = ~get_irq_regs()->orig_ax;
 	me = smp_processor_id();
 
 	if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
 		send_cleanup_vector(cfg);
 }
+
+static void irq_complete_move(struct irq_desc **descp)
+{
+	__irq_complete_move(descp, ~get_irq_regs()->orig_ax);
+}
+
+void irq_force_complete_move(int irq)
+{
+	struct irq_desc *desc = irq_to_desc(irq);
+	struct irq_cfg *cfg = desc->chip_data;
+
+	__irq_complete_move(&desc, cfg->vector);
+}
 #else
 static inline void irq_complete_move(struct irq_desc **descp) {}
 #endif
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 342bcbc..b10a5e1 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -305,6 +305,13 @@ void fixup_irqs(void)
 			continue;
 		}
 
+		/*
+		 * Complete the irq move. This cpu is going down and for
+		 * non intr-remapping case, we can't wait till this interrupt
+		 * arrives at this cpu before completing the irq move.
+		 */
+		irq_force_complete_move(irq);
+
 		if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
 			break_affinity = 1;
 			affinity = cpu_all_mask;

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [tip:x86/apic] x86: Use EOI register in io-apic on intel platforms
  2009-10-26 22:24 ` [RFC patch 5/6] x86: Use EOI register in io-apic on intel platforms Suresh Siddha
@ 2009-11-02 16:17   ` tip-bot for Suresh Siddha
  2009-11-04  0:53     ` Maciej W. Rozycki
  0 siblings, 1 reply; 25+ messages in thread
From: tip-bot for Suresh Siddha @ 2009-11-02 16:17 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, hpa, mingo, ebiederm, garyhade, suresh.b.siddha,
	tglx, mingo

Commit-ID:  b3ec0a37a7907813bb4fb85a2d94102c152470b7
Gitweb:     http://git.kernel.org/tip/b3ec0a37a7907813bb4fb85a2d94102c152470b7
Author:     Suresh Siddha <suresh.b.siddha@intel.com>
AuthorDate: Mon, 26 Oct 2009 14:24:35 -0800
Committer:  Ingo Molnar <mingo@elte.hu>
CommitDate: Mon, 2 Nov 2009 15:56:36 +0100

x86: Use EOI register in io-apic on intel platforms

IO-APIC's in intel chipsets support EOI register starting from
IO-APIC version 2. Use that when ever we need to clear the
IO-APIC RTE's RemoteIRR bit explicitly.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Acked-by: Gary Hade <garyhade@us.ibm.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <20091026230001.947855317@sbs-t61.sc.intel.com>
[ Marked use_eio_reg as __read_mostly, fixed small details ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/io_apic.c |   81 ++++++++++++++++++++++++++-------------
 1 files changed, 54 insertions(+), 27 deletions(-)

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 4e886ef..31e9db3 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2492,6 +2492,51 @@ static void ack_apic_edge(unsigned int irq)
 
 atomic_t irq_mis_count;
 
+static int use_eoi_reg __read_mostly;
+
+static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
+{
+	struct irq_pin_list *entry;
+
+	for_each_irq_pin(entry, cfg->irq_2_pin) {
+		if (irq_remapped(irq))
+			io_apic_eoi(entry->apic, entry->pin);
+		else
+			io_apic_eoi(entry->apic, cfg->vector);
+	}
+}
+
+static void eoi_ioapic_irq(struct irq_desc *desc)
+{
+	struct irq_cfg *cfg;
+	unsigned long flags;
+	unsigned int irq;
+
+	irq = desc->irq;
+	cfg = desc->chip_data;
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	__eoi_ioapic_irq(irq, cfg);
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+static int ioapic_supports_eoi(void)
+{
+	struct pci_dev *root;
+
+	root = pci_get_bus_and_slot(0, PCI_DEVFN(0, 0));
+	if (root && root->vendor == PCI_VENDOR_ID_INTEL &&
+	    mp_ioapics[0].apicver >= 0x2) {
+		use_eoi_reg = 1;
+		printk(KERN_INFO "IO-APIC supports EOI register\n");
+	} else
+		printk(KERN_INFO "IO-APIC doesn't support EOI\n");
+
+	return 0;
+}
+
+fs_initcall(ioapic_supports_eoi);
+
 static void ack_apic_level(unsigned int irq)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
@@ -2575,37 +2620,19 @@ static void ack_apic_level(unsigned int irq)
 	/* Tail end of version 0x11 I/O APIC bug workaround */
 	if (!(v & (1 << (i & 0x1f)))) {
 		atomic_inc(&irq_mis_count);
-		spin_lock(&ioapic_lock);
-		__mask_and_edge_IO_APIC_irq(cfg);
-		__unmask_and_level_IO_APIC_irq(cfg);
-		spin_unlock(&ioapic_lock);
+
+		if (use_eoi_reg)
+			eoi_ioapic_irq(desc);
+		else {
+			spin_lock(&ioapic_lock);
+			__mask_and_edge_IO_APIC_irq(cfg);
+			__unmask_and_level_IO_APIC_irq(cfg);
+			spin_unlock(&ioapic_lock);
+		}
 	}
 }
 
 #ifdef CONFIG_INTR_REMAP
-static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
-{
-	struct irq_pin_list *entry;
-
-	for_each_irq_pin(entry, cfg->irq_2_pin)
-		io_apic_eoi(entry->apic, entry->pin);
-}
-
-static void
-eoi_ioapic_irq(struct irq_desc *desc)
-{
-	struct irq_cfg *cfg;
-	unsigned long flags;
-	unsigned int irq;
-
-	irq = desc->irq;
-	cfg = desc->chip_data;
-
-	spin_lock_irqsave(&ioapic_lock, flags);
-	__eoi_ioapic_irq(irq, cfg);
-	spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
 static void ir_ack_apic_edge(unsigned int irq)
 {
 	ack_APIC_irq();

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [tip:x86/apic] x86: Remove local_irq_enable()/local_irq_disable() in fixup_irqs()
  2009-10-26 22:24 ` [RFC patch 6/6] x86: remove local_irq_enable()/local_irq_disable() in fixup_irqs() Suresh Siddha
@ 2009-11-02 16:17   ` tip-bot for Suresh Siddha
  0 siblings, 0 replies; 25+ messages in thread
From: tip-bot for Suresh Siddha @ 2009-11-02 16:17 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, hpa, mingo, ebiederm, garyhade, suresh.b.siddha,
	tglx, mingo

Commit-ID:  5231a68614b94f60e8f6c56bc6e3d75955b9e75e
Gitweb:     http://git.kernel.org/tip/5231a68614b94f60e8f6c56bc6e3d75955b9e75e
Author:     Suresh Siddha <suresh.b.siddha@intel.com>
AuthorDate: Mon, 26 Oct 2009 14:24:36 -0800
Committer:  Ingo Molnar <mingo@elte.hu>
CommitDate: Mon, 2 Nov 2009 15:56:37 +0100

x86: Remove local_irq_enable()/local_irq_disable() in fixup_irqs()

To ensure that we handle all the pending interrupts (destined
for this cpu that is going down) in the interrupt subsystem
before the cpu goes offline, fixup_irqs() does:

	local_irq_enable();
	mdelay(1);
	local_irq_disable();

Enabling interrupts is not a good thing as this cpu is already
offline. So this patch replaces that logic with,

	mdelay(1);
	check APIC_IRR bits
	Retrigger the irq at the new destination if any interrupt has arrived
	via IPI.

For IO-APIC level triggered interrupts, this retrigger IPI will
appear as an edge interrupt. ack_apic_level() will detect this
condition and IO-APIC RTE's remoteIRR is cleared using directed
EOI(using IO-APIC EOI register) on Intel platforms and for
others it uses the existing mask+edge logic followed by
unmask+level.

We can also remove mdelay() and then send spuriuous interrupts
to new cpu targets for all the irqs that were handled previously
by this cpu that is going offline. While it works, I have seen
spurious interrupt messages (nothing wrong but still annoying
messages during cpu offline, which can be seen during
suspend/resume etc)

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Acked-by: Gary Hade <garyhade@us.ibm.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
LKML-Reference: <20091026230002.043281924@sbs-t61.sc.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/irq.c |   32 ++++++++++++++++++++++++++++----
 1 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index b10a5e1..8a82728 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -281,7 +281,7 @@ EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
 /* A cpu has been removed from cpu_online_mask.  Reset irq affinities. */
 void fixup_irqs(void)
 {
-	unsigned int irq;
+	unsigned int irq, vector;
 	static int warned;
 	struct irq_desc *desc;
 
@@ -336,9 +336,33 @@ void fixup_irqs(void)
 			printk("Cannot set affinity for irq %i\n", irq);
 	}
 
-	/* That doesn't seem sufficient.  Give it 1ms. */
-	local_irq_enable();
+	/*
+	 * We can remove mdelay() and then send spuriuous interrupts to
+	 * new cpu targets for all the irqs that were handled previously by
+	 * this cpu. While it works, I have seen spurious interrupt messages
+	 * (nothing wrong but still...).
+	 *
+	 * So for now, retain mdelay(1) and check the IRR and then send those
+	 * interrupts to new targets as this cpu is already offlined...
+	 */
 	mdelay(1);
-	local_irq_disable();
+
+	for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
+		unsigned int irr;
+
+		if (__get_cpu_var(vector_irq)[vector] < 0)
+			continue;
+
+		irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
+		if (irr  & (1 << (vector % 32))) {
+			irq = __get_cpu_var(vector_irq)[vector];
+
+			desc = irq_to_desc(irq);
+			spin_lock(&desc->lock);
+			if (desc->chip->retrigger)
+				desc->chip->retrigger(irq);
+			spin_unlock(&desc->lock);
+		}
+	}
 }
 #endif

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [patch 0/6] x86: cleanups and fixes for irq migration code during cpu offline
  2009-11-02 14:59   ` Ingo Molnar
@ 2009-11-02 17:35     ` Gary Hade
  0 siblings, 0 replies; 25+ messages in thread
From: Gary Hade @ 2009-11-02 17:35 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Suresh Siddha, ebiederm, hpa, tglx, garyhade, linux-kernel

On Mon, Nov 02, 2009 at 03:59:38PM +0100, Ingo Molnar wrote:
> 
> * Suresh Siddha <suresh.b.siddha@intel.com> wrote:
> 
> > On Mon, 2009-10-26 at 15:24 -0700, Siddha, Suresh B wrote:
> > > First four patches in the series unify the fixup_irqs() along with
> > > couple of cleanups. It also fixes an issue where the interrupt subsystem can 
> > > point the interrupt to the offlined cpu (for non interrupt-remapping case)
> > > causing the device to not work. This was observed by Gary.
> > > 
> > > These four patches are ready for inclusion
> > 
> > Ingo, if no one has any objections, can you please consider the first 
> > four patches in this patchset for -tip testing?
> 
> Yep, queued them up. I went for the whole batch as it looks good and 
> needs wider testing as well.
> 
> Note, i fixed a few small details in patch #5.
> 
> Note #2, the signoffs from Gary looked weird - they were put last while 
> the patches did not come from him. I changed those to Acked-by -

Thats fine with me.  I was involved to a minor extent
in the development of the patches but I think my biggest
contribution was testing.

BTW, looks like "[patch 1/6] x86: unify fixup_irqs() for 32-bit
and 64-bit kernels" got committed without this change.

Thanks for giving them a chance.

Gary

-- 
Gary Hade
System x Enablement
IBM Linux Technology Center
503-578-4503  IBM T/L: 775-4503
garyhade@us.ibm.com
http://www.ibm.com/linux/ltc


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [tip:x86/apic] x86: Use EOI register in io-apic on intel platforms
  2009-11-02 16:17   ` [tip:x86/apic] " tip-bot for Suresh Siddha
@ 2009-11-04  0:53     ` Maciej W. Rozycki
  2009-11-04  2:24       ` Suresh Siddha
  0 siblings, 1 reply; 25+ messages in thread
From: Maciej W. Rozycki @ 2009-11-04  0:53 UTC (permalink / raw)
  To: Ingo Molnar, hpa, linux-kernel, ebiederm, garyhade,
	suresh.b.siddha, tglx, mingo

On Mon, 2 Nov 2009, tip-bot for Suresh Siddha wrote:

> Commit-ID:  b3ec0a37a7907813bb4fb85a2d94102c152470b7
> Gitweb:     http://git.kernel.org/tip/b3ec0a37a7907813bb4fb85a2d94102c152470b7
> Author:     Suresh Siddha <suresh.b.siddha@intel.com>
> AuthorDate: Mon, 26 Oct 2009 14:24:35 -0800
> Committer:  Ingo Molnar <mingo@elte.hu>
> CommitDate: Mon, 2 Nov 2009 15:56:36 +0100
> 
> x86: Use EOI register in io-apic on intel platforms
> 
> IO-APIC's in intel chipsets support EOI register starting from
> IO-APIC version 2. Use that when ever we need to clear the
> IO-APIC RTE's RemoteIRR bit explicitly.
> 
> Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
> Acked-by: Gary Hade <garyhade@us.ibm.com>
> Cc: Eric W. Biederman <ebiederm@xmission.com>
> LKML-Reference: <20091026230001.947855317@sbs-t61.sc.intel.com>
> [ Marked use_eio_reg as __read_mostly, fixed small details ]
> Signed-off-by: Ingo Molnar <mingo@elte.hu>
> ---
[...]
> +static int ioapic_supports_eoi(void)
> +{
> +	struct pci_dev *root;
> +
> +	root = pci_get_bus_and_slot(0, PCI_DEVFN(0, 0));
> +	if (root && root->vendor == PCI_VENDOR_ID_INTEL &&
> +	    mp_ioapics[0].apicver >= 0x2) {
> +		use_eoi_reg = 1;
> +		printk(KERN_INFO "IO-APIC supports EOI register\n");
> +	} else
> +		printk(KERN_INFO "IO-APIC doesn't support EOI\n");
> +
> +	return 0;
> +}

 This is wrong -- the 82093AA I/O APIC has its version set to 0x11 and it 
does not support the EOI register.  Similarly I/O APICs integrated into 
the 82379AB south bridge and the 82374EB/SB EISA component.

 Overall values below 0x10 are reserved for the 82489DX -- are you sure 
you didn't mean 0x12 or 0x20?

  Maciej

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [tip:x86/apic] x86: Use EOI register in io-apic on intel platforms
  2009-11-04  0:53     ` Maciej W. Rozycki
@ 2009-11-04  2:24       ` Suresh Siddha
  2009-11-04 23:04         ` Suresh Siddha
  0 siblings, 1 reply; 25+ messages in thread
From: Suresh Siddha @ 2009-11-04  2:24 UTC (permalink / raw)
  To: Maciej W. Rozycki
  Cc: Ingo Molnar, hpa, linux-kernel, ebiederm, garyhade, tglx,
	Sankaran, Rajesh

On Tue, 2009-11-03 at 16:53 -0800, Maciej W. Rozycki wrote:
> On Mon, 2 Nov 2009, tip-bot for Suresh Siddha wrote:
> 
> > Commit-ID:  b3ec0a37a7907813bb4fb85a2d94102c152470b7
> > Gitweb:     http://git.kernel.org/tip/b3ec0a37a7907813bb4fb85a2d94102c152470b7
> > Author:     Suresh Siddha <suresh.b.siddha@intel.com>
> > AuthorDate: Mon, 26 Oct 2009 14:24:35 -0800
> > Committer:  Ingo Molnar <mingo@elte.hu>
> > CommitDate: Mon, 2 Nov 2009 15:56:36 +0100
> > 
> > x86: Use EOI register in io-apic on intel platforms
> > 
> > IO-APIC's in intel chipsets support EOI register starting from
> > IO-APIC version 2. Use that when ever we need to clear the
> > IO-APIC RTE's RemoteIRR bit explicitly.
> > 
> > Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
> > Acked-by: Gary Hade <garyhade@us.ibm.com>
> > Cc: Eric W. Biederman <ebiederm@xmission.com>
> > LKML-Reference: <20091026230001.947855317@sbs-t61.sc.intel.com>
> > [ Marked use_eio_reg as __read_mostly, fixed small details ]
> > Signed-off-by: Ingo Molnar <mingo@elte.hu>
> > ---
> [...]
> > +static int ioapic_supports_eoi(void)
> > +{
> > +	struct pci_dev *root;
> > +
> > +	root = pci_get_bus_and_slot(0, PCI_DEVFN(0, 0));
> > +	if (root && root->vendor == PCI_VENDOR_ID_INTEL &&
> > +	    mp_ioapics[0].apicver >= 0x2) {
> > +		use_eoi_reg = 1;
> > +		printk(KERN_INFO "IO-APIC supports EOI register\n");
> > +	} else
> > +		printk(KERN_INFO "IO-APIC doesn't support EOI\n");
> > +
> > +	return 0;
> > +}
> 
>  This is wrong -- the 82093AA I/O APIC has its version set to 0x11 and it 
> does not support the EOI register.  Similarly I/O APICs integrated into 
> the 82379AB south bridge and the 82374EB/SB EISA component.

Maciej, There might be some confusion (mostly on my side). When I looked
at ICH2 spec http://www.intel.com/assets/pdf/datasheet/290687.pdf it
says it has an EOI register and it is version 2.

All I heard internally was we have it from version 2 and it works (our
experiments on ICH4 etc worked).

But I do agree that I overlooked the version 11h of 82093AA (which is
older than ICH2).

> Overall values below 0x10 are reserved for the 82489DX

This is for local apic right?

>  -- are you sure 
> you didn't mean 0x12 or 0x20?

Let me check tomorrow and see where the confusion is.

thanks for looking at this.


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [tip:x86/apic] x86: Use EOI register in io-apic on intel platforms
  2009-11-04  2:24       ` Suresh Siddha
@ 2009-11-04 23:04         ` Suresh Siddha
  2009-11-05 14:46           ` Maciej W. Rozycki
  0 siblings, 1 reply; 25+ messages in thread
From: Suresh Siddha @ 2009-11-04 23:04 UTC (permalink / raw)
  To: Maciej W. Rozycki
  Cc: Ingo Molnar, hpa, linux-kernel, ebiederm, garyhade, tglx,
	Sankaran, Rajesh

On Tue, 2009-11-03 at 18:24 -0800, Suresh Siddha wrote:
> Maciej, There might be some confusion (mostly on my side). When I looked
> at ICH2 spec http://www.intel.com/assets/pdf/datasheet/290687.pdf it
> says it has an EOI register and it is version 2.

Ok, Issue is in the specs documentation. ICH2 to ICH5 seems to document
that they use version 0x2 ioapic's, while infact they use 0x20 version
and has a working EOI register. So we should use 0x20 as the version
check and not 0x2.

Maciej, can I have your ack for the appended patch?
---

From: Suresh Siddha <suresh.b.siddha@intel.com>
Subject: x86, ioapic: fix io-apic version check for the EOI reg presence

Maciej W. Rozycki reported:
> 82093AA I/O APIC has its version set to 0x11 and it 
> does not support the EOI register.  Similarly I/O APICs integrated into 
> the 82379AB south bridge and the 82374EB/SB EISA component.

IO-APIC versions below 0x20 don't support EOI register.

Some of the Intel ICH Specs (ICH2 to ICH5) documents the io-apic
version as 0x2. This is an error with documentation and these ICH chips
use io-apic's of version 0x20 and indeed has a working EOI register
for the io-apic.

Fix the ioapic_supports_eoi() to check for version 0x20 and beyond.

Reported-by: Maciej W. Rozycki <macro@linux-mips.org>
Acked-by: Rajesh Sankaran <rajesh.sankaran@intel.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
---

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 47d95c3..68510d4 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2520,13 +2520,29 @@ static void eoi_ioapic_irq(struct irq_desc *desc)
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
+/*
+ * IO-APIC versions below 0x20 don't support EOI register.
+ * For the record, here is the information about various versions:
+ * 	0Xh	82489DX
+ * 	1Xh	I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant
+ * 	2Xh     I/O(x)APIC which is PCI 2.2 Compliant
+ *	30h-FFh Reserved
+ *
+ * Some of the Intel ICH Specs (ICH2 to ICH5) documents the io-apic
+ * version as 0x2. This is an error with documentation and these ICH chips
+ * use io-apic's of version 0x20.
+ */
 static int ioapic_supports_eoi(void)
 {
 	struct pci_dev *root;
 
 	root = pci_get_bus_and_slot(0, PCI_DEVFN(0, 0));
+	/*
+ 	 * Perhaps we can do this for all vendors. But for now,
+ 	 * no one else seems to have version >= 0x20 ??
+ 	 */
 	if (root && root->vendor == PCI_VENDOR_ID_INTEL &&
-	    mp_ioapics[0].apicver >= 0x2) {
+	    mp_ioapics[0].apicver >= 0x20) {
 		use_eoi_reg = 1;
 		printk(KERN_INFO "IO-APIC supports EOI register\n");
 	} else



^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [tip:x86/apic] x86: Use EOI register in io-apic on intel platforms
  2009-11-04 23:04         ` Suresh Siddha
@ 2009-11-05 14:46           ` Maciej W. Rozycki
  2009-11-06  0:01             ` Suresh Siddha
  0 siblings, 1 reply; 25+ messages in thread
From: Maciej W. Rozycki @ 2009-11-05 14:46 UTC (permalink / raw)
  To: Suresh Siddha
  Cc: Ingo Molnar, hpa, linux-kernel, ebiederm, garyhade, tglx,
	Sankaran, Rajesh

On Wed, 4 Nov 2009, Suresh Siddha wrote:

> > Maciej, There might be some confusion (mostly on my side). When I looked
> > at ICH2 spec http://www.intel.com/assets/pdf/datasheet/290687.pdf it
> > says it has an EOI register and it is version 2.

 For the record: the original ICH/ICH0 (82801AA/AB) seems to have the 
version set to 0x11 as well.  Now its datasheet mentions the EOI register, 
but even if it is not a mistake, its usefulness is rather limited as the 
APIC does not support FSB delivery.

 Also for the record: the 82489DX is a combined local and I/O APIC chip -- 
quite reasonable given it was intended for 486-class processors which 
needed both units and the cost of manufacturing a separate chip for each 
unit plus the extra PCB space needed in that case certainly outweighed the 
loss from having a part of silicon unused on some chips.

> Ok, Issue is in the specs documentation. ICH2 to ICH5 seems to document
> that they use version 0x2 ioapic's, while infact they use 0x20 version
> and has a working EOI register. So we should use 0x20 as the version
> check and not 0x2.

 I've checked some docs too and you may want to check whether to qualify 
the use of the EOI register further with the DT (Delivery Type) bit in the 
Boot Configuration register.  It affects ICH2 at least (but some I/O APICs 
do this differently; e.g. the PID or 82094AA changes its version between 
0x13 and 0x21 depending on whether FSB delivery is used or not).  
Otherwise you may have interrupts delivered twice (though it may not be a 
big problem after all).

 Note that we have a piece of code to report the existence of the DT bit 
already, so we've got the knowledge about APIC versions already. ;)  I 
guess it should be modified a bit though as reportedly the PID does not 
have the Boot Configuration register, so the version checked there should 
be 0x20 exactly, rather than greater or equal.

 Also you may want to see whether the complication in ack_apic_level() 
that is meant to deal with an APIC erratum really matters for FSB delivery 
-- I guess not, because if you explicitly ACK an interrupt in the I/O 
unit, then even if it was incorrectly recorded as edge-triggered in the 
local unit, the IRR bit will be correctly reset and the next message 
delivered properly.  Given you introduce a conditional statement anyway, 
you can place more code within it and there will be no performance hit for 
the other path and certainly a gain for this one.

 Additionally it seems to me that code to migrate an IRQ is placed 
incorrectly -- the erratum workaround should be placed above it as it 
complements ack_APIC_irq() -- the IRR bit will not have been reset before 
the workaround has been executed if the erratum was hit.  Will you care 
about this problem?

> Maciej, can I have your ack for the appended patch?

 Certainly, it looks good to me.

Acked-by: Maciej W. Rozycki <macro@linux-mips.org>

> ---
> 
> From: Suresh Siddha <suresh.b.siddha@intel.com>
> Subject: x86, ioapic: fix io-apic version check for the EOI reg presence
> 
> Maciej W. Rozycki reported:
> > 82093AA I/O APIC has its version set to 0x11 and it 
> > does not support the EOI register.  Similarly I/O APICs integrated into 
> > the 82379AB south bridge and the 82374EB/SB EISA component.
> 
> IO-APIC versions below 0x20 don't support EOI register.
> 
> Some of the Intel ICH Specs (ICH2 to ICH5) documents the io-apic
> version as 0x2. This is an error with documentation and these ICH chips
> use io-apic's of version 0x20 and indeed has a working EOI register
> for the io-apic.
> 
> Fix the ioapic_supports_eoi() to check for version 0x20 and beyond.
> 
> Reported-by: Maciej W. Rozycki <macro@linux-mips.org>
> Acked-by: Rajesh Sankaran <rajesh.sankaran@intel.com>
> Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
> ---
> 
> diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
> index 47d95c3..68510d4 100644
> --- a/arch/x86/kernel/apic/io_apic.c
> +++ b/arch/x86/kernel/apic/io_apic.c
> @@ -2520,13 +2520,29 @@ static void eoi_ioapic_irq(struct irq_desc *desc)
>  	spin_unlock_irqrestore(&ioapic_lock, flags);
>  }
>  
> +/*
> + * IO-APIC versions below 0x20 don't support EOI register.
> + * For the record, here is the information about various versions:
> + * 	0Xh	82489DX
> + * 	1Xh	I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant
> + * 	2Xh     I/O(x)APIC which is PCI 2.2 Compliant
> + *	30h-FFh Reserved
> + *
> + * Some of the Intel ICH Specs (ICH2 to ICH5) documents the io-apic
> + * version as 0x2. This is an error with documentation and these ICH chips
> + * use io-apic's of version 0x20.
> + */
>  static int ioapic_supports_eoi(void)
>  {
>  	struct pci_dev *root;
>  
>  	root = pci_get_bus_and_slot(0, PCI_DEVFN(0, 0));
> +	/*
> + 	 * Perhaps we can do this for all vendors. But for now,
> + 	 * no one else seems to have version >= 0x20 ??
> + 	 */
>  	if (root && root->vendor == PCI_VENDOR_ID_INTEL &&
> -	    mp_ioapics[0].apicver >= 0x2) {
> +	    mp_ioapics[0].apicver >= 0x20) {
>  		use_eoi_reg = 1;
>  		printk(KERN_INFO "IO-APIC supports EOI register\n");
>  	} else
> 
> 

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [tip:x86/apic] x86: Use EOI register in io-apic on intel platforms
  2009-11-05 14:46           ` Maciej W. Rozycki
@ 2009-11-06  0:01             ` Suresh Siddha
  2009-11-06  6:53               ` Maciej W. Rozycki
  0 siblings, 1 reply; 25+ messages in thread
From: Suresh Siddha @ 2009-11-06  0:01 UTC (permalink / raw)
  To: Maciej W. Rozycki
  Cc: Ingo Molnar, hpa, linux-kernel, ebiederm, garyhade, tglx,
	Sankaran, Rajesh

On Thu, 2009-11-05 at 06:46 -0800, Maciej W. Rozycki wrote:
>  For the record: the original ICH/ICH0 (82801AA/AB) seems to have the 
> version set to 0x11 as well.  Now its datasheet mentions the EOI register, 
> but even if it is not a mistake, 

hmm. what a mess.

I found a kernel log in an archive
(http://www.mail-archive.com/linux-usb-users@lists.sourceforge.net/msg13255.html) for ICH0 which shows that it uses an io-apic of version 0x20. So mostly the version in the datasheet is wrong here aswell :-(

And our IO/chipset architect (Rajesh) also confirmed that ICH0 indeed
has an EOI reg.

So, version 0x20 seems to be pretty safe to use EOI register. And
according to Gary Hade's experiments on IBM platforms having  an io-apic
version 0x11, magic of mask+edge and unmask+level seems to clear
remoteIRR.

So the current logic looks safe to me.

>  Also for the record: the 82489DX is a combined local and I/O APIC chip -- 

Thanks. This is what I learnt internally too yesterday and hence the
documentation update ;)

>  I've checked some docs too and you may want to check whether to qualify 
> the use of the EOI register further with the DT (Delivery Type) bit in the 
> Boot Configuration register.  It affects ICH2 at least

For relatively newer ICH's like ICH5, boot configuration register is
marked as a reserved register (perhaps with the serial bus going away,
so did this register but again with out the io-apic version change).

Anyways, our understanding is that EOI register in ICH2 should work
irrespective of DT bit in the boot config register.

>  Also you may want to see whether the complication in ack_apic_level() 
> that is meant to deal with an APIC erratum really matters for FSB delivery 
> -- I guess not, because if you explicitly ACK an interrupt in the I/O 
> unit, then even if it was incorrectly recorded as edge-triggered in the 
> local unit, the IRR bit will be correctly reset and the next message 
> delivered properly.  Given you introduce a conditional statement anyway, 
> you can place more code within it and there will be no performance hit for 
> the other path and certainly a gain for this one.

I am not sure if I follow. With the recent changes (tip commit
5231a68614b94f60e8f6c56bc6e3d75955b9e75e), we use ipi on a new cpu to
handle a pending level-triggered  interrupt on the cpu that is going
offline. It's just not only in the case of io-apic erratum for 0x11, we
see level triggered interrupt as edge interrupt at the cpu.


>  Additionally it seems to me that code to migrate an IRQ is placed 
> incorrectly -- the erratum workaround should be placed above it as it 
> complements ack_APIC_irq() -- the IRR bit will not have been reset before 
> the workaround has been executed if the erratum was hit.  Will you care 
> about this problem?

Yes. I see this issue and agree with your assesment. The result is that
we missed an irq migration attempt and delay it to the next arrival.

I will post a different fix and also update some of the code comments
around this to reflect new changes in the code.

> > Maciej, can I have your ack for the appended patch?
> 
>  Certainly, it looks good to me.
> 
> Acked-by: Maciej W. Rozycki <macro@linux-mips.org>

Thanks. Ingo, Can you please queue this patch too? I am planning to do
couple of more cleanups on top of this. I will post them shortly.

thanks,
suresh


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [tip:x86/apic] x86: Use EOI register in io-apic on intel platforms
  2009-11-06  0:01             ` Suresh Siddha
@ 2009-11-06  6:53               ` Maciej W. Rozycki
  2009-11-07  7:27                 ` Suresh Siddha
  0 siblings, 1 reply; 25+ messages in thread
From: Maciej W. Rozycki @ 2009-11-06  6:53 UTC (permalink / raw)
  To: Suresh Siddha
  Cc: Ingo Molnar, hpa, linux-kernel, ebiederm, garyhade, tglx,
	Sankaran, Rajesh

On Thu, 5 Nov 2009, Suresh Siddha wrote:

> >  I've checked some docs too and you may want to check whether to qualify 
> > the use of the EOI register further with the DT (Delivery Type) bit in the 
> > Boot Configuration register.  It affects ICH2 at least
> 
> For relatively newer ICH's like ICH5, boot configuration register is
> marked as a reserved register (perhaps with the serial bus going away,
> so did this register but again with out the io-apic version change).
> 
> Anyways, our understanding is that EOI register in ICH2 should work
> irrespective of DT bit in the boot config register.

 What I mean is if the serial delivery type is used, then an interrupt 
will be acked twice -- once via an EOI message sent from the local APIC 
over the serial bus and then again via the write to the EOI register.  
There is a race condition here, so if the IRQ line is still/again 
asserted, then most likely two consecutive IRQ messages will be sent by 
the I/O APIC and they may be accepted by two different local units and 
eventually delivered to the OS.  Linux will cope, but still this is sloppy 
programming, so it should be best avoided -- if possible that is.

> >  Also you may want to see whether the complication in ack_apic_level() 
> > that is meant to deal with an APIC erratum really matters for FSB delivery 
> > -- I guess not, because if you explicitly ACK an interrupt in the I/O 
> > unit, then even if it was incorrectly recorded as edge-triggered in the 
> > local unit, the IRR bit will be correctly reset and the next message 
> > delivered properly.  Given you introduce a conditional statement anyway, 
> > you can place more code within it and there will be no performance hit for 
> > the other path and certainly a gain for this one.
> 
> I am not sure if I follow. With the recent changes (tip commit
> 5231a68614b94f60e8f6c56bc6e3d75955b9e75e), we use ipi on a new cpu to
> handle a pending level-triggered  interrupt on the cpu that is going
> offline. It's just not only in the case of io-apic erratum for 0x11, we
> see level triggered interrupt as edge interrupt at the cpu.

 I don't get it, sorry -- an interrupt has its trigger mode implied by the 
IRQ_LEVEL status flag being set or clear in the IRQ's descriptor.  What's 
set in the local APIC's TMR does not (or should not) matter IMO.

 Has the trigger mode mismatch erratum been seen for FSB delivered 
interrupts anyway?  My proposed patch is below.  Includes the code 
rearrangement I proposed as well.  Untested, not even built.

  Maciej
---
 Complete the EOI dance for the trigger mode mismatch APIC erratum before 
proceeding to IRQ migration.  Omit the erratum workaround for I/O APICs 
using FSB delivery as they get their EOI message delivered by hand.

Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org>
---
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 31e9db3..6e4edad 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2555,33 +2555,50 @@ static void ack_apic_level(unsigned int irq)
 #endif
 
 	/*
-	 * It appears there is an erratum which affects at least version 0x11
-	 * of I/O APIC (that's the 82093AA and cores integrated into various
-	 * chipsets).  Under certain conditions a level-triggered interrupt is
-	 * erroneously delivered as edge-triggered one but the respective IRR
-	 * bit gets set nevertheless.  As a result the I/O unit expects an EOI
-	 * message but it will never arrive and further interrupts are blocked
-	 * from the source.  The exact reason is so far unknown, but the
-	 * phenomenon was observed when two consecutive interrupt requests
-	 * from a given source get delivered to the same CPU and the source is
-	 * temporarily disabled in between.
-	 *
-	 * A workaround is to simulate an EOI message manually.  We achieve it
-	 * by setting the trigger mode to edge and then to level when the edge
-	 * trigger mode gets detected in the TMR of a local APIC for a
-	 * level-triggered interrupt.  We mask the source for the time of the
-	 * operation to prevent an edge-triggered interrupt escaping meanwhile.
-	 * The idea is from Manfred Spraul.  --macro
+	 * We must acknowledge the irq before we move it or the acknowledge
+	 * will not propagate properly.
 	 */
-	cfg = desc->chip_data;
-	i = cfg->vector;
-	v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
+	if (use_eoi_reg) {
+		ack_APIC_irq();
+		eoi_ioapic_irq(desc);
+	} else {
+		/*
+		 * It appears there is an erratum which affects at least
+		 * version 0x11 of I/O APIC (that's the 82093AA and cores
+		 * integrated into various chipsets).  Under certain
+		 * conditions a level-triggered interrupt is erroneously
+		 * delivered as edge-triggered one but the respective IRR
+		 * bit gets set nevertheless.  As a result the I/O unit
+		 * expects an EOI message but it will never arrive and
+		 * further interrupts are blocked from the source.  The
+		 * exact reason is so far unknown, but the phenomenon was
+		 * observed when two consecutive interrupt requests from
+		 * a given source get delivered to the same CPU and the
+		 * source is temporarily disabled in between.
+		 *
+		 * A workaround is to simulate an EOI message manually.
+		 * We achieve it by setting the trigger mode to edge and
+		 * then to level when the edge trigger mode gets detected
+		 * in the TMR of a local APIC for a level-triggered
+		 * interrupt.  We mask the source for the time of the
+		 * operation to prevent an edge-triggered interrupt
+		 * escaping meanwhile.
+		 *
+		 * The idea is from Manfred Spraul.  --macro
+		 */
+		cfg = desc->chip_data;
+		i = cfg->vector;
+		v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
 
-	/*
-	 * We must acknowledge the irq before we move it or the acknowledge will
-	 * not propagate properly.
-	 */
-	ack_APIC_irq();
+		ack_APIC_irq();
+
+		/* Tail end of version 0x11 I/O APIC bug workaround. */
+		atomic_inc(&irq_mis_count);
+		spin_lock(&ioapic_lock);
+		__mask_and_edge_IO_APIC_irq(cfg);
+		__unmask_and_level_IO_APIC_irq(cfg);
+		spin_unlock(&ioapic_lock);
+	}
 
 	/* Now we can move and renable the irq */
 	if (unlikely(do_unmask_irq)) {
@@ -2616,20 +2633,6 @@ static void ack_apic_level(unsigned int irq)
 			move_masked_irq(irq);
 		unmask_IO_APIC_irq_desc(desc);
 	}
-
-	/* Tail end of version 0x11 I/O APIC bug workaround */
-	if (!(v & (1 << (i & 0x1f)))) {
-		atomic_inc(&irq_mis_count);
-
-		if (use_eoi_reg)
-			eoi_ioapic_irq(desc);
-		else {
-			spin_lock(&ioapic_lock);
-			__mask_and_edge_IO_APIC_irq(cfg);
-			__unmask_and_level_IO_APIC_irq(cfg);
-			spin_unlock(&ioapic_lock);
-		}
-	}
 }
 
 #ifdef CONFIG_INTR_REMAP

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [tip:x86/apic] x86: Use EOI register in io-apic on intel platforms
  2009-11-06  6:53               ` Maciej W. Rozycki
@ 2009-11-07  7:27                 ` Suresh Siddha
  2009-11-08 19:06                   ` Maciej W. Rozycki
  0 siblings, 1 reply; 25+ messages in thread
From: Suresh Siddha @ 2009-11-07  7:27 UTC (permalink / raw)
  To: Maciej W. Rozycki
  Cc: Ingo Molnar, hpa, linux-kernel, ebiederm, garyhade, tglx,
	Sankaran, Rajesh

On Thu, 2009-11-05 at 22:53 -0800, Maciej W. Rozycki wrote:
>  What I mean is if the serial delivery type is used, then an interrupt 
> will be acked twice -- once via an EOI message sent from the local APIC 
> over the serial bus and then again via the write to the EOI register.  

Maciej, Case where we are doing an explicit EOI to the io-apic (using
EOI register) is when the level triggered interrupt gets registered at
the cpu as an edge interrupt (in the local apic's trigger mode
register).

It will arrive as an edge interrupt for two cases.
a) for corner conditions which hit the 82093AA (io-apic version 0x11)
erratum
b) with my recent patch in -tip, during a cpu offline, when we send an
ipi (IPI is always registered as an edge triggered at the cpu) to
service the interrupt at the new cpu destination, instead of servicing
at it's old destination cpu (as it has already disabled interrupts and
going down -- like not being on the cpu_online_map etc).

So we are not acking the io-apic twice in this case, as the eoi to the
local apic won't brodcast the eoi to the io-apic (because of the edge
mode in trigger mode register of the local apic).

> There is a race condition here, so if the IRQ line is still/again 
> asserted, then most likely two consecutive IRQ messages will be sent by 
> the I/O APIC and they may be accepted by two different local units and 
> eventually delivered to the OS.  Linux will cope, but still this is sloppy 
> programming, so it should be best avoided -- if possible that is.

I hope the above clarified.

> > 
> > I am not sure if I follow. With the recent changes (tip commit
> > 5231a68614b94f60e8f6c56bc6e3d75955b9e75e), we use ipi on a new cpu to
> > handle a pending level-triggered  interrupt on the cpu that is going
> > offline. It's just not only in the case of io-apic erratum for 0x11, we
> > see level triggered interrupt as edge interrupt at the cpu.
> 
>  I don't get it, sorry -- an interrupt has its trigger mode implied by the 
> IRQ_LEVEL status flag being set or clear in the IRQ's descriptor.  What's 
> set in the local APIC's TMR does not (or should not) matter IMO.

Same, did the above clarify?

> 
>  Has the trigger mode mismatch erratum been seen for FSB delivered 
> interrupts anyway? 

No.

>  Complete the EOI dance for the trigger mode mismatch APIC erratum before 
> proceeding to IRQ migration. 

I am ok with what you are tying to fix, but not your patch. please see
below.

> +	if (use_eoi_reg) {
> +		ack_APIC_irq();
> +		eoi_ioapic_irq(desc);

We shouldn't do this unconditionally. i.e., we should do this double ack
only when the level-triggered interrupt is seen as an edge triggered
interrupt at the  cpu (specified by the trigger mode register in local
apic).

Otherwise as you mentioned above, we will see two EOI msg's at the
io-apic (one by the cpu's eoi broadcast and another by explicit eoi to
the io-apic).

Best way to fix the issue you noticed is by simply moving the tail end
of that erratum fix before we try to migrate the irq to a new place.

Do you agree?

---
From: "Maciej W. Rozycki" <macro@linux-mips.org>
Subject: x86, io-apic: move the effort of clearing remoteIRR explicitly
before migrating the irq

When the level-triggered interrupt is seen as an edge interrupt, we try
to clear the remoteIRR explicitly (using either an io-apic eoi register
when present or through the idea of changing trigger mode of the io-apic
RTE to edge and then back to level). But this explicit try also needs to
happen before we try to migrate the irq. Otherwise irq migration attempt
will fail anyhow, as it postpones the irq migration to a later attempt
when it sees the remoteIRR in the io-apic RTE still set.

Signed-off-by: "Maciej W. Rozycki" <macro@linux-mips.org>
Reviewed-by: Suresh Siddha <suresh.b.siddha@intel.com>
---
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 47d95c3..9a26ea1 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2583,6 +2583,20 @@ static void ack_apic_level(unsigned int irq)
 	 */
 	ack_APIC_irq();
 
+	/* Tail end of version 0x11 I/O APIC bug workaround */
+	if (!(v & (1 << (i & 0x1f)))) {
+		atomic_inc(&irq_mis_count);
+
+		if (use_eoi_reg)
+			eoi_ioapic_irq(desc);
+		else {
+			spin_lock(&ioapic_lock);
+			__mask_and_edge_IO_APIC_irq(cfg);
+			__unmask_and_level_IO_APIC_irq(cfg);
+			spin_unlock(&ioapic_lock);
+		}
+	}
+
 	/* Now we can move and renable the irq */
 	if (unlikely(do_unmask_irq)) {
 		/* Only migrate the irq if the ack has been received.
@@ -2616,20 +2630,6 @@ static void ack_apic_level(unsigned int irq)
 			move_masked_irq(irq);
 		unmask_IO_APIC_irq_desc(desc);
 	}
-
-	/* Tail end of version 0x11 I/O APIC bug workaround */
-	if (!(v & (1 << (i & 0x1f)))) {
-		atomic_inc(&irq_mis_count);
-
-		if (use_eoi_reg)
-			eoi_ioapic_irq(desc);
-		else {
-			spin_lock(&ioapic_lock);
-			__mask_and_edge_IO_APIC_irq(cfg);
-			__unmask_and_level_IO_APIC_irq(cfg);
-			spin_unlock(&ioapic_lock);
-		}
-	}
 }
 
 #ifdef CONFIG_INTR_REMAP



^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [tip:x86/apic] x86: Use EOI register in io-apic on intel platforms
  2009-11-07  7:27                 ` Suresh Siddha
@ 2009-11-08 19:06                   ` Maciej W. Rozycki
  2009-12-02  0:56                     ` Suresh Siddha
  0 siblings, 1 reply; 25+ messages in thread
From: Maciej W. Rozycki @ 2009-11-08 19:06 UTC (permalink / raw)
  To: Suresh Siddha
  Cc: Ingo Molnar, hpa, linux-kernel, ebiederm, garyhade, tglx,
	Sankaran, Rajesh

On Fri, 6 Nov 2009, Suresh Siddha wrote:

> >  What I mean is if the serial delivery type is used, then an interrupt 
> > will be acked twice -- once via an EOI message sent from the local APIC 
> > over the serial bus and then again via the write to the EOI register.  
> 
> Maciej, Case where we are doing an explicit EOI to the io-apic (using
> EOI register) is when the level triggered interrupt gets registered at
> the cpu as an edge interrupt (in the local apic's trigger mode
> register).
> 
> It will arrive as an edge interrupt for two cases.
> a) for corner conditions which hit the 82093AA (io-apic version 0x11)
> erratum
> b) with my recent patch in -tip, during a cpu offline, when we send an
> ipi (IPI is always registered as an edge triggered at the cpu) to
> service the interrupt at the new cpu destination, instead of servicing
> at it's old destination cpu (as it has already disabled interrupts and
> going down -- like not being on the cpu_online_map etc).
> 
> So we are not acking the io-apic twice in this case, as the eoi to the
> local apic won't brodcast the eoi to the io-apic (because of the edge
> mode in trigger mode register of the local apic).

 OK, I see what you mean, but that makes me wonder why you are going 
through such contortions.  In the case of a CPU going offline I would 
expect it to be done more or less in such a way:

1. Write all-zeroes to its local APIC's LDR register and set its TPR to 
   0xef.  This will take this APIC out from LoPri arbitration and thus 
   from accepting any I/O APIC interrupts.  Fixed delivery mode IPIs will 
   still be accepted (if that's not needed then the TPR can be set to 
   0xff; any received IPIs will be lost).

2. Service any outstanding interrupts that have already been accepted by 
   the local APIC (you may have to poll on the local IRR register with 
   interrupts enabled for a short while).

3. Disable the local APIC via the SVR register, mask local interrupts in 
   the processor's EFLAGS register and start the offline procedure.  This 
   is the point of no-return, further IPIs won't be accepted and the CPU 
   has to be put through an INIT-IPI+StartUp-IPI cycle to get in control 
   again.

   If IPI reception was not needed through stage #2 above, then the local 
   APIC could have been disabled at #1 instead -- interrupts pending in 
   the local APIC as recorded in the IRR or marked as in-progress in the 
   ISR are guaranteed to be delivered to the CPU and EOIed (as 
   appropriate) normally even in the disabled state of the local APIC.

> Do you agree?

 If the scenario I have outlined above cannot be made to work for some 
reason, then please do me and the others a favour and since with this 
change you are tying new functionality to code originally meant as a 
workaround for an obscure erratum only, do write a proper explanation and 
place it next to the original comment describing previous use of the code. 
With your change as it is, it is all but obvious what this piece of code 
is meant to do.

 Your change is OK with me once accompanied with said comment, but please 
investigate my scenario first -- your approach looks like a horrible hack 
to me, sorry.

  Maciej

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [tip:x86/apic] x86: Use EOI register in io-apic on intel platforms
  2009-11-08 19:06                   ` Maciej W. Rozycki
@ 2009-12-02  0:56                     ` Suresh Siddha
  0 siblings, 0 replies; 25+ messages in thread
From: Suresh Siddha @ 2009-12-02  0:56 UTC (permalink / raw)
  To: Maciej W. Rozycki
  Cc: Ingo Molnar, hpa, linux-kernel, ebiederm, garyhade, tglx,
	Sankaran, Rajesh

Maciej,

Sorry for delayed response. I was busy with other stuff and didn't get a
chance till now to get back on this. I just posted few patches which
came up as a result of our past discussions here in this thread.

Please see my responses inline for your earlier comments:

On Sun, 2009-11-08 at 11:06 -0800, Maciej W. Rozycki wrote:
> OK, I see what you mean, but that makes me wonder why you are going 
> through such contortions.  In the case of a CPU going offline I would 
> expect it to be done more or less in such a way:
> 
> 1. Write all-zeroes to its local APIC's LDR register and set its TPR to 
>    0xef.  This will take this APIC out from LoPri arbitration and thus 
>    from accepting any I/O APIC interrupts.  Fixed delivery mode IPIs will 
>    still be accepted (if that's not needed then the TPR can be set to 
>    0xff; any received IPIs will be lost).
> 
> 2. Service any outstanding interrupts that have already been accepted by 
>    the local APIC (you may have to poll on the local IRR register with 
>    interrupts enabled for a short while).
> 
> 3. Disable the local APIC via the SVR register, mask local interrupts in 
>    the processor's EFLAGS register and start the offline procedure.  This 
>    is the point of no-return, further IPIs won't be accepted and the CPU 
>    has to be put through an INIT-IPI+StartUp-IPI cycle to get in control 
>    again.
> 
>    If IPI reception was not needed through stage #2 above, then the local 
>    APIC could have been disabled at #1 instead -- interrupts pending in 
>    the local APIC as recorded in the IRR or marked as in-progress in the 
>    ISR are guaranteed to be delivered to the CPU and EOIed (as 
>    appropriate) normally even in the disabled state of the local APIC.

But before these 3 steps you listed here, we need to migrate the irq to
the new destination. And that step will modify the IO-APIC RTE with the
new vector and new destination. And during this process, remoteIRR of
the IO-APIC RTE might be set and this inflight interrupt will get
registered at the original destination that is going offline.

So when we come to step 2 you listed above and service any outstanding
interrupts, EOI broadcast as part of that handling won't clear the
remoteIRR of the IO-APIC RTE, as the vector information in the io-apic
RTE got modified because of irq migration and is different from the
vector information in EOI broadcast message sent by the cpu. This will
result in stuck level interrupt.

This is one of the challenges Eric Biederman had in the past and he
tried things like polling from the process context and modifying the
IO-APIC RTE (with new destination and vector information) only when the
remoteIRR is not set etc. But Eric still saw some hangs and stuck
interrupt conditions with his experimental patches.

We took a route which needed minor changes to the existing code and fix
the local_irq_enable()/local_irq_disable() issue and stuck interrupt
issue in the cpu offline path by using the IO-APIC EOI register. Our
tests on Intel platforms having an EOI register for io-apic's and IBM's
(Gary) tests on io-apic's which don't have EOI register using AMD
platforms worked fine with our approach.

> > Do you agree?
> 
>  If the scenario I have outlined above cannot be made to work for some 
> reason, 

Perhaps we can make it work but it needs more changes and validation.
And atleast Eric's similar experiments in the past didn't yield good
results.

> then please do me and the others a favour and since with this 
> change you are tying new functionality to code originally meant as a 
> workaround for an obscure erratum only, do write a proper explanation and 
> place it next to the original comment describing previous use of the code. 
> With your change as it is, it is all but obvious what this piece of code 
> is meant to do.

This patch was also in the series that I just sent.

>  Your change is OK with me once accompanied with said comment, but please 
> investigate my scenario first -- your approach looks like a horrible hack 
> to me, sorry.

This being a fragile area and considering our experiences in the past, I
leveraged the existing code (of clearing remoteIRR manually on ioapic's
which don't have an EOI register) and luckily we had good success so
far.

thanks,
suresh


^ permalink raw reply	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2009-12-02  0:57 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-10-26 22:24 [patch 0/6] x86: cleanups and fixes for irq migration code during cpu offline Suresh Siddha
2009-10-26 22:24 ` [patch 1/6] x86: unify fixup_irqs() for 32-bit and 64-bit kernels Suresh Siddha
2009-11-02 16:16   ` [tip:x86/apic] x86: Unify " tip-bot for Suresh Siddha
2009-10-26 22:24 ` [patch 2/6] x86, intr-remap: Avoid irq_chip mask/unmask in fixup_irqs() for intr-remapping Suresh Siddha
2009-11-02 16:16   ` [tip:x86/apic] " tip-bot for Suresh Siddha
2009-10-26 22:24 ` [patch 3/6] x86: remove move_cleanup_count from irq_cfg Suresh Siddha
2009-11-02 16:17   ` [tip:x86/apic] x86: Remove " tip-bot for Suresh Siddha
2009-10-26 22:24 ` [patch 4/6] x86: force irq complete move during cpu offline Suresh Siddha
2009-11-02 16:17   ` [tip:x86/apic] x86: Force " tip-bot for Suresh Siddha
2009-10-26 22:24 ` [RFC patch 5/6] x86: Use EOI register in io-apic on intel platforms Suresh Siddha
2009-11-02 16:17   ` [tip:x86/apic] " tip-bot for Suresh Siddha
2009-11-04  0:53     ` Maciej W. Rozycki
2009-11-04  2:24       ` Suresh Siddha
2009-11-04 23:04         ` Suresh Siddha
2009-11-05 14:46           ` Maciej W. Rozycki
2009-11-06  0:01             ` Suresh Siddha
2009-11-06  6:53               ` Maciej W. Rozycki
2009-11-07  7:27                 ` Suresh Siddha
2009-11-08 19:06                   ` Maciej W. Rozycki
2009-12-02  0:56                     ` Suresh Siddha
2009-10-26 22:24 ` [RFC patch 6/6] x86: remove local_irq_enable()/local_irq_disable() in fixup_irqs() Suresh Siddha
2009-11-02 16:17   ` [tip:x86/apic] x86: Remove " tip-bot for Suresh Siddha
2009-10-30 19:25 ` [patch 0/6] x86: cleanups and fixes for irq migration code during cpu offline Suresh Siddha
2009-11-02 14:59   ` Ingo Molnar
2009-11-02 17:35     ` Gary Hade

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.