linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 5/5] irq: move irq_desc according to smp_affinity v5
@ 2008-12-06  3:00 Yinghai Lu
  2008-12-08 13:42 ` Ingo Molnar
  0 siblings, 1 reply; 6+ messages in thread
From: Yinghai Lu @ 2008-12-06  3:00 UTC (permalink / raw)
  To: Ingo Molnar, Thomas Gleixner, H. Peter Anvin, Andrew Morton
  Cc: linux-kernel, Yinghai Lu

impact: new feature move irq_desc with sparseirq

if CONFIG_MOVE_IRQ_DESC is set:
  make the irq_desc follow the irq's smp_affinity, i.e. move the irq_desc to the target cpu/node
  call move_irq_desc() in irq_complete_move()
  legacy irq_descs are not moved, because they are allocated from a static array

v3: add calling to irq_to_desc after calling ack/eoi instead of passing desc

for logical apic mode, we need to add move_desc_in_progress_in_same_domain; otherwise the irq_desc will not get moved. ==> it could also need two phases to get the irq_desc moved.
	for example: if 0xff is the old affinity, we need to set it to 0xf first, and then set it to 0xf0.
	[ or do we need to change the domain definition to the cpus on the same node? ]

LBSuse:~ # cat /proc/irq/22/smp_affinity
00000000,00000000,00000000,000000ff
LBSuse:~ # echo f > /proc/irq/22/smp_affinity
LBSuse:~ # cat /proc/irq/22/smp_affinity
00000000,00000000,00000000,0000000f
LBSuse:~ # tail /var/log/messages
...
Oct 27 12:35:34 LBSuse kernel: klogd 1.4.1, log source = /proc/kmsg started.
Oct 27 12:35:34 LBSuse kernel: eth0: no IPv6 routers present
LBSuse:~ # echo f0 > /proc/irq/22/smp_affinity
LBSuse:~ # tail /var/log/messages
Oct 27 12:35:34 LBSuse kernel: klogd 1.4.1, log source = /proc/kmsg started.
Oct 27 12:35:34 LBSuse kernel: eth0: no IPv6 routers present
Oct 27 12:36:46 LBSuse kernel:   move irq_desc for 22 aka 0x16 to cpu 7 node 1
Oct 27 12:36:46 LBSuse kernel:   alloc kstat_irqs on cpu 7 node 1
Oct 27 12:36:46 LBSuse kernel:   alloc irq_cfg on cpu 7 node 1
Oct 27 12:36:46 LBSuse kernel:   alloc irq_2_pin on cpu 7 node 1

so we assume the user space program should first set /proc/irq/XX/smp_affinity to 03 or 0f at boot,
or should we change irq_default_affinity?

for physical apic mode it is much simpler:
on a 4-socket, 16-core system
the irq_desc is moved
when
# echo 10 > /proc/irq/134483967/smp_affinity
# echo 100 > /proc/irq/134483967/smp_affinity
# echo 1000 > /proc/irq/134483967/smp_affinity
got
Nov  9 21:39:51 LBSuse kernel:   move irq_desc for 134483967 aka 0x8040fff to cpu 4 node 1
Nov  9 21:39:51 LBSuse kernel:   alloc kstat_irqs on cpu 4 node 1
Nov  9 21:39:51 LBSuse kernel:   alloc irq_cfg on cpu 4 node 1
Nov  9 21:40:05 LBSuse kernel:   move irq_desc for 134483967 aka 0x8040fff to cpu 8 node 2
Nov  9 21:40:05 LBSuse kernel:   alloc kstat_irqs on cpu 8 node 2
Nov  9 21:40:05 LBSuse kernel:   alloc irq_cfg on cpu 8 node 2
Nov  9 21:40:18 LBSuse kernel:   move irq_desc for 134483967 aka 0x8040fff to cpu 12 node 3
Nov  9 21:40:18 LBSuse kernel:   alloc kstat_irqs on cpu 12 node 3
Nov  9 21:40:18 LBSuse kernel:   alloc irq_cfg on cpu 12 node 3

Signed-off-by: Yinghai Lu <yinghai@kernel.org>

---
 arch/x86/Kconfig          |    9 ++
 arch/x86/kernel/io_apic.c |  143 +++++++++++++++++++++++++++++++++++++++++++++-
 kernel/irq/chip.c         |   30 +++++++++
 kernel/irq/handle.c       |  124 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 301 insertions(+), 5 deletions(-)

Index: linux-2.6/arch/x86/Kconfig
===================================================================
--- linux-2.6.orig/arch/x86/Kconfig
+++ linux-2.6/arch/x86/Kconfig
@@ -253,6 +253,15 @@ config SPARSE_IRQ
 
 	  If you don't know what to do here, say Y.
 
+config MOVE_IRQ_DESC
+	bool "Move irq desc when changing irq smp_affinity"
+	depends on SPARSE_IRQ && SMP
+	default y
+	help
+	  This enables moving the irq_desc to the cpu/node on which the irq will be handled.
+
+	  If you don't know what to do here, say Y.
+
 config X86_FIND_SMP_CONFIG
 	def_bool y
 	depends on X86_MPPARSE || X86_VOYAGER
Index: linux-2.6/arch/x86/kernel/io_apic.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/io_apic.c
+++ linux-2.6/arch/x86/kernel/io_apic.c
@@ -141,6 +141,9 @@ struct irq_cfg {
 	unsigned move_cleanup_count;
 	u8 vector;
 	u8 move_in_progress : 1;
+#ifdef CONFIG_MOVE_IRQ_DESC
+	u8 move_desc_in_progress_in_same_domain : 1;
+#endif
 };
 
 /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
@@ -223,6 +226,122 @@ void arch_init_chip_data(struct irq_desc
 	}
 }
 
+#ifdef CONFIG_MOVE_IRQ_DESC
+
+static void init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg,
+				 int cpu)
+{
+	struct irq_pin_list *old_entry, *head, *tail, *entry;
+
+	cfg->irq_2_pin = NULL;
+	old_entry = old_cfg->irq_2_pin;
+	if (!old_entry)
+		return;
+
+	entry = get_one_free_irq_2_pin(cpu);
+	if (!entry)
+		return;
+
+	entry->apic = old_entry->apic;
+	entry->pin = old_entry->pin;
+	head = entry;
+	tail = entry;
+	old_entry = old_entry->next;
+
+	while (old_entry) {
+		entry = get_one_free_irq_2_pin(cpu);
+		if (!entry) {
+			entry = head;
+			while (entry) {
+				head = entry->next;
+				kfree(entry);
+				entry = head;
+			}
+			/* still use the old one */
+			return;
+		}
+		entry->apic = old_entry->apic;
+		entry->pin = old_entry->pin;
+		tail->next = entry;
+		tail = entry;
+		old_entry = old_entry->next;
+	}
+
+	tail->next = NULL;
+	cfg->irq_2_pin = head;
+}
+
+static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg)
+{
+	struct irq_pin_list *entry, *next;
+
+	if (old_cfg->irq_2_pin == cfg->irq_2_pin)
+		return;
+
+	entry = old_cfg->irq_2_pin;
+
+	while (entry) {
+		next = entry->next;
+		kfree(entry);
+		entry = next;
+	}
+	old_cfg->irq_2_pin = NULL;
+}
+
+void arch_init_copy_chip_data(struct irq_desc *old_desc,
+				 struct irq_desc *desc, int cpu)
+{
+	struct irq_cfg *cfg;
+	struct irq_cfg *old_cfg;
+
+	cfg = get_one_free_irq_cfg(cpu);
+
+	if (!cfg)
+		return;
+
+	desc->chip_data = cfg;
+
+	old_cfg = old_desc->chip_data;
+
+	memcpy(cfg, old_cfg, sizeof(struct irq_cfg));
+
+	init_copy_irq_2_pin(old_cfg, cfg, cpu);
+}
+
+static void free_irq_cfg(struct irq_cfg *old_cfg)
+{
+	kfree(old_cfg);
+}
+
+void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
+{
+	struct irq_cfg *old_cfg, *cfg;
+
+	old_cfg = old_desc->chip_data;
+	cfg = desc->chip_data;
+
+	if (old_cfg == cfg)
+		return;
+
+	if (old_cfg) {
+		free_irq_2_pin(old_cfg, cfg);
+		free_irq_cfg(old_cfg);
+		old_desc->chip_data = NULL;
+	}
+}
+
+static void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
+{
+	struct irq_cfg *cfg = desc->chip_data;
+
+	if (!cfg->move_in_progress) {
+		/* it means that domain is not changed */
+		if (!cpus_intersects(desc->affinity, mask))
+			cfg->move_desc_in_progress_in_same_domain = 1;
+	}
+}
+#endif
+
 #else
 static struct irq_cfg *irq_cfg(unsigned int irq)
 {
@@ -231,9 +350,11 @@ static struct irq_cfg *irq_cfg(unsigned
 
 #endif
 
+#ifndef CONFIG_MOVE_IRQ_DESC
 static inline void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
 {
 }
+#endif
 
 struct io_apic {
 	unsigned int index;
@@ -2346,14 +2467,34 @@ static void irq_complete_move(struct irq
 	struct irq_cfg *cfg = desc->chip_data;
 	unsigned vector, me;
 
-	if (likely(!cfg->move_in_progress))
+	if (likely(!cfg->move_in_progress)) {
+#ifdef CONFIG_MOVE_IRQ_DESC
+		if (likely(!cfg->move_desc_in_progress_in_same_domain))
+			return;
+
+		/* domain is not change, but affinity is changed */
+		me = smp_processor_id();
+		if (cpu_isset(me, desc->affinity)) {
+			*descp = desc = move_irq_desc(desc, me);
+			/* get the new one */
+			cfg = desc->chip_data;
+			cfg->move_desc_in_progress_in_same_domain = 0;
+		}
+#endif
 		return;
+	}
 
 	vector = ~get_irq_regs()->orig_ax;
 	me = smp_processor_id();
 	if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
 		cpumask_t cleanup_mask;
 
+#ifdef CONFIG_MOVE_IRQ_DESC
+		*descp = desc = move_irq_desc(desc, me);
+		/* get the new one */
+		cfg = desc->chip_data;
+#endif
+
 		cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
 		cfg->move_cleanup_count = cpus_weight(cleanup_mask);
 		send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
Index: linux-2.6/kernel/irq/handle.c
===================================================================
--- linux-2.6.orig/kernel/irq/handle.c
+++ linux-2.6/kernel/irq/handle.c
@@ -90,6 +90,32 @@ static void init_kstat_irqs(struct irq_d
 		desc->kstat_irqs = (unsigned int *)ptr;
 }
 
+#ifdef CONFIG_MOVE_IRQ_DESC
+static void init_copy_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc,
+				 int cpu, int nr)
+{
+	unsigned long bytes;
+
+	init_kstat_irqs(desc, cpu, nr);
+
+	if (desc->kstat_irqs != old_desc->kstat_irqs) {
+		/* Compute how many bytes we need per irq and allocate them */
+		bytes = nr * sizeof(unsigned int);
+
+		memcpy(desc->kstat_irqs, old_desc->kstat_irqs, bytes);
+	}
+}
+
+static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
+{
+	if (old_desc->kstat_irqs == desc->kstat_irqs)
+		return;
+
+	kfree(old_desc->kstat_irqs);
+	old_desc->kstat_irqs = NULL;
+}
+#endif
+
 void __attribute__((weak)) arch_init_chip_data(struct irq_desc *desc, int cpu)
 {
 }
@@ -110,6 +136,23 @@ static void init_one_irq_desc(int irq, s
 	arch_init_chip_data(desc, cpu);
 }
 
+#ifdef CONFIG_MOVE_IRQ_DESC
+static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
+		 struct irq_desc *desc, int cpu)
+{
+	memcpy(desc, old_desc, sizeof(struct irq_desc));
+	desc->cpu = cpu;
+	lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+	init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids);
+	arch_init_copy_chip_data(old_desc, desc, cpu);
+}
+
+static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
+{
+	free_kstat_irqs(old_desc, desc);
+	arch_free_chip_data(old_desc, desc);
+}
+#endif
 /*
  * Protect the sparse_irqs:
  */
@@ -203,6 +246,73 @@ out_unlock:
 	return desc;
 }
 
+#ifdef CONFIG_MOVE_IRQ_DESC
+static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
+						int cpu)
+{
+	struct irq_desc *desc;
+	unsigned int irq;
+	unsigned long flags;
+	int node;
+
+	irq = old_desc->irq;
+
+	spin_lock_irqsave(&sparse_irq_lock, flags);
+
+	/* We have to check it to avoid races with another CPU */
+	desc = irq_desc_ptrs[irq];
+
+	if (desc && old_desc != desc)
+			goto out_unlock;
+
+	node = cpu_to_node(cpu);
+	desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
+	printk(KERN_DEBUG "  move irq_desc for %d to cpu %d node %d\n",
+		 irq, cpu, node);
+	if (!desc) {
+		printk(KERN_ERR "can not get new irq_desc for moving\n");
+		/* still use old one */
+		desc = old_desc;
+		goto out_unlock;
+	}
+	init_copy_one_irq_desc(irq, old_desc, desc, cpu);
+
+	irq_desc_ptrs[irq] = desc;
+
+	/* free the old one */
+	free_one_irq_desc(old_desc, desc);
+	kfree(old_desc);
+
+out_unlock:
+	spin_unlock_irqrestore(&sparse_irq_lock, flags);
+
+	return desc;
+}
+
+struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu)
+{
+	int old_cpu;
+	int node, old_node;
+
+	/* those are all static, do not move them */
+	if (desc->irq < NR_IRQS_LEGACY)
+		return desc;
+
+	old_cpu = desc->cpu;
+	printk(KERN_DEBUG "try to move irq_desc from cpu %d to %d\n", old_cpu, cpu);
+	if (old_cpu != cpu) {
+		node = cpu_to_node(cpu);
+		old_node = cpu_to_node(old_cpu);
+		if (old_node != node)
+			desc = __real_move_irq_desc(desc, cpu);
+		else
+			desc->cpu = cpu;
+	}
+
+	return desc;
+}
+#endif
+
 #else
 
 struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
@@ -337,8 +447,13 @@ unsigned int __do_IRQ(unsigned int irq)
 		/*
 		 * No locking required for CPU-local interrupts:
 		 */
-		if (desc->chip->ack)
+		if (desc->chip->ack) {
 			desc->chip->ack(irq);
+#ifdef CONFIG_MOVE_IRQ_DESC
+			/* get new one */
+			desc = irq_to_desc(irq);
+#endif
+		}
 		if (likely(!(desc->status & IRQ_DISABLED))) {
 			action_ret = handle_IRQ_event(irq, desc->action);
 			if (!noirqdebug)
@@ -349,8 +464,13 @@ unsigned int __do_IRQ(unsigned int irq)
 	}
 
 	spin_lock(&desc->lock);
-	if (desc->chip->ack)
+	if (desc->chip->ack) {
 		desc->chip->ack(irq);
+#ifdef CONFIG_MOVE_IRQ_DESC
+			/* get new one */
+		desc = irq_to_desc(irq);
+#endif
+	}
 	/*
 	 * REPLAY is when Linux resends an IRQ that was dropped earlier
 	 * WAITING is used by probe to mark irqs that are being tested
Index: linux-2.6/kernel/irq/chip.c
===================================================================
--- linux-2.6.orig/kernel/irq/chip.c
+++ linux-2.6/kernel/irq/chip.c
@@ -354,6 +354,10 @@ handle_level_irq(unsigned int irq, struc
 
 	spin_lock(&desc->lock);
 	mask_ack_irq(desc, irq);
+#ifdef CONFIG_MOVE_IRQ_DESC
+	/* get new one */
+	desc = irq_to_desc(irq);
+#endif
 
 	if (unlikely(desc->status & IRQ_INPROGRESS))
 		goto out_unlock;
@@ -431,6 +435,10 @@ handle_fasteoi_irq(unsigned int irq, str
 	desc->status &= ~IRQ_INPROGRESS;
 out:
 	desc->chip->eoi(irq);
+#ifdef CONFIG_MOVE_IRQ_DESC
+	/* get new one */
+	desc = irq_to_desc(irq);
+#endif
 
 	spin_unlock(&desc->lock);
 }
@@ -467,12 +475,20 @@ handle_edge_irq(unsigned int irq, struct
 		    !desc->action)) {
 		desc->status |= (IRQ_PENDING | IRQ_MASKED);
 		mask_ack_irq(desc, irq);
+#ifdef CONFIG_MOVE_IRQ_DESC
+		/* get new one */
+		desc = irq_to_desc(irq);
+#endif
 		goto out_unlock;
 	}
 	kstat_incr_irqs_this_cpu(irq, desc);
 
 	/* Start handling the irq */
 	desc->chip->ack(irq);
+#ifdef CONFIG_MOVE_IRQ_DESC
+	/* get new one */
+	desc = irq_to_desc(irq);
+#endif
 
 	/* Mark the IRQ currently in progress.*/
 	desc->status |= IRQ_INPROGRESS;
@@ -533,8 +549,13 @@ handle_percpu_irq(unsigned int irq, stru
 	if (!noirqdebug)
 		note_interrupt(irq, desc, action_ret);
 
-	if (desc->chip->eoi)
+	if (desc->chip->eoi) {
 		desc->chip->eoi(irq);
+#ifdef CONFIG_MOVE_IRQ_DESC
+		/* get new one */
+		desc = irq_to_desc(irq);
+#endif
+	}
 }
 
 void
@@ -569,8 +590,13 @@ __set_irq_handler(unsigned int irq, irq_
 
 	/* Uninstall? */
 	if (handle == handle_bad_irq) {
-		if (desc->chip != &no_irq_chip)
+		if (desc->chip != &no_irq_chip) {
 			mask_ack_irq(desc, irq);
+#ifdef CONFIG_MOVE_IRQ_DESC
+			/* get new one */
+			desc = irq_to_desc(irq);
+#endif
+		}
 		desc->status |= IRQ_DISABLED;
 		desc->depth = 1;
 	}

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 5/5] irq: move irq_desc according to smp_affinity v5
  2008-12-06  3:00 [PATCH 5/5] irq: move irq_desc according to smp_affinity v5 Yinghai Lu
@ 2008-12-08 13:42 ` Ingo Molnar
  2008-12-08 19:18   ` Yinghai Lu
  2008-12-08 22:07   ` [PATCH] irq: move irq_desc according to smp_affinity v6 Yinghai Lu
  0 siblings, 2 replies; 6+ messages in thread
From: Ingo Molnar @ 2008-12-08 13:42 UTC (permalink / raw)
  To: Yinghai Lu; +Cc: Thomas Gleixner, H. Peter Anvin, Andrew Morton, linux-kernel


* Yinghai Lu <yinghai@kernel.org> wrote:

> +#ifdef CONFIG_MOVE_IRQ_DESC
> +	/* get new one */
> +	desc = irq_to_desc(irq);
> +#endif
>  
>  	spin_unlock(&desc->lock);
>  }
> @@ -467,12 +475,20 @@ handle_edge_irq(unsigned int irq, struct
>  		    !desc->action)) {
>  		desc->status |= (IRQ_PENDING | IRQ_MASKED);
>  		mask_ack_irq(desc, irq);
> +#ifdef CONFIG_MOVE_IRQ_DESC
> +		/* get new one */
> +		desc = irq_to_desc(irq);
> +#endif
>  		goto out_unlock;
>  	}
>  	kstat_incr_irqs_this_cpu(irq, desc);
>  
>  	/* Start handling the irq */
>  	desc->chip->ack(irq);
> +#ifdef CONFIG_MOVE_IRQ_DESC
> +	/* get new one */
> +	desc = irq_to_desc(irq);
> +#endif
>  
>  	/* Mark the IRQ currently in progress.*/
>  	desc->status |= IRQ_INPROGRESS;
> @@ -533,8 +549,13 @@ handle_percpu_irq(unsigned int irq, stru
>  	if (!noirqdebug)
>  		note_interrupt(irq, desc, action_ret);
>  
> -	if (desc->chip->eoi)
> +	if (desc->chip->eoi) {
>  		desc->chip->eoi(irq);
> +#ifdef CONFIG_MOVE_IRQ_DESC
> +		/* get new one */
> +		desc = irq_to_desc(irq);
> +#endif
> +	}
>  }
>  
>  void
> @@ -569,8 +590,13 @@ __set_irq_handler(unsigned int irq, irq_
>  
>  	/* Uninstall? */
>  	if (handle == handle_bad_irq) {
> -		if (desc->chip != &no_irq_chip)
> +		if (desc->chip != &no_irq_chip) {
>  			mask_ack_irq(desc, irq);
> +#ifdef CONFIG_MOVE_IRQ_DESC
> +			/* get new one */
> +			desc = irq_to_desc(irq);
> +#endif

this patch adds a ton of #ifdefs to important .c files, which could all 
have been avoided by introducing a new method:

	desc = irq_remap_to_desc(irq, desc);

which would do something like:

 static struct irq_desc *
 irq_remap_to_desc(unsigned int irq, struct irq_desc *desc)
 {
 #ifdef CONFIG_MOVE_IRQ_DESC
	return irq_to_desc(irq);
 #else
	return desc;
 #endif
 }

right?

	Ingo

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH 5/5] irq: move irq_desc according to smp_affinity v5
  2008-12-08 13:42 ` Ingo Molnar
@ 2008-12-08 19:18   ` Yinghai Lu
  2008-12-08 22:07   ` [PATCH] irq: move irq_desc according to smp_affinity v6 Yinghai Lu
  1 sibling, 0 replies; 6+ messages in thread
From: Yinghai Lu @ 2008-12-08 19:18 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Thomas Gleixner, H. Peter Anvin, Andrew Morton, linux-kernel

On Mon, Dec 8, 2008 at 5:42 AM, Ingo Molnar <mingo@elte.hu> wrote:
>
>>                       mask_ack_irq(desc, irq);
>> +#ifdef CONFIG_MOVE_IRQ_DESC
>> +                     /* get new one */
>> +                     desc = irq_to_desc(irq);
>> +#endif
>
> this patch adds a ton of #ifdefs to important .c files, which could all
> have been avoided by introducing a new method:
>
>        desc = irq_remap_to_desc(irq, desc);
>
> which would do something like:
>
>  static struct irq_desc *
>  irq_remap_to_desc(unsigned int irq, struct irq_desc *desc)
>  {
>  #ifdef CONFIG_MOVE_IRQ_DESC
>        return irq_to_desc(irq);
>  #else
>        return desc;
>  #endif
>  }
>
> right?

yes. will work on it.

YH

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH] irq: move irq_desc according to smp_affinity v6
  2008-12-08 13:42 ` Ingo Molnar
  2008-12-08 19:18   ` Yinghai Lu
@ 2008-12-08 22:07   ` Yinghai Lu
  2008-12-09  3:41     ` Ingo Molnar
  1 sibling, 1 reply; 6+ messages in thread
From: Yinghai Lu @ 2008-12-08 22:07 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Thomas Gleixner, H. Peter Anvin, Andrew Morton, linux-kernel


impact: new feature move irq_desc with sparseirq

if CONFIG_MOVE_IRQ_DESC is set:
  make the irq_desc follow the irq's smp_affinity, i.e. move the irq_desc to the target cpu/node
  call move_irq_desc() in irq_complete_move()
  legacy irq_descs are not moved, because they are allocated from a static array

v3: add calling to irq_to_desc after calling ack/eoi instead of passing desc

v6: use irq_remap_to_desc to avoid some #ifdef according to Ingo

for logical apic mode, we need to add move_desc_in_progress_in_same_domain; otherwise the irq_desc will not get moved. ==> it could also need two phases to get the irq_desc moved.
	for example: if 0xff is the old affinity, we need to set it to 0xf first, and then set it to 0xf0.
	[ or do we need to change the domain definition to the cpus on the same node? ]

LBSuse:~ # cat /proc/irq/22/smp_affinity
00000000,00000000,00000000,000000ff
LBSuse:~ # echo f > /proc/irq/22/smp_affinity
LBSuse:~ # cat /proc/irq/22/smp_affinity
00000000,00000000,00000000,0000000f
LBSuse:~ # tail /var/log/messages
...
Oct 27 12:35:34 LBSuse kernel: klogd 1.4.1, log source = /proc/kmsg started.
Oct 27 12:35:34 LBSuse kernel: eth0: no IPv6 routers present
LBSuse:~ # echo f0 > /proc/irq/22/smp_affinity
LBSuse:~ # tail /var/log/messages
Oct 27 12:35:34 LBSuse kernel: klogd 1.4.1, log source = /proc/kmsg started.
Oct 27 12:35:34 LBSuse kernel: eth0: no IPv6 routers present
Oct 27 12:36:46 LBSuse kernel:   move irq_desc for 22 aka 0x16 to cpu 7 node 1
Oct 27 12:36:46 LBSuse kernel:   alloc kstat_irqs on cpu 7 node 1
Oct 27 12:36:46 LBSuse kernel:   alloc irq_cfg on cpu 7 node 1
Oct 27 12:36:46 LBSuse kernel:   alloc irq_2_pin on cpu 7 node 1

so we assume the user space program should first set /proc/irq/XX/smp_affinity to 03 or 0f at boot,
or should we change irq_default_affinity?

for physical apic mode it is much simpler:
on a 4-socket, 16-core system
the irq_desc is moved
when
# echo 10 > /proc/irq/134483967/smp_affinity
# echo 100 > /proc/irq/134483967/smp_affinity
# echo 1000 > /proc/irq/134483967/smp_affinity
got
Nov  9 21:39:51 LBSuse kernel:   move irq_desc for 134483967 aka 0x8040fff to cpu 4 node 1
Nov  9 21:39:51 LBSuse kernel:   alloc kstat_irqs on cpu 4 node 1
Nov  9 21:39:51 LBSuse kernel:   alloc irq_cfg on cpu 4 node 1
Nov  9 21:40:05 LBSuse kernel:   move irq_desc for 134483967 aka 0x8040fff to cpu 8 node 2
Nov  9 21:40:05 LBSuse kernel:   alloc kstat_irqs on cpu 8 node 2
Nov  9 21:40:05 LBSuse kernel:   alloc irq_cfg on cpu 8 node 2
Nov  9 21:40:18 LBSuse kernel:   move irq_desc for 134483967 aka 0x8040fff to cpu 12 node 3
Nov  9 21:40:18 LBSuse kernel:   alloc kstat_irqs on cpu 12 node 3
Nov  9 21:40:18 LBSuse kernel:   alloc irq_cfg on cpu 12 node 3

Signed-off-by: Yinghai Lu <yinghai@kernel.org>

---
 arch/x86/Kconfig          |    9 ++
 arch/x86/kernel/io_apic.c |  143 +++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/irq.h       |   10 +++
 kernel/irq/chip.c         |   12 +++
 kernel/irq/handle.c       |  119 +++++++++++++++++++++++++++++++++++++-
 5 files changed, 288 insertions(+), 5 deletions(-)

Index: linux-2.6/arch/x86/Kconfig
===================================================================
--- linux-2.6.orig/arch/x86/Kconfig
+++ linux-2.6/arch/x86/Kconfig
@@ -253,6 +253,15 @@ config SPARSE_IRQ
 
 	  If you don't know what to do here, say Y.
 
+config MOVE_IRQ_DESC
+	bool "Move irq desc when changing irq smp_affinity"
+	depends on SPARSE_IRQ && SMP
+	default y
+	help
+	  This enables moving the irq_desc to the cpu/node on which the irq will be handled.
+
+	  If you don't know what to do here, say Y.
+
 config X86_FIND_SMP_CONFIG
 	def_bool y
 	depends on X86_MPPARSE || X86_VOYAGER
Index: linux-2.6/arch/x86/kernel/io_apic.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/io_apic.c
+++ linux-2.6/arch/x86/kernel/io_apic.c
@@ -141,6 +141,9 @@ struct irq_cfg {
 	unsigned move_cleanup_count;
 	u8 vector;
 	u8 move_in_progress : 1;
+#ifdef CONFIG_MOVE_IRQ_DESC
+	u8 move_desc_in_progress_in_same_domain : 1;
+#endif
 };
 
 /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
@@ -223,6 +226,122 @@ void arch_init_chip_data(struct irq_desc
 	}
 }
 
+#ifdef CONFIG_MOVE_IRQ_DESC
+
+static void init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg,
+				 int cpu)
+{
+	struct irq_pin_list *old_entry, *head, *tail, *entry;
+
+	cfg->irq_2_pin = NULL;
+	old_entry = old_cfg->irq_2_pin;
+	if (!old_entry)
+		return;
+
+	entry = get_one_free_irq_2_pin(cpu);
+	if (!entry)
+		return;
+
+	entry->apic = old_entry->apic;
+	entry->pin = old_entry->pin;
+	head = entry;
+	tail = entry;
+	old_entry = old_entry->next;
+
+	while (old_entry) {
+		entry = get_one_free_irq_2_pin(cpu);
+		if (!entry) {
+			entry = head;
+			while (entry) {
+				head = entry->next;
+				kfree(entry);
+				entry = head;
+			}
+			/* still use the old one */
+			return;
+		}
+		entry->apic = old_entry->apic;
+		entry->pin = old_entry->pin;
+		tail->next = entry;
+		tail = entry;
+		old_entry = old_entry->next;
+	}
+
+	tail->next = NULL;
+	cfg->irq_2_pin = head;
+}
+
+static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg)
+{
+	struct irq_pin_list *entry, *next;
+
+	if (old_cfg->irq_2_pin == cfg->irq_2_pin)
+		return;
+
+	entry = old_cfg->irq_2_pin;
+
+	while (entry) {
+		next = entry->next;
+		kfree(entry);
+		entry = next;
+	}
+	old_cfg->irq_2_pin = NULL;
+}
+
+void arch_init_copy_chip_data(struct irq_desc *old_desc,
+				 struct irq_desc *desc, int cpu)
+{
+	struct irq_cfg *cfg;
+	struct irq_cfg *old_cfg;
+
+	cfg = get_one_free_irq_cfg(cpu);
+
+	if (!cfg)
+		return;
+
+	desc->chip_data = cfg;
+
+	old_cfg = old_desc->chip_data;
+
+	memcpy(cfg, old_cfg, sizeof(struct irq_cfg));
+
+	init_copy_irq_2_pin(old_cfg, cfg, cpu);
+}
+
+static void free_irq_cfg(struct irq_cfg *old_cfg)
+{
+	kfree(old_cfg);
+}
+
+void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
+{
+	struct irq_cfg *old_cfg, *cfg;
+
+	old_cfg = old_desc->chip_data;
+	cfg = desc->chip_data;
+
+	if (old_cfg == cfg)
+		return;
+
+	if (old_cfg) {
+		free_irq_2_pin(old_cfg, cfg);
+		free_irq_cfg(old_cfg);
+		old_desc->chip_data = NULL;
+	}
+}
+
+static void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
+{
+	struct irq_cfg *cfg = desc->chip_data;
+
+	if (!cfg->move_in_progress) {
+		/* it means that domain is not changed */
+		if (!cpus_intersects(desc->affinity, mask))
+			cfg->move_desc_in_progress_in_same_domain = 1;
+	}
+}
+#endif
+
 #else
 static struct irq_cfg *irq_cfg(unsigned int irq)
 {
@@ -231,9 +350,11 @@ static struct irq_cfg *irq_cfg(unsigned
 
 #endif
 
+#ifndef CONFIG_MOVE_IRQ_DESC
 static inline void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
 {
 }
+#endif
 
 struct io_apic {
 	unsigned int index;
@@ -2346,14 +2467,34 @@ static void irq_complete_move(struct irq
 	struct irq_cfg *cfg = desc->chip_data;
 	unsigned vector, me;
 
-	if (likely(!cfg->move_in_progress))
+	if (likely(!cfg->move_in_progress)) {
+#ifdef CONFIG_MOVE_IRQ_DESC
+		if (likely(!cfg->move_desc_in_progress_in_same_domain))
+			return;
+
+		/* domain is not change, but affinity is changed */
+		me = smp_processor_id();
+		if (cpu_isset(me, desc->affinity)) {
+			*descp = desc = move_irq_desc(desc, me);
+			/* get the new one */
+			cfg = desc->chip_data;
+			cfg->move_desc_in_progress_in_same_domain = 0;
+		}
+#endif
 		return;
+	}
 
 	vector = ~get_irq_regs()->orig_ax;
 	me = smp_processor_id();
 	if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
 		cpumask_t cleanup_mask;
 
+#ifdef CONFIG_MOVE_IRQ_DESC
+		*descp = desc = move_irq_desc(desc, me);
+		/* get the new one */
+		cfg = desc->chip_data;
+#endif
+
 		cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
 		cfg->move_cleanup_count = cpus_weight(cleanup_mask);
 		send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
Index: linux-2.6/kernel/irq/handle.c
===================================================================
--- linux-2.6.orig/kernel/irq/handle.c
+++ linux-2.6/kernel/irq/handle.c
@@ -90,6 +90,32 @@ static void init_kstat_irqs(struct irq_d
 		desc->kstat_irqs = (unsigned int *)ptr;
 }
 
+#ifdef CONFIG_MOVE_IRQ_DESC
+static void init_copy_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc,
+				 int cpu, int nr)
+{
+	unsigned long bytes;
+
+	init_kstat_irqs(desc, cpu, nr);
+
+	if (desc->kstat_irqs != old_desc->kstat_irqs) {
+		/* Compute how many bytes we need per irq and allocate them */
+		bytes = nr * sizeof(unsigned int);
+
+		memcpy(desc->kstat_irqs, old_desc->kstat_irqs, bytes);
+	}
+}
+
+static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
+{
+	if (old_desc->kstat_irqs == desc->kstat_irqs)
+		return;
+
+	kfree(old_desc->kstat_irqs);
+	old_desc->kstat_irqs = NULL;
+}
+#endif
+
 void __attribute__((weak)) arch_init_chip_data(struct irq_desc *desc, int cpu)
 {
 }
@@ -110,6 +136,23 @@ static void init_one_irq_desc(int irq, s
 	arch_init_chip_data(desc, cpu);
 }
 
+#ifdef CONFIG_MOVE_IRQ_DESC
+static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
+		 struct irq_desc *desc, int cpu)
+{
+	memcpy(desc, old_desc, sizeof(struct irq_desc));
+	desc->cpu = cpu;
+	lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+	init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids);
+	arch_init_copy_chip_data(old_desc, desc, cpu);
+}
+
+static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
+{
+	free_kstat_irqs(old_desc, desc);
+	arch_free_chip_data(old_desc, desc);
+}
+#endif
 /*
  * Protect the sparse_irqs:
  */
@@ -203,6 +246,73 @@ out_unlock:
 	return desc;
 }
 
+#ifdef CONFIG_MOVE_IRQ_DESC
+static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
+						int cpu)
+{
+	struct irq_desc *desc;
+	unsigned int irq;
+	unsigned long flags;
+	int node;
+
+	irq = old_desc->irq;
+
+	spin_lock_irqsave(&sparse_irq_lock, flags);
+
+	/* We have to check it to avoid races with another CPU */
+	desc = irq_desc_ptrs[irq];
+
+	if (desc && old_desc != desc)
+			goto out_unlock;
+
+	node = cpu_to_node(cpu);
+	desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
+	printk(KERN_DEBUG "  move irq_desc for %d to cpu %d node %d\n",
+		 irq, cpu, node);
+	if (!desc) {
+		printk(KERN_ERR "can not get new irq_desc for moving\n");
+		/* still use old one */
+		desc = old_desc;
+		goto out_unlock;
+	}
+	init_copy_one_irq_desc(irq, old_desc, desc, cpu);
+
+	irq_desc_ptrs[irq] = desc;
+
+	/* free the old one */
+	free_one_irq_desc(old_desc, desc);
+	kfree(old_desc);
+
+out_unlock:
+	spin_unlock_irqrestore(&sparse_irq_lock, flags);
+
+	return desc;
+}
+
+struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu)
+{
+	int old_cpu;
+	int node, old_node;
+
+	/* those are all static, do not move them */
+	if (desc->irq < NR_IRQS_LEGACY)
+		return desc;
+
+	old_cpu = desc->cpu;
+	printk(KERN_DEBUG "try to move irq_desc from cpu %d to %d\n", old_cpu, cpu);
+	if (old_cpu != cpu) {
+		node = cpu_to_node(cpu);
+		old_node = cpu_to_node(old_cpu);
+		if (old_node != node)
+			desc = __real_move_irq_desc(desc, cpu);
+		else
+			desc->cpu = cpu;
+	}
+
+	return desc;
+}
+#endif
+
 #else
 
 struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
@@ -337,8 +447,11 @@ unsigned int __do_IRQ(unsigned int irq)
 		/*
 		 * No locking required for CPU-local interrupts:
 		 */
-		if (desc->chip->ack)
+		if (desc->chip->ack) {
 			desc->chip->ack(irq);
+			/* get new one */
+			desc = irq_remap_to_desc(irq, desc);
+		}
 		if (likely(!(desc->status & IRQ_DISABLED))) {
 			action_ret = handle_IRQ_event(irq, desc->action);
 			if (!noirqdebug)
@@ -349,8 +462,10 @@ unsigned int __do_IRQ(unsigned int irq)
 	}
 
 	spin_lock(&desc->lock);
-	if (desc->chip->ack)
+	if (desc->chip->ack) {
 		desc->chip->ack(irq);
+		desc = irq_remap_to_desc(irq, desc);
+	}
 	/*
 	 * REPLAY is when Linux resends an IRQ that was dropped earlier
 	 * WAITING is used by probe to mark irqs that are being tested
Index: linux-2.6/kernel/irq/chip.c
===================================================================
--- linux-2.6.orig/kernel/irq/chip.c
+++ linux-2.6/kernel/irq/chip.c
@@ -354,6 +354,7 @@ handle_level_irq(unsigned int irq, struc
 
 	spin_lock(&desc->lock);
 	mask_ack_irq(desc, irq);
+	desc = irq_remap_to_desc(irq, desc);
 
 	if (unlikely(desc->status & IRQ_INPROGRESS))
 		goto out_unlock;
@@ -431,6 +432,7 @@ handle_fasteoi_irq(unsigned int irq, str
 	desc->status &= ~IRQ_INPROGRESS;
 out:
 	desc->chip->eoi(irq);
+	desc = irq_remap_to_desc(irq, desc);
 
 	spin_unlock(&desc->lock);
 }
@@ -467,12 +469,14 @@ handle_edge_irq(unsigned int irq, struct
 		    !desc->action)) {
 		desc->status |= (IRQ_PENDING | IRQ_MASKED);
 		mask_ack_irq(desc, irq);
+		desc = irq_remap_to_desc(irq, desc);
 		goto out_unlock;
 	}
 	kstat_incr_irqs_this_cpu(irq, desc);
 
 	/* Start handling the irq */
 	desc->chip->ack(irq);
+	desc = irq_remap_to_desc(irq, desc);
 
 	/* Mark the IRQ currently in progress.*/
 	desc->status |= IRQ_INPROGRESS;
@@ -533,8 +537,10 @@ handle_percpu_irq(unsigned int irq, stru
 	if (!noirqdebug)
 		note_interrupt(irq, desc, action_ret);
 
-	if (desc->chip->eoi)
+	if (desc->chip->eoi) {
 		desc->chip->eoi(irq);
+		desc = irq_remap_to_desc(irq, desc);
+	}
 }
 
 void
@@ -569,8 +575,10 @@ __set_irq_handler(unsigned int irq, irq_
 
 	/* Uninstall? */
 	if (handle == handle_bad_irq) {
-		if (desc->chip != &no_irq_chip)
+		if (desc->chip != &no_irq_chip) {
 			mask_ack_irq(desc, irq);
+			desc = irq_remap_to_desc(irq, desc);
+		}
 		desc->status |= IRQ_DISABLED;
 		desc->depth = 1;
 	}
Index: linux-2.6/include/linux/irq.h
===================================================================
--- linux-2.6.orig/include/linux/irq.h
+++ linux-2.6/include/linux/irq.h
@@ -227,6 +227,16 @@ extern struct irq_desc *move_irq_desc(st
 
 #endif
 
+static inline struct irq_desc *
+irq_remap_to_desc(unsigned int irq, struct irq_desc *desc)
+{
+#ifdef CONFIG_MOVE_IRQ_DESC
+	return irq_to_desc(irq);
+#else
+	return desc;
+#endif
+}
+
 /*
  * Migration helpers for obsolete names, they will go away:
  */


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] irq: move irq_desc according to smp_affinity v6
  2008-12-08 22:07   ` [PATCH] irq: move irq_desc according to smp_affinity v6 Yinghai Lu
@ 2008-12-09  3:41     ` Ingo Molnar
  2008-12-11  8:15       ` [PATCH] irq: move irq_desc according to smp_affinity v7 Yinghai Lu
  0 siblings, 1 reply; 6+ messages in thread
From: Ingo Molnar @ 2008-12-09  3:41 UTC (permalink / raw)
  To: Yinghai Lu; +Cc: Thomas Gleixner, H. Peter Anvin, Andrew Morton, linux-kernel


* Yinghai Lu <yinghai@kernel.org> wrote:

> for physical apic is much simple
> on 4 sockets 16 cores system
> irq_desc is moving..
> when
> # echo 10 > /proc/irq/134483967/smp_affinity
> # echo 100 > /proc/irq/134483967/smp_affinity
> # echo 1000 > /proc/irq/134483967/smp_affinity
> got
> Nov  9 21:39:51 LBSuse kernel:   move irq_desc for 134483967 aka 0x8040fff to cpu 4 node 1
> Nov  9 21:39:51 LBSuse kernel:   alloc kstat_irqs on cpu 4 node 1
> Nov  9 21:39:51 LBSuse kernel:   alloc irq_cfg on cpu 4 node 1
> Nov  9 21:40:05 LBSuse kernel:   move irq_desc for 134483967 aka 0x8040fff to cpu 8 node 2
> Nov  9 21:40:05 LBSuse kernel:   alloc kstat_irqs on cpu 8 node 2
> Nov  9 21:40:05 LBSuse kernel:   alloc irq_cfg on cpu 8 node 2
> Nov  9 21:40:18 LBSuse kernel:   move irq_desc for 134483967 aka 0x8040fff to cpu 12 node 3
> Nov  9 21:40:18 LBSuse kernel:   alloc kstat_irqs on cpu 12 node 3
> Nov  9 21:40:18 LBSuse kernel:   alloc irq_cfg on cpu 12 node 3
> 
> Signed-off-by: Yinghai Lu <yinghai@kernel.org>

Neat feature!

i'm wondering, have you tried to characterise the cost savings of moving 
the irq desc? It will certainly save three heavy cross-NUMA cachemisses 
on x86 per rare irq source.

A way to attempt to measure this would be to write some quick debug hack 
that prints the cycle count of one specific IRQ source, in do_IRQ(), from 
the entry of do_IRQ() to the exit of do_IRQ(), using rdtscl(). Pick an 
IRQ that you can trigger arbitrarily, and printk the cycle cost at the 
end of do_IRQ(). [if irq == your_debug_irq - otherwise you can get a lot 
of printks and not too good measurements].

plus perhaps add some quick hack that makes the 
irq_desc/chip_data/kstat_irqs migration dependent on a sysctl, such as 
'panic_timeout' (tunable via 'echo 1 > /proc/sys/kernel/panic'). Then you 
could try to trigger your debug IRQ and the cycle cost printk in two 
modes:

  echo 0 > /proc/sys/kernel/panic

  [ migrate the IRQ to another domain and trigger the IRQ - wait for the 
    cycle printout. Both cache-cold and cache-hot numbers are 
    interesting. ]

  echo 1 > /proc/sys/kernel/panic

  [ re-migrate the debug IRQ via /proc/irq/*/smp_affinity to make sure 
    it's NUMA-local, then trigger the debug IRQ and record cache-cold and 
    cache-hot cycle counts. ]

it's hard to measure this reliably, as on x86 the numa factor is usually 
pretty low, so the local versus remote cachemiss cost is hard to 
separate.

A few comments about the patch too:

> +config MOVE_IRQ_DESC
> +	bool "Move irq desc when changing irq smp_affinity"
> +	depends on SPARSE_IRQ && SMP
> +	default y

new feature - should be default-no.

> +	help
> +	  This enables moving irq_desc to cpu/node that irq will use handled.
> +
> +	  If you don't know what to do here, say Y.

Later on i think we should just select this in the NUMA case, instead of 
complicating the user's selection. It's OK to have it configurable now - 
should it cause problems.

> +
>  config X86_FIND_SMP_CONFIG
>  	def_bool y
>  	depends on X86_MPPARSE || X86_VOYAGER
> Index: linux-2.6/arch/x86/kernel/io_apic.c
> ===================================================================
> --- linux-2.6.orig/arch/x86/kernel/io_apic.c
> +++ linux-2.6/arch/x86/kernel/io_apic.c
> @@ -141,6 +141,9 @@ struct irq_cfg {
>  	unsigned move_cleanup_count;
>  	u8 vector;
>  	u8 move_in_progress : 1;
> +#ifdef CONFIG_MOVE_IRQ_DESC
> +	u8 move_desc_in_progress_in_same_domain : 1;
> +#endif

way too long field name - please rename to move_desc_pending or so.

> @@ -223,6 +226,122 @@ void arch_init_chip_data(struct irq_desc
>  	}
>  }
>  
> +#ifdef CONFIG_MOVE_IRQ_DESC
> +
> +static void init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg,
> +				 int cpu)
> +{

small style nit, it's a tiny bit tidier to break the line the following 
way:

static void
init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)

[ as this way we have all the parameters on a single line, and the return 
  type stands out on a separate line. ]

> +	struct irq_pin_list *old_entry, *head, *tail, *entry;
> +
> +	cfg->irq_2_pin = NULL;
> +	old_entry = old_cfg->irq_2_pin;
> +	if (!old_entry)
> +		return;
> +
> +	entry = get_one_free_irq_2_pin(cpu);
> +	if (!entry)
> +		return;
> +
> +	entry->apic = old_entry->apic;
> +	entry->pin = old_entry->pin;
> +	head = entry;
> +	tail = entry;
> +	old_entry = old_entry->next;

for mass-initialization please try to structure it a bit:

> +	entry->apic	= old_entry->apic;
> +	entry->pin	= old_entry->pin;
> +	head		= entry;
> +	tail		= entry;
> +
> +	old_entry	= old_entry->next;

it's much easier to validate such constructs. For example, once 
vertically aligned, i immediately saw an oddity in it - why is 
'old_entry' initialized twice?

> +
> +	while (old_entry) {
> +		entry = get_one_free_irq_2_pin(cpu);
> +		if (!entry) {
> +			entry = head;
> +			while (entry) {
> +				head = entry->next;
> +				kfree(entry);
> +				entry = head;
> +			}
> +			/* still use the old one */
> +			return;
> +		}

same here:

> +		entry->apic = old_entry->apic;
> +		entry->pin = old_entry->pin;
> +		tail->next = entry;
> +		tail = entry;
> +		old_entry = old_entry->next;
> +	}
> +
> +	tail->next = NULL;
> +	cfg->irq_2_pin = head;
> +}
> +
> +static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg)
> +{
> +	struct irq_pin_list *entry, *next;
> +
> +	if (old_cfg->irq_2_pin == cfg->irq_2_pin)
> +		return;
> +
> +	entry = old_cfg->irq_2_pin;
> +
> +	while (entry) {
> +		next = entry->next;
> +		kfree(entry);
> +		entry = next;
> +	}
> +	old_cfg->irq_2_pin = NULL;
> +}
> +
> +void arch_init_copy_chip_data(struct irq_desc *old_desc,
> +				 struct irq_desc *desc, int cpu)
> +{
> +	struct irq_cfg *cfg;
> +	struct irq_cfg *old_cfg;
> +
> +	cfg = get_one_free_irq_cfg(cpu);
> +
> +	if (!cfg)
> +		return;
> +
> +	desc->chip_data = cfg;
> +
> +	old_cfg = old_desc->chip_data;
> +
> +	memcpy(cfg, old_cfg, sizeof(struct irq_cfg));
> +
> +	init_copy_irq_2_pin(old_cfg, cfg, cpu);
> +}
> +
> +static void free_irq_cfg(struct irq_cfg *old_cfg)
> +{
> +	kfree(old_cfg);
> +}
> +
> +void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
> +{
> +	struct irq_cfg *old_cfg, *cfg;
> +
> +	old_cfg = old_desc->chip_data;
> +	cfg = desc->chip_data;
> +
> +	if (old_cfg == cfg)
> +		return;
> +
> +	if (old_cfg) {
> +		free_irq_2_pin(old_cfg, cfg);
> +		free_irq_cfg(old_cfg);
> +		old_desc->chip_data = NULL;
> +	}
> +}
> +
> +static void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
> +{
> +	struct irq_cfg *cfg = desc->chip_data;
> +
> +	if (!cfg->move_in_progress) {
> +		/* it means that domain is not changed */
> +		if (!cpus_intersects(desc->affinity, mask))
> +			cfg->move_desc_in_progress_in_same_domain = 1;
> +	}
> +}
> +#endif
> +
>  #else
>  static struct irq_cfg *irq_cfg(unsigned int irq)
>  {
> @@ -231,9 +350,11 @@ static struct irq_cfg *irq_cfg(unsigned
>  
>  #endif
>  
> +#ifndef CONFIG_MOVE_IRQ_DESC
>  static inline void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
>  {
>  }
> +#endif
>  
>  struct io_apic {
>  	unsigned int index;
> @@ -2346,14 +2467,34 @@ static void irq_complete_move(struct irq
>  	struct irq_cfg *cfg = desc->chip_data;
>  	unsigned vector, me;
>  
> -	if (likely(!cfg->move_in_progress))
> +	if (likely(!cfg->move_in_progress)) {
> +#ifdef CONFIG_MOVE_IRQ_DESC
> +		if (likely(!cfg->move_desc_in_progress_in_same_domain))
> +			return;
> +
> +		/* domain is not change, but affinity is changed */
> +		me = smp_processor_id();
> +		if (cpu_isset(me, desc->affinity)) {
> +			*descp = desc = move_irq_desc(desc, me);
> +			/* get the new one */
> +			cfg = desc->chip_data;
> +			cfg->move_desc_in_progress_in_same_domain = 0;
> +		}
> +#endif
>  		return;
> +	}
>  
>  	vector = ~get_irq_regs()->orig_ax;
>  	me = smp_processor_id();
>  	if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
>  		cpumask_t cleanup_mask;
>  
> +#ifdef CONFIG_MOVE_IRQ_DESC
> +		*descp = desc = move_irq_desc(desc, me);
> +		/* get the new one */
> +		cfg = desc->chip_data;
> +#endif
> +
>  		cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
>  		cfg->move_cleanup_count = cpus_weight(cleanup_mask);
>  		send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
> Index: linux-2.6/kernel/irq/handle.c
> ===================================================================
> --- linux-2.6.orig/kernel/irq/handle.c
> +++ linux-2.6/kernel/irq/handle.c
> @@ -90,6 +90,32 @@ static void init_kstat_irqs(struct irq_d
>  		desc->kstat_irqs = (unsigned int *)ptr;
>  }
>  
> +#ifdef CONFIG_MOVE_IRQ_DESC
> +static void init_copy_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc,
> +				 int cpu, int nr)
> +{
> +	unsigned long bytes;
> +
> +	init_kstat_irqs(desc, cpu, nr);
> +
> +	if (desc->kstat_irqs != old_desc->kstat_irqs) {
> +		/* Compute how many bytes we need per irq and allocate them */
> +		bytes = nr * sizeof(unsigned int);
> +
> +		memcpy(desc->kstat_irqs, old_desc->kstat_irqs, bytes);
> +	}
> +}
> +
> +static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
> +{
> +	if (old_desc->kstat_irqs == desc->kstat_irqs)
> +		return;
> +
> +	kfree(old_desc->kstat_irqs);
> +	old_desc->kstat_irqs = NULL;
> +}
> +#endif
> +
>  void __attribute__((weak)) arch_init_chip_data(struct irq_desc *desc, int cpu)
>  {
>  }
> @@ -110,6 +136,23 @@ static void init_one_irq_desc(int irq, s
>  	arch_init_chip_data(desc, cpu);
>  }
>  
> +#ifdef CONFIG_MOVE_IRQ_DESC
> +static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
> +		 struct irq_desc *desc, int cpu)
> +{
> +	memcpy(desc, old_desc, sizeof(struct irq_desc));
> +	desc->cpu = cpu;
> +	lockdep_set_class(&desc->lock, &irq_desc_lock_class);
> +	init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids);
> +	arch_init_copy_chip_data(old_desc, desc, cpu);
> +}
> +
> +static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
> +{
> +	free_kstat_irqs(old_desc, desc);
> +	arch_free_chip_data(old_desc, desc);
> +}
> +#endif
>  /*
>   * Protect the sparse_irqs:
>   */
> @@ -203,6 +246,73 @@ out_unlock:
>  	return desc;
>  }
>  
> +#ifdef CONFIG_MOVE_IRQ_DESC
> +static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
> +						int cpu)
> +{
> +	struct irq_desc *desc;
> +	unsigned int irq;
> +	unsigned long flags;
> +	int node;
> +
> +	irq = old_desc->irq;
> +
> +	spin_lock_irqsave(&sparse_irq_lock, flags);
> +
> +	/* We have to check it to avoid races with another CPU */
> +	desc = irq_desc_ptrs[irq];
> +
> +	if (desc && old_desc != desc)
> +			goto out_unlock;
> +
> +	node = cpu_to_node(cpu);
> +	desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
> +	printk(KERN_DEBUG "  move irq_desc for %d to cpu %d node %d\n",
> +		 irq, cpu, node);
> +	if (!desc) {
> +		printk(KERN_ERR "can not get new irq_desc for moving\n");
> +		/* still use old one */
> +		desc = old_desc;
> +		goto out_unlock;
> +	}
> +	init_copy_one_irq_desc(irq, old_desc, desc, cpu);
> +
> +	irq_desc_ptrs[irq] = desc;
> +
> +	/* free the old one */
> +	free_one_irq_desc(old_desc, desc);
> +	kfree(old_desc);
> +
> +out_unlock:
> +	spin_unlock_irqrestore(&sparse_irq_lock, flags);
> +
> +	return desc;
> +}
> +
> +struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu)
> +{
> +	int old_cpu;
> +	int node, old_node;
> +
> +	/* those all static, do move them */
> +	if (desc->irq < NR_IRQS_LEGACY)
> +		return desc;
> +
> +	old_cpu = desc->cpu;
> +	printk(KERN_DEBUG "try to move irq_desc from cpu %d to %d\n", old_cpu, cpu);
> +	if (old_cpu != cpu) {
> +		node = cpu_to_node(cpu);
> +		old_node = cpu_to_node(old_cpu);
> +		if (old_node != node)
> +			desc = __real_move_irq_desc(desc, cpu);
> +		else
> +			desc->cpu = cpu;
> +	}
> +
> +	return desc;
> +}
> +#endif

Still a bit too much of #ifdeffery for my taste in kernel/irq/*.c, we 
tend to have higher maintenance costs in files that have a lot of
#ifdefs.

Wouldn't it look neater if you introduced a new kernel/irq/numa_migrate.c 
file that would provide these methods, with the prototypes being
#ifdef-ed to inlines in the !CONFIG_MOVE_IRQ_DESC case in
kernel/irq/internals.h?

i'd also suggest to rename the config option to the more descriptive: 
CONFIG_NUMA_MIGRATE_IRQ_DESC name.

>  		/*
>  		 * No locking required for CPU-local interrupts:
>  		 */
> -		if (desc->chip->ack)
> +		if (desc->chip->ack) {
>  			desc->chip->ack(irq);
> +			/* get new one */
> +			desc = irq_remap_to_desc(irq, desc);
> +		}

thanks for fixing this - it looks much nicer now!

	Ingo

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH] irq: move irq_desc according to smp_affinity v7
  2008-12-09  3:41     ` Ingo Molnar
@ 2008-12-11  8:15       ` Yinghai Lu
  0 siblings, 0 replies; 6+ messages in thread
From: Yinghai Lu @ 2008-12-11  8:15 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: Thomas Gleixner, H. Peter Anvin, Andrew Morton, linux-kernel


impact: new feature move irq_desc with sparseirq

if CONFIG_NUMA_MIGRATE_IRQ_DESC is set
  make irq_desc to go with affinity aka irq_desc moving etc
  call move_irq_desc in irq_complete_move()
  legacy irq_desc is not moved, because they are allocated via static array

v3: add calling to irq_to_desc after calling ack/eoi instead of passing desc

v6: use irq_remap_to_desc to avoid some #ifdef according to Ingo

for logical apic mode, need to add move_desc_pending, otherwise it will not get moved. ==> it could also need two phases to get irq_desc moved.
	for example: 0xff is old affinity, and need to set 0xf, and then set to 0xf0.
	[ or we need to change domain definition to cpus on the same node ? ]

LBSuse:~ # cat /proc/irq/22/smp_affinity
00000000,00000000,00000000,000000ff
LBSuse:~ # echo f > /proc/irq/22/smp_affinity
LBSuse:~ # cat /proc/irq/22/smp_affinity
00000000,00000000,00000000,0000000f
LBSuse:~ # tail /var/log/messages
...
Oct 27 12:35:34 LBSuse kernel: klogd 1.4.1, log source = /proc/kmsg started.
Oct 27 12:35:34 LBSuse kernel: eth0: no IPv6 routers present
LBSuse:~ # echo f0 > /proc/irq/22/smp_affinity
LBSuse:~ # tail /var/log/messages
Oct 27 12:35:34 LBSuse kernel: klogd 1.4.1, log source = /proc/kmsg started.
Oct 27 12:35:34 LBSuse kernel: eth0: no IPv6 routers present
Oct 27 12:36:46 LBSuse kernel:   move irq_desc for 22 aka 0x16 to cpu 7 node 1
Oct 27 12:36:46 LBSuse kernel:   alloc kstat_irqs on cpu 7 node 1
Oct 27 12:36:46 LBSuse kernel:   alloc irq_cfg on cpu 7 node 1
Oct 27 12:36:46 LBSuse kernel:   alloc irq_2_pin on cpu 7 node 1

so assume the user space program should update /proc/irq/XX/smp_affinity to 03 or 0f at first on boot
or we change irq_default_affinity ?

for physical apic mode it is much simpler
on 4 sockets 16 cores system
irq_desc is moving..
when
# echo 10 > /proc/irq/134483967/smp_affinity
# echo 100 > /proc/irq/134483967/smp_affinity
# echo 1000 > /proc/irq/134483967/smp_affinity
got
Nov  9 21:39:51 LBSuse kernel:   move irq_desc for 134483967 aka 0x8040fff to cpu 4 node 1
Nov  9 21:39:51 LBSuse kernel:   alloc kstat_irqs on cpu 4 node 1
Nov  9 21:39:51 LBSuse kernel:   alloc irq_cfg on cpu 4 node 1
Nov  9 21:40:05 LBSuse kernel:   move irq_desc for 134483967 aka 0x8040fff to cpu 8 node 2
Nov  9 21:40:05 LBSuse kernel:   alloc kstat_irqs on cpu 8 node 2
Nov  9 21:40:05 LBSuse kernel:   alloc irq_cfg on cpu 8 node 2
Nov  9 21:40:18 LBSuse kernel:   move irq_desc for 134483967 aka 0x8040fff to cpu 12 node 3
Nov  9 21:40:18 LBSuse kernel:   alloc kstat_irqs on cpu 12 node 3
Nov  9 21:40:18 LBSuse kernel:   alloc irq_cfg on cpu 12 node 3

Signed-off-by: Yinghai Lu <yinghai@kernel.org>

---
 arch/x86/Kconfig          |    9 ++
 arch/x86/kernel/io_apic.c |  142 +++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/irq.h       |   10 +++
 kernel/irq/Makefile       |    1 
 kernel/irq/chip.c         |   12 +++
 kernel/irq/handle.c       |   15 +++-
 kernel/irq/internals.h    |    5 +
 kernel/irq/numa_migrate.c |  125 ++++++++++++++++++++++++++++++++++++++++
 8 files changed, 311 insertions(+), 8 deletions(-)

Index: linux-2.6/arch/x86/Kconfig
===================================================================
--- linux-2.6.orig/arch/x86/Kconfig
+++ linux-2.6/arch/x86/Kconfig
@@ -253,6 +253,15 @@ config SPARSE_IRQ
 
 	  If you don't know what to do here, say Y.
 
+config NUMA_MIGRATE_IRQ_DESC
+	bool "Move irq desc when changing irq smp_affinity"
+	depends on SPARSE_IRQ && SMP
+	default n
+	help
+	  This enables moving irq_desc to the cpu/node that will handle the irq.
+
+	  If you don't know what to do here, say N.
+
 config X86_FIND_SMP_CONFIG
 	def_bool y
 	depends on X86_MPPARSE || X86_VOYAGER
Index: linux-2.6/arch/x86/kernel/io_apic.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/io_apic.c
+++ linux-2.6/arch/x86/kernel/io_apic.c
@@ -141,6 +141,9 @@ struct irq_cfg {
 	unsigned move_cleanup_count;
 	u8 vector;
 	u8 move_in_progress : 1;
+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
+	u8 move_desc_pending : 1;
+#endif
 };
 
 /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
@@ -223,6 +226,121 @@ void arch_init_chip_data(struct irq_desc
 	}
 }
 
+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
+
+static void
+init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
+{
+	struct irq_pin_list *old_entry, *head, *tail, *entry;
+
+	cfg->irq_2_pin = NULL;
+	old_entry = old_cfg->irq_2_pin;
+	if (!old_entry)
+		return;
+
+	entry = get_one_free_irq_2_pin(cpu);
+	if (!entry)
+		return;
+
+	entry->apic	= old_entry->apic;
+	entry->pin	= old_entry->pin;
+	head		= entry;
+	tail		= entry;
+	old_entry	= old_entry->next;
+	while (old_entry) {
+		entry = get_one_free_irq_2_pin(cpu);
+		if (!entry) {
+			entry = head;
+			while (entry) {
+				head = entry->next;
+				kfree(entry);
+				entry = head;
+			}
+			/* still use the old one */
+			return;
+		}
+		entry->apic	= old_entry->apic;
+		entry->pin	= old_entry->pin;
+		tail->next	= entry;
+		tail		= entry;
+		old_entry	= old_entry->next;
+	}
+
+	tail->next = NULL;
+	cfg->irq_2_pin = head;
+}
+
+static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg)
+{
+	struct irq_pin_list *entry, *next;
+
+	if (old_cfg->irq_2_pin == cfg->irq_2_pin)
+		return;
+
+	entry = old_cfg->irq_2_pin;
+
+	while (entry) {
+		next = entry->next;
+		kfree(entry);
+		entry = next;
+	}
+	old_cfg->irq_2_pin = NULL;
+}
+
+void arch_init_copy_chip_data(struct irq_desc *old_desc,
+				 struct irq_desc *desc, int cpu)
+{
+	struct irq_cfg *cfg;
+	struct irq_cfg *old_cfg;
+
+	cfg = get_one_free_irq_cfg(cpu);
+
+	if (!cfg)
+		return;
+
+	desc->chip_data = cfg;
+
+	old_cfg = old_desc->chip_data;
+
+	memcpy(cfg, old_cfg, sizeof(struct irq_cfg));
+
+	init_copy_irq_2_pin(old_cfg, cfg, cpu);
+}
+
+static void free_irq_cfg(struct irq_cfg *old_cfg)
+{
+	kfree(old_cfg);
+}
+
+void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
+{
+	struct irq_cfg *old_cfg, *cfg;
+
+	old_cfg = old_desc->chip_data;
+	cfg = desc->chip_data;
+
+	if (old_cfg == cfg)
+		return;
+
+	if (old_cfg) {
+		free_irq_2_pin(old_cfg, cfg);
+		free_irq_cfg(old_cfg);
+		old_desc->chip_data = NULL;
+	}
+}
+
+static void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
+{
+	struct irq_cfg *cfg = desc->chip_data;
+
+	if (!cfg->move_in_progress) {
+		/* it means that domain is not changed */
+		if (!cpus_intersects(desc->affinity, mask))
+			cfg->move_desc_pending = 1;
+	}
+}
+#endif
+
 #else
 static struct irq_cfg *irq_cfg(unsigned int irq)
 {
@@ -231,9 +349,11 @@ static struct irq_cfg *irq_cfg(unsigned
 
 #endif
 
+#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC
 static inline void set_extra_move_desc(struct irq_desc *desc, cpumask_t mask)
 {
 }
+#endif
 
 struct io_apic {
 	unsigned int index;
@@ -2346,14 +2466,34 @@ static void irq_complete_move(struct irq
 	struct irq_cfg *cfg = desc->chip_data;
 	unsigned vector, me;
 
-	if (likely(!cfg->move_in_progress))
+	if (likely(!cfg->move_in_progress)) {
+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
+		if (likely(!cfg->move_desc_pending))
+			return;
+
+		/* domain has not changed, but affinity has changed */
+		me = smp_processor_id();
+		if (cpu_isset(me, desc->affinity)) {
+			*descp = desc = move_irq_desc(desc, me);
+			/* get the new one */
+			cfg = desc->chip_data;
+			cfg->move_desc_pending = 0;
+		}
+#endif
 		return;
+	}
 
 	vector = ~get_irq_regs()->orig_ax;
 	me = smp_processor_id();
 	if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) {
 		cpumask_t cleanup_mask;
 
+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
+		*descp = desc = move_irq_desc(desc, me);
+		/* get the new one */
+		cfg = desc->chip_data;
+#endif
+
 		cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
 		cfg->move_cleanup_count = cpus_weight(cleanup_mask);
 		send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
Index: linux-2.6/kernel/irq/handle.c
===================================================================
--- linux-2.6.orig/kernel/irq/handle.c
+++ linux-2.6/kernel/irq/handle.c
@@ -23,7 +23,7 @@
 /*
  * lockdep: we want to handle all irq_desc locks as a single lock-class:
  */
-static struct lock_class_key irq_desc_lock_class;
+struct lock_class_key irq_desc_lock_class;
 
 /**
  * handle_bad_irq - handle spurious and unhandled irqs
@@ -73,7 +73,7 @@ static struct irq_desc irq_desc_init = {
 #endif
 };
 
-static void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
+void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
 {
 	unsigned long bytes;
 	char *ptr;
@@ -113,7 +113,7 @@ static void init_one_irq_desc(int irq, s
 /*
  * Protect the sparse_irqs:
  */
-static DEFINE_SPINLOCK(sparse_irq_lock);
+DEFINE_SPINLOCK(sparse_irq_lock);
 
 struct irq_desc *irq_desc_ptrs[NR_IRQS] __read_mostly;
 
@@ -337,8 +337,11 @@ unsigned int __do_IRQ(unsigned int irq)
 		/*
 		 * No locking required for CPU-local interrupts:
 		 */
-		if (desc->chip->ack)
+		if (desc->chip->ack) {
 			desc->chip->ack(irq);
+			/* get new one */
+			desc = irq_remap_to_desc(irq, desc);
+		}
 		if (likely(!(desc->status & IRQ_DISABLED))) {
 			action_ret = handle_IRQ_event(irq, desc->action);
 			if (!noirqdebug)
@@ -349,8 +352,10 @@ unsigned int __do_IRQ(unsigned int irq)
 	}
 
 	spin_lock(&desc->lock);
-	if (desc->chip->ack)
+	if (desc->chip->ack) {
 		desc->chip->ack(irq);
+		desc = irq_remap_to_desc(irq, desc);
+	}
 	/*
 	 * REPLAY is when Linux resends an IRQ that was dropped earlier
 	 * WAITING is used by probe to mark irqs that are being tested
Index: linux-2.6/kernel/irq/chip.c
===================================================================
--- linux-2.6.orig/kernel/irq/chip.c
+++ linux-2.6/kernel/irq/chip.c
@@ -354,6 +354,7 @@ handle_level_irq(unsigned int irq, struc
 
 	spin_lock(&desc->lock);
 	mask_ack_irq(desc, irq);
+	desc = irq_remap_to_desc(irq, desc);
 
 	if (unlikely(desc->status & IRQ_INPROGRESS))
 		goto out_unlock;
@@ -431,6 +432,7 @@ handle_fasteoi_irq(unsigned int irq, str
 	desc->status &= ~IRQ_INPROGRESS;
 out:
 	desc->chip->eoi(irq);
+	desc = irq_remap_to_desc(irq, desc);
 
 	spin_unlock(&desc->lock);
 }
@@ -467,12 +469,14 @@ handle_edge_irq(unsigned int irq, struct
 		    !desc->action)) {
 		desc->status |= (IRQ_PENDING | IRQ_MASKED);
 		mask_ack_irq(desc, irq);
+		desc = irq_remap_to_desc(irq, desc);
 		goto out_unlock;
 	}
 	kstat_incr_irqs_this_cpu(irq, desc);
 
 	/* Start handling the irq */
 	desc->chip->ack(irq);
+	desc = irq_remap_to_desc(irq, desc);
 
 	/* Mark the IRQ currently in progress.*/
 	desc->status |= IRQ_INPROGRESS;
@@ -533,8 +537,10 @@ handle_percpu_irq(unsigned int irq, stru
 	if (!noirqdebug)
 		note_interrupt(irq, desc, action_ret);
 
-	if (desc->chip->eoi)
+	if (desc->chip->eoi) {
 		desc->chip->eoi(irq);
+		desc = irq_remap_to_desc(irq, desc);
+	}
 }
 
 void
@@ -569,8 +575,10 @@ __set_irq_handler(unsigned int irq, irq_
 
 	/* Uninstall? */
 	if (handle == handle_bad_irq) {
-		if (desc->chip != &no_irq_chip)
+		if (desc->chip != &no_irq_chip) {
 			mask_ack_irq(desc, irq);
+			desc = irq_remap_to_desc(irq, desc);
+		}
 		desc->status |= IRQ_DISABLED;
 		desc->depth = 1;
 	}
Index: linux-2.6/include/linux/irq.h
===================================================================
--- linux-2.6.orig/include/linux/irq.h
+++ linux-2.6/include/linux/irq.h
@@ -227,6 +227,16 @@ extern struct irq_desc *move_irq_desc(st
 
 #endif
 
+static inline struct irq_desc *
+irq_remap_to_desc(unsigned int irq, struct irq_desc *desc)
+{
+#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
+	return irq_to_desc(irq);
+#else
+	return desc;
+#endif
+}
+
 /*
  * Migration helpers for obsolete names, they will go away:
  */
Index: linux-2.6/kernel/irq/numa_migrate.c
===================================================================
--- /dev/null
+++ linux-2.6/kernel/irq/numa_migrate.c
@@ -0,0 +1,127 @@
+/*
+ * linux/kernel/irq/numa_migrate.c
+ *
+ * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
+ * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
+ *
+ * This file contains the code that migrates irq_desc between NUMA nodes.
+ *
+ * Detailed information is available in Documentation/DocBook/genericirq
+ *
+ */
+
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+
+#include "internals.h"
+
+static void init_copy_kstat_irqs(struct irq_desc *old_desc,
+				 struct irq_desc *desc,
+				 int cpu, int nr)
+{
+	unsigned long bytes;
+
+	init_kstat_irqs(desc, cpu, nr);
+
+	if (desc->kstat_irqs != old_desc->kstat_irqs) {
+		/* Compute the size in bytes of the kstat_irqs array to copy */
+		bytes = nr * sizeof(unsigned int);
+
+		memcpy(desc->kstat_irqs, old_desc->kstat_irqs, bytes);
+	}
+}
+
+static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
+{
+	if (old_desc->kstat_irqs == desc->kstat_irqs)
+		return;
+
+	kfree(old_desc->kstat_irqs);
+	old_desc->kstat_irqs = NULL;
+}
+
+static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
+		 struct irq_desc *desc, int cpu)
+{
+	memcpy(desc, old_desc, sizeof(struct irq_desc));
+	desc->cpu = cpu;
+	lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+	init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids);
+	arch_init_copy_chip_data(old_desc, desc, cpu);
+}
+
+static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
+{
+	free_kstat_irqs(old_desc, desc);
+	arch_free_chip_data(old_desc, desc);
+}
+
+static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
+						int cpu)
+{
+	struct irq_desc *desc;
+	unsigned int irq;
+	unsigned long flags;
+	int node;
+
+	irq = old_desc->irq;
+
+	spin_lock_irqsave(&sparse_irq_lock, flags);
+
+	/* We have to check it to avoid races with another CPU */
+	desc = irq_desc_ptrs[irq];
+
+	if (desc && old_desc != desc)
+			goto out_unlock;
+
+	node = cpu_to_node(cpu);
+	desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
+	printk(KERN_DEBUG "  move irq_desc for %d to cpu %d node %d\n",
+		 irq, cpu, node);
+	if (!desc) {
+		printk(KERN_ERR "can not get new irq_desc for moving\n");
+		/* still use old one */
+		desc = old_desc;
+		goto out_unlock;
+	}
+	init_copy_one_irq_desc(irq, old_desc, desc, cpu);
+
+	irq_desc_ptrs[irq] = desc;
+
+	/* free the old one */
+	free_one_irq_desc(old_desc, desc);
+	kfree(old_desc);
+
+out_unlock:
+	spin_unlock_irqrestore(&sparse_irq_lock, flags);
+
+	return desc;
+}
+
+struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu)
+{
+	int old_cpu;
+	int node, old_node;
+
+	/* those are all static, do not move them */
+	if (desc->irq < NR_IRQS_LEGACY)
+		return desc;
+
+	old_cpu = desc->cpu;
+	printk(KERN_DEBUG
+		 "try to move irq_desc from cpu %d to %d\n", old_cpu, cpu);
+	if (old_cpu != cpu) {
+		node = cpu_to_node(cpu);
+		old_node = cpu_to_node(old_cpu);
+		if (old_node != node)
+			desc = __real_move_irq_desc(desc, cpu);
+		else
+			desc->cpu = cpu;
+	}
+
+	return desc;
+}
+
Index: linux-2.6/kernel/irq/Makefile
===================================================================
--- linux-2.6.orig/kernel/irq/Makefile
+++ linux-2.6/kernel/irq/Makefile
@@ -3,3 +3,4 @@ obj-y := handle.o manage.o spurious.o re
 obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
 obj-$(CONFIG_PROC_FS) += proc.o
 obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
+obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o
Index: linux-2.6/kernel/irq/internals.h
===================================================================
--- linux-2.6.orig/kernel/irq/internals.h
+++ linux-2.6/kernel/irq/internals.h
@@ -13,6 +13,11 @@ extern void compat_irq_chip_set_default_
 extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
 		unsigned long flags);
 
+extern struct lock_class_key irq_desc_lock_class;
+extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr);
+extern spinlock_t sparse_irq_lock;
+extern struct irq_desc *irq_desc_ptrs[NR_IRQS];
+
 #ifdef CONFIG_PROC_FS
 extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
 extern void register_handler_proc(unsigned int irq, struct irqaction *action);


^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2008-12-11  8:16 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-12-06  3:00 [PATCH 5/5] irq: move irq_desc according to smp_affinity v5 Yinghai Lu
2008-12-08 13:42 ` Ingo Molnar
2008-12-08 19:18   ` Yinghai Lu
2008-12-08 22:07   ` [PATCH] irq: move irq_desc according to smp_affinity v6 Yinghai Lu
2008-12-09  3:41     ` Ingo Molnar
2008-12-11  8:15       ` [PATCH] irq: move irq_desc according to smp_affinity v7 Yinghai Lu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).