linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH][RFC] make cpu_sibling_map a cpumask_t
@ 2003-12-08  4:25 Nick Piggin
  2003-12-08 15:59 ` Anton Blanchard
                   ` (4 more replies)
  0 siblings, 5 replies; 80+ messages in thread
From: Nick Piggin @ 2003-12-08  4:25 UTC (permalink / raw)
  To: Ingo Molnar, Anton Blanchard, Linus Torvalds, Andrew Morton,
	Rusty Russell, Zwane Mwaikambo
  Cc: linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1232 bytes --]

Hi guys,
This is rather a trivial change but is not for 2.6.0.
The patch is actually on top of some of my HT stuff so one part is
slightly different, but its purpose is to gather feedback.

It turns the cpu_sibling_map array from an array of cpus to an array
of cpumasks. This allows handling of more than 2 siblings, not that
I know of any immediate need.

I think it generalises cpu_sibling_map sufficiently that it can become
generic code. This would allow architecture specific code to build the
sibling map, and then Ingo's or my HT implementations to build their
scheduling descriptions in generic code.

I'm not aware of any reason why the kernel should not become generally
SMT aware. It is sufficiently different to SMP that it is worth
specialising it, although I am only aware of P4 and POWER5 implementations.

Best regards
Nick

P.S.
I have an alternative to Ingo's HT scheduler which basically does
the same thing. It is showing a 20% elapsed time improvement with a
make -j3 on a 2xP4 Xeon (4 logical CPUs).

Before Ingo's is merged, I would like to discuss the pros and cons of
both approaches with those interested. If Ingo's is accepted I should
still be able to port my other SMP/NUMA improvements on top of it.


[-- Attachment #2: sibling-map-to-cpumask.patch --]
[-- Type: text/plain, Size: 7229 bytes --]

 linux-2.6-npiggin/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c |    9 --
 linux-2.6-npiggin/arch/i386/kernel/io_apic.c                 |   19 ++--
 linux-2.6-npiggin/arch/i386/kernel/smpboot.c                 |   46 +++++------
 linux-2.6-npiggin/arch/i386/oprofile/op_model_p4.c           |    7 -
 linux-2.6-npiggin/include/asm-i386/smp.h                     |    2 
 5 files changed, 37 insertions(+), 46 deletions(-)

diff -puN include/asm-i386/smp.h~sibling-map-to-cpumask include/asm-i386/smp.h
--- linux-2.6/include/asm-i386/smp.h~sibling-map-to-cpumask	2003-12-08 14:42:28.000000000 +1100
+++ linux-2.6-npiggin/include/asm-i386/smp.h	2003-12-08 14:42:28.000000000 +1100
@@ -34,7 +34,7 @@
 extern void smp_alloc_memory(void);
 extern int pic_mode;
 extern int smp_num_siblings;
-extern int cpu_sibling_map[];
+extern cpumask_t cpu_sibling_map[];
 
 extern void smp_flush_tlb(void);
 extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs);
diff -puN arch/i386/kernel/smpboot.c~sibling-map-to-cpumask arch/i386/kernel/smpboot.c
--- linux-2.6/arch/i386/kernel/smpboot.c~sibling-map-to-cpumask	2003-12-08 14:42:28.000000000 +1100
+++ linux-2.6-npiggin/arch/i386/kernel/smpboot.c	2003-12-08 15:07:48.000000000 +1100
@@ -933,7 +933,7 @@ static int boot_cpu_logical_apicid;
 /* Where the IO area was mapped on multiquad, always 0 otherwise */
 void *xquad_portio;
 
-int cpu_sibling_map[NR_CPUS] __cacheline_aligned;
+cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned;
 
 static void __init smp_boot_cpus(unsigned int max_cpus)
 {
@@ -1079,32 +1079,28 @@ static void __init smp_boot_cpus(unsigne
 	Dprintk("Boot done.\n");
 
 	/*
-	 * If Hyper-Threading is avaialble, construct cpu_sibling_map[], so
-	 * that we can tell the sibling CPU efficiently.
+	 * construct cpu_sibling_map[], so that we can tell sibling CPUs
+	 * efficiently.
 	 */
-	if (cpu_has_ht && smp_num_siblings > 1) {
-		for (cpu = 0; cpu < NR_CPUS; cpu++)
-			cpu_sibling_map[cpu] = NO_PROC_ID;
-
-		for (cpu = 0; cpu < NR_CPUS; cpu++) {
-			int 	i;
-			if (!cpu_isset(cpu, cpu_callout_map))
-				continue;
+	for (cpu = 0; cpu < NR_CPUS; cpu++)
+		cpu_sibling_map[cpu] = CPU_MASK_NONE;
 
-			for (i = 0; i < NR_CPUS; i++) {
-				if (i == cpu || !cpu_isset(i, cpu_callout_map))
-					continue;
-				if (phys_proc_id[cpu] == phys_proc_id[i]) {
-					cpu_sibling_map[cpu] = i;
-					printk("cpu_sibling_map[%d] = %d\n", cpu, cpu_sibling_map[cpu]);
-					break;
-				}
-			}
-			if (cpu_sibling_map[cpu] == NO_PROC_ID) {
-				smp_num_siblings = 1;
-				printk(KERN_WARNING "WARNING: No sibling found for CPU %d.\n", cpu);
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		int siblings = 0;
+		int 	i;
+		if (!cpu_isset(cpu, cpu_callout_map))
+			continue;
+
+		for (i = 0; i < NR_CPUS; i++) {
+			if (!cpu_isset(i, cpu_callout_map))
+				continue;
+			if (phys_proc_id[cpu] == phys_proc_id[i]) {
+				siblings++;
+				cpu_set(i, cpu_sibling_map[cpu]);
 			}
 		}
+		if (siblings != smp_num_siblings)
+			printk(KERN_WARNING "WARNING: %d siblings found for CPU%d, should be %d\n", siblings, cpu, smp_num_siblings);
 	}
 
 	smpboot_setup_io_apic();
@@ -1143,8 +1139,8 @@ __init void arch_init_sched_domains(void
 
 		*cpu_domain = SD_CPU_INIT;
 		cpu_domain->span = CPU_MASK_NONE;
-		cpu_set(i, cpu_domain->span);
-		cpu_set(cpu_sibling_map[i], cpu_domain->span);
+		cpu_domain->span = cpu_sibling_map[i];
+		WARN_ON(!cpu_isset(i, cpu_sibling_map[i]));
 		cpu_domain->flags |= SD_FLAG_WAKE | SD_FLAG_IDLE;
 		cpu_domain->cache_hot_time = 0;
 		cpu_domain->cache_nice_tries = 0;
diff -puN arch/i386/kernel/io_apic.c~sibling-map-to-cpumask arch/i386/kernel/io_apic.c
--- linux-2.6/arch/i386/kernel/io_apic.c~sibling-map-to-cpumask	2003-12-08 14:42:28.000000000 +1100
+++ linux-2.6-npiggin/arch/i386/kernel/io_apic.c	2003-12-08 14:42:28.000000000 +1100
@@ -309,8 +309,7 @@ struct irq_cpu_info {
 
 #define IRQ_ALLOWED(cpu, allowed_mask)	cpu_isset(cpu, allowed_mask)
 
-#define CPU_TO_PACKAGEINDEX(i) \
-		((physical_balance && i > cpu_sibling_map[i]) ? cpu_sibling_map[i] : i)
+#define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i]))
 
 #define MAX_BALANCED_IRQ_INTERVAL	(5*HZ)
 #define MIN_BALANCED_IRQ_INTERVAL	(HZ/2)
@@ -393,6 +392,7 @@ static void do_irq_balance(void)
 	unsigned long max_cpu_irq = 0, min_cpu_irq = (~0);
 	unsigned long move_this_load = 0;
 	int max_loaded = 0, min_loaded = 0;
+	int load;
 	unsigned long useful_load_threshold = balanced_irq_interval + 10;
 	int selected_irq;
 	int tmp_loaded, first_attempt = 1;
@@ -444,7 +444,7 @@ static void do_irq_balance(void)
 	for (i = 0; i < NR_CPUS; i++) {
 		if (!cpu_online(i))
 			continue;
-		if (physical_balance && i > cpu_sibling_map[i])
+		if (i != CPU_TO_PACKAGEINDEX(i))
 			continue;
 		if (min_cpu_irq > CPU_IRQ(i)) {
 			min_cpu_irq = CPU_IRQ(i);
@@ -463,7 +463,7 @@ tryanothercpu:
 	for (i = 0; i < NR_CPUS; i++) {
 		if (!cpu_online(i))
 			continue;
-		if (physical_balance && i > cpu_sibling_map[i])
+		if (i != CPU_TO_PACKAGEINDEX(i))
 			continue;
 		if (max_cpu_irq <= CPU_IRQ(i)) 
 			continue;
@@ -543,9 +543,14 @@ tryanotherirq:
 	 * We seek the least loaded sibling by making the comparison
 	 * (A+B)/2 vs B
 	 */
-	if (physical_balance && (CPU_IRQ(min_loaded) >> 1) >
-					CPU_IRQ(cpu_sibling_map[min_loaded]))
-		min_loaded = cpu_sibling_map[min_loaded];
+	load = CPU_IRQ(min_loaded) >> 1;
+	for_each_cpu(j, cpu_sibling_map[min_loaded]) {
+		if (load > CPU_IRQ(j)) {
+			/* This won't change cpu_sibling_map[min_loaded] */
+			load = CPU_IRQ(j);
+			min_loaded = j;
+		}
+	}
 
 	cpus_and(allowed_mask, cpu_online_map, irq_affinity[selected_irq]);
 	target_cpu_mask = cpumask_of_cpu(min_loaded);
diff -puN arch/i386/kernel/cpu/cpufreq/p4-clockmod.c~sibling-map-to-cpumask arch/i386/kernel/cpu/cpufreq/p4-clockmod.c
--- linux-2.6/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c~sibling-map-to-cpumask	2003-12-08 14:42:28.000000000 +1100
+++ linux-2.6-npiggin/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c	2003-12-08 14:42:28.000000000 +1100
@@ -67,14 +67,7 @@ static int cpufreq_p4_setdc(unsigned int
 	cpus_allowed = current->cpus_allowed;
 
 	/* only run on CPU to be set, or on its sibling */
-       affected_cpu_map = cpumask_of_cpu(cpu);
-#ifdef CONFIG_X86_HT
-	hyperthreading = ((cpu_has_ht) && (smp_num_siblings == 2));
-	if (hyperthreading) {
-		sibling = cpu_sibling_map[cpu];
-                cpu_set(sibling, affected_cpu_map);
-	}
-#endif
+	affected_cpu_map = cpu_sibling_map[cpu];
 	set_cpus_allowed(current, affected_cpu_map);
         BUG_ON(!cpu_isset(smp_processor_id(), affected_cpu_map));
 
diff -puN arch/i386/oprofile/op_model_p4.c~sibling-map-to-cpumask arch/i386/oprofile/op_model_p4.c
--- linux-2.6/arch/i386/oprofile/op_model_p4.c~sibling-map-to-cpumask	2003-12-08 14:42:28.000000000 +1100
+++ linux-2.6-npiggin/arch/i386/oprofile/op_model_p4.c	2003-12-08 14:42:28.000000000 +1100
@@ -382,11 +382,8 @@ static struct p4_event_binding p4_events
 static unsigned int get_stagger(void)
 {
 #ifdef CONFIG_SMP
-	int cpu;
-	if (smp_num_siblings > 1) {
-		cpu = smp_processor_id();
-		return (cpu_sibling_map[cpu] > cpu) ? 0 : 1;
-	}
+	int cpu = smp_processor_id();
+	return (cpu != first_cpu(cpu_sibling_map[cpu]));
 #endif	
 	return 0;
 }

_

^ permalink raw reply	[flat|nested] 80+ messages in thread
[parent not found: <20031213022038.300B22C2C1@lists.samba.org.suse.lists.linux.kernel>]
[parent not found: <200312161127.13691.habanero@us.ibm.com>]
* RE: [CFT][RFC] HT scheduler
@ 2003-12-16 19:03 Nakajima, Jun
  0 siblings, 0 replies; 80+ messages in thread
From: Nakajima, Jun @ 2003-12-16 19:03 UTC (permalink / raw)
  To: Linus Torvalds, Jamie Lokier; +Cc: Nick Piggin, bill davidsen, linux-kernel


> > Regarding the overhead of the shared runqueue lock:
> >
> > Is the "lock" prefix actually required for locking between x86
> > siblings which share the same L1 cache?
> 
> I bet it is. In a big way.

Of course it is.

> 
> The lock does two independent things:
>  - it tells the core that it can't just crack up the load and store.
>  - it also tells other memory ops that they can't re-order around it.
> 
> Neither of these have anything to do with the L1 cache.
> 
> In short, I'd be very very surprised if you didn't need a "lock"
prefix
> even between hyperthreaded cores. It might be true in some specific
> implementation of HT, but quite frankly I'd doubt it, and I'd be
willing
> to guarantee that Intel would never make that architectural even if it
was
> true today (ie it would then break on future versions).

Correct. If such a code happens to be working today, it would be broken
anytime.


> 
> It should be easy enough to test in user space.
> 
> [ Time passes ]
> 
> Done. Check this program out with and without the "lock ;" prefix.
With
> the "lock" it will run forever on a HT CPU. Without the lock, it will
show
> errors pretty much immediately when the two threads start accessing
"nr"
> concurrently.
> 
> 		Linus
> 
> ----
> #include <pthread.h>
> #include <signal.h>
> #include <unistd.h>
> #include <stdio.h>
> 
> unsigned long nr;
> 
> #define LOCK "lock ;"
> 
> void * check_bit(int bit)
> {
> 	int set, reset;
> 	do {
> 		asm(LOCK "btsl %1,%2; sbbl %0,%0": "=r" (set): "r"
(bit), "m"
> (nr):"memory");
> 		asm(LOCK "btcl %1,%2; sbbl %0,%0": "=r" (reset): "r"
(bit),
> "m" (nr):"memory");
> 	} while (reset && !set);
> 	fprintf(stderr, "bit %d: %d %d (%08x)\n", bit, set, reset, nr);
> 	return NULL;
> }
> 
> static void * thread1(void* dummy)
> {
> 	return check_bit(0);
> }
> 
> static void * thread2(void *dummy)
> {
> 	return check_bit(1);
> }
> 
> int main(int argc, char ** argv)
> {
> 	pthread_t p;
> 
> 	pthread_create(&p, NULL, thread1, NULL);
> 	sleep(1);
> 	thread2(NULL);
> 	return 1;
> }
> -
> To unsubscribe from this list: send the line "unsubscribe
linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 80+ messages in thread
* RE: [CFT][RFC] HT scheduler
@ 2003-12-17  0:38 Nakajima, Jun
  0 siblings, 0 replies; 80+ messages in thread
From: Nakajima, Jun @ 2003-12-17  0:38 UTC (permalink / raw)
  To: Davide Libenzi, Linus Torvalds
  Cc: Jamie Lokier, Nick Piggin, bill davidsen, Linux Kernel Mailing List

> You forgot the one where no HT knowledge can be used for
optimizations.
> 
> - Asserting the CPU's #LOCK pin to take control of the system bus.
That in
>   MP system translate into the signaling CPU taking full control of
the
>   system bus until the pin is deasserted.
>

It is not the case with UP (single socket) using HT, where some
optimization for locks could be available. But you still need locks for
the logical processors.

	Jun

> -----Original Message-----
> From: linux-kernel-owner@vger.kernel.org [mailto:linux-kernel-
> owner@vger.kernel.org] On Behalf Of Davide Libenzi
> Sent: Tuesday, December 16, 2003 4:25 PM
> To: Linus Torvalds
> Cc: Jamie Lokier; Nick Piggin; bill davidsen; Linux Kernel Mailing
List
> Subject: Re: [CFT][RFC] HT scheduler
> 
> On Tue, 16 Dec 2003, Linus Torvalds wrote:
> 
> > I bet it is. In a big way.
> >
> > The lock does two independent things:
> >  - it tells the core that it can't just crack up the load and store.
> >  - it also tells other memory ops that they can't re-order around
it.
> 
> You forgot the one where no HT knowledge can be used for
optimizations.
> 
> - Asserting the CPU's #LOCK pin to take control of the system bus.
That in
>   MP system translate into the signaling CPU taking full control of
the
>   system bus until the pin is deasserted.
> 
> 
> 
> - Davide
> 
> 
> -
> To unsubscribe from this list: send the line "unsubscribe
linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 80+ messages in thread

end of thread, other threads:[~2004-01-03 18:58 UTC | newest]

Thread overview: 80+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2003-12-08  4:25 [PATCH][RFC] make cpu_sibling_map a cpumask_t Nick Piggin
2003-12-08 15:59 ` Anton Blanchard
2003-12-08 23:08   ` Nick Piggin
2003-12-09  0:14     ` Anton Blanchard
2003-12-11  4:25       ` [CFT][RFC] HT scheduler Nick Piggin
2003-12-11  7:24         ` Nick Piggin
2003-12-11  8:57           ` Nick Piggin
2003-12-11 11:52             ` William Lee Irwin III
2003-12-11 13:09               ` Nick Piggin
2003-12-11 13:23                 ` William Lee Irwin III
2003-12-11 13:30                   ` Nick Piggin
2003-12-11 13:32                     ` William Lee Irwin III
2003-12-11 15:30                       ` Nick Piggin
2003-12-11 15:38                         ` William Lee Irwin III
2003-12-11 15:51                           ` Nick Piggin
2003-12-11 15:56                             ` William Lee Irwin III
2003-12-11 16:37                               ` Nick Piggin
2003-12-11 16:40                                 ` William Lee Irwin III
2003-12-12  1:52                         ` [PATCH] improve rwsem scalability (was Re: [CFT][RFC] HT scheduler) Nick Piggin
2003-12-12  2:02                           ` Nick Piggin
2003-12-12  9:41                           ` Ingo Molnar
2003-12-13  0:07                             ` Nick Piggin
2003-12-14  0:44                               ` Nick Piggin
2003-12-17  5:27                                 ` Nick Piggin
2003-12-19 11:52                                   ` Nick Piggin
2003-12-19 15:06                                     ` Martin J. Bligh
2003-12-20  0:08                                       ` Nick Piggin
2003-12-12  0:58             ` [CFT][RFC] HT scheduler Rusty Russell
2003-12-11 10:01           ` Rhino
2003-12-11  8:14             ` Nick Piggin
2003-12-11 16:49               ` Rhino
2003-12-11 15:16                 ` Nick Piggin
2003-12-11 11:40             ` William Lee Irwin III
2003-12-11 17:05               ` Rhino
2003-12-11 15:17                 ` William Lee Irwin III
2003-12-11 16:28         ` Kevin P. Fleming
2003-12-11 16:41           ` Nick Piggin
2003-12-12  2:24         ` Rusty Russell
2003-12-12  7:00           ` Nick Piggin
2003-12-12  7:23             ` Rusty Russell
2003-12-13  6:43               ` Nick Piggin
2003-12-14  1:35                 ` bill davidsen
2003-12-14  2:18                   ` Nick Piggin
2003-12-14  4:32                     ` Jamie Lokier
2003-12-14  9:40                       ` Nick Piggin
2003-12-14 10:46                         ` Arjan van de Ven
2003-12-16 17:46                         ` Bill Davidsen
2003-12-16 18:22                       ` Linus Torvalds
2003-12-17  0:24                         ` Davide Libenzi
2003-12-17  0:41                           ` Linus Torvalds
2003-12-17  0:54                             ` Davide Libenzi
2003-12-16 17:34                     ` Bill Davidsen
2003-12-15  5:53                 ` Rusty Russell
2003-12-15 23:08                   ` Nick Piggin
2003-12-19  4:57                     ` Nick Piggin
2003-12-19  5:13                       ` Nick Piggin
2003-12-20  2:43                       ` Rusty Russell
2003-12-21  2:56                         ` Nick Piggin
2004-01-03 18:57                   ` Bill Davidsen
2003-12-15 20:21                 ` Zwane Mwaikambo
2003-12-15 23:20                   ` Nick Piggin
2003-12-16  0:11                     ` Zwane Mwaikambo
2003-12-12  8:59             ` Nick Piggin
2003-12-12 15:14               ` Martin J. Bligh
2003-12-08 19:44 ` [PATCH][RFC] make cpu_sibling_map a cpumask_t James Cleverdon
2003-12-08 20:38 ` Ingo Molnar
2003-12-08 20:51 ` Zwane Mwaikambo
2003-12-08 20:55   ` Ingo Molnar
2003-12-08 23:17     ` Nick Piggin
2003-12-08 23:36       ` Ingo Molnar
2003-12-08 23:58         ` Nick Piggin
2003-12-08 23:46 ` Rusty Russell
2003-12-09 13:36   ` Nick Piggin
2003-12-11 21:41     ` bill davidsen
     [not found] <20031213022038.300B22C2C1@lists.samba.org.suse.lists.linux.kernel>
     [not found] ` <3FDAB517.4000309@cyberone.com.au.suse.lists.linux.kernel>
     [not found]   ` <brgeo7$huv$1@gatekeeper.tmr.com.suse.lists.linux.kernel>
     [not found]     ` <3FDBC876.3020603@cyberone.com.au.suse.lists.linux.kernel>
     [not found]       ` <20031214043245.GC21241@mail.shareable.org.suse.lists.linux.kernel>
     [not found]         ` <3FDC3023.9030708@cyberone.com.au.suse.lists.linux.kernel>
     [not found]           ` <1071398761.5233.1.camel@laptop.fenrus.com.suse.lists.linux.kernel>
2003-12-14 16:26             ` [CFT][RFC] HT scheduler Andi Kleen
2003-12-14 16:54               ` Arjan van de Ven
     [not found] <200312161127.13691.habanero@us.ibm.com>
2003-12-16 17:37 ` Andrew Theurer
2003-12-17  2:41   ` Nick Piggin
2003-12-16 19:03 Nakajima, Jun
2003-12-17  0:38 Nakajima, Jun

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).