* [PATCH][RFC] make cpu_sibling_map a cpumask_t
@ 2003-12-08  4:25 Nick Piggin
  2003-12-08 15:59 ` Anton Blanchard
                   ` (4 more replies)
  0 siblings, 5 replies; 80+ messages in thread
From: Nick Piggin @ 2003-12-08  4:25 UTC (permalink / raw)
  To: Ingo Molnar, Anton Blanchard, Linus Torvalds, Andrew Morton,
	Rusty Russell, Zwane Mwaikambo
  Cc: linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1232 bytes --]

Hi guys,
This is rather a trivial change but is not for 2.6.0.
The patch is actually on top of some of my HT stuff so one part is
slightly different, but its purpose is to gather feedback.

It turns the cpu_sibling_map array from an array of cpus to an array
of cpumasks. This allows handling of more than 2 siblings, not that
I know of any immediate need.

I think it generalises cpu_sibling_map sufficiently that it can become
generic code. This would allow architecture specific code to build the
sibling map, and then Ingo's or my HT implementations to build their
scheduling descriptions in generic code.
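
For illustration only (this helper is not part of the patch), generic code
could then walk a package's siblings with the ordinary cpumask primitives
instead of special-casing a single sibling, along these lines:

/*
 * Hypothetical generic helper -- assumes the architecture exports
 * cpu_sibling_map[], with each mask including the cpu itself, as the
 * smpboot.c hunk below builds it.
 */
static int siblings_in_package(int cpu)
{
	int i, count = 0;

	for_each_cpu(i, cpu_sibling_map[cpu])
		count++;
	return count;
}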

I'm not aware of any reason why the kernel should not become generally
SMT aware. It is sufficiently different to SMP that it is worth
specialising it, although I am only aware of P4 and POWER5 implementations.

Best regards
Nick

P.S.
I have an alternative to Ingo's HT scheduler which basically does
the same thing. It is showing a 20% elapsed time improvement with a
make -j3 on a 2xP4 Xeon (4 logical CPUs).

Before Ingo's is merged, I would like to discuss the pros and cons of
both approaches with those interested. If Ingo's is accepted I should
still be able to port my other SMP/NUMA improvements on top of it.


[-- Attachment #2: sibling-map-to-cpumask.patch --]
[-- Type: text/plain, Size: 7229 bytes --]

 linux-2.6-npiggin/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c |    9 --
 linux-2.6-npiggin/arch/i386/kernel/io_apic.c                 |   19 ++--
 linux-2.6-npiggin/arch/i386/kernel/smpboot.c                 |   46 +++++------
 linux-2.6-npiggin/arch/i386/oprofile/op_model_p4.c           |    7 -
 linux-2.6-npiggin/include/asm-i386/smp.h                     |    2 
 5 files changed, 37 insertions(+), 46 deletions(-)

diff -puN include/asm-i386/smp.h~sibling-map-to-cpumask include/asm-i386/smp.h
--- linux-2.6/include/asm-i386/smp.h~sibling-map-to-cpumask	2003-12-08 14:42:28.000000000 +1100
+++ linux-2.6-npiggin/include/asm-i386/smp.h	2003-12-08 14:42:28.000000000 +1100
@@ -34,7 +34,7 @@
 extern void smp_alloc_memory(void);
 extern int pic_mode;
 extern int smp_num_siblings;
-extern int cpu_sibling_map[];
+extern cpumask_t cpu_sibling_map[];
 
 extern void smp_flush_tlb(void);
 extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs);
diff -puN arch/i386/kernel/smpboot.c~sibling-map-to-cpumask arch/i386/kernel/smpboot.c
--- linux-2.6/arch/i386/kernel/smpboot.c~sibling-map-to-cpumask	2003-12-08 14:42:28.000000000 +1100
+++ linux-2.6-npiggin/arch/i386/kernel/smpboot.c	2003-12-08 15:07:48.000000000 +1100
@@ -933,7 +933,7 @@ static int boot_cpu_logical_apicid;
 /* Where the IO area was mapped on multiquad, always 0 otherwise */
 void *xquad_portio;
 
-int cpu_sibling_map[NR_CPUS] __cacheline_aligned;
+cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned;
 
 static void __init smp_boot_cpus(unsigned int max_cpus)
 {
@@ -1079,32 +1079,28 @@ static void __init smp_boot_cpus(unsigne
 	Dprintk("Boot done.\n");
 
 	/*
-	 * If Hyper-Threading is avaialble, construct cpu_sibling_map[], so
-	 * that we can tell the sibling CPU efficiently.
+	 * construct cpu_sibling_map[], so that we can tell sibling CPUs
+	 * efficiently.
 	 */
-	if (cpu_has_ht && smp_num_siblings > 1) {
-		for (cpu = 0; cpu < NR_CPUS; cpu++)
-			cpu_sibling_map[cpu] = NO_PROC_ID;
-
-		for (cpu = 0; cpu < NR_CPUS; cpu++) {
-			int 	i;
-			if (!cpu_isset(cpu, cpu_callout_map))
-				continue;
+	for (cpu = 0; cpu < NR_CPUS; cpu++)
+		cpu_sibling_map[cpu] = CPU_MASK_NONE;
 
-			for (i = 0; i < NR_CPUS; i++) {
-				if (i == cpu || !cpu_isset(i, cpu_callout_map))
-					continue;
-				if (phys_proc_id[cpu] == phys_proc_id[i]) {
-					cpu_sibling_map[cpu] = i;
-					printk("cpu_sibling_map[%d] = %d\n", cpu, cpu_sibling_map[cpu]);
-					break;
-				}
-			}
-			if (cpu_sibling_map[cpu] == NO_PROC_ID) {
-				smp_num_siblings = 1;
-				printk(KERN_WARNING "WARNING: No sibling found for CPU %d.\n", cpu);
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		int siblings = 0;
+		int 	i;
+		if (!cpu_isset(cpu, cpu_callout_map))
+			continue;
+
+		for (i = 0; i < NR_CPUS; i++) {
+			if (!cpu_isset(i, cpu_callout_map))
+				continue;
+			if (phys_proc_id[cpu] == phys_proc_id[i]) {
+				siblings++;
+				cpu_set(i, cpu_sibling_map[cpu]);
 			}
 		}
+		if (siblings != smp_num_siblings)
+			printk(KERN_WARNING "WARNING: %d siblings found for CPU%d, should be %d\n", siblings, cpu, smp_num_siblings);
 	}
 
 	smpboot_setup_io_apic();
@@ -1143,8 +1139,8 @@ __init void arch_init_sched_domains(void
 
 		*cpu_domain = SD_CPU_INIT;
 		cpu_domain->span = CPU_MASK_NONE;
-		cpu_set(i, cpu_domain->span);
-		cpu_set(cpu_sibling_map[i], cpu_domain->span);
+		cpu_domain->span = cpu_sibling_map[i];
+		WARN_ON(!cpu_isset(i, cpu_sibling_map[i]));
 		cpu_domain->flags |= SD_FLAG_WAKE | SD_FLAG_IDLE;
 		cpu_domain->cache_hot_time = 0;
 		cpu_domain->cache_nice_tries = 0;
diff -puN arch/i386/kernel/io_apic.c~sibling-map-to-cpumask arch/i386/kernel/io_apic.c
--- linux-2.6/arch/i386/kernel/io_apic.c~sibling-map-to-cpumask	2003-12-08 14:42:28.000000000 +1100
+++ linux-2.6-npiggin/arch/i386/kernel/io_apic.c	2003-12-08 14:42:28.000000000 +1100
@@ -309,8 +309,7 @@ struct irq_cpu_info {
 
 #define IRQ_ALLOWED(cpu, allowed_mask)	cpu_isset(cpu, allowed_mask)
 
-#define CPU_TO_PACKAGEINDEX(i) \
-		((physical_balance && i > cpu_sibling_map[i]) ? cpu_sibling_map[i] : i)
+#define CPU_TO_PACKAGEINDEX(i) (first_cpu(cpu_sibling_map[i]))
 
 #define MAX_BALANCED_IRQ_INTERVAL	(5*HZ)
 #define MIN_BALANCED_IRQ_INTERVAL	(HZ/2)
@@ -393,6 +392,7 @@ static void do_irq_balance(void)
 	unsigned long max_cpu_irq = 0, min_cpu_irq = (~0);
 	unsigned long move_this_load = 0;
 	int max_loaded = 0, min_loaded = 0;
+	int load;
 	unsigned long useful_load_threshold = balanced_irq_interval + 10;
 	int selected_irq;
 	int tmp_loaded, first_attempt = 1;
@@ -444,7 +444,7 @@ static void do_irq_balance(void)
 	for (i = 0; i < NR_CPUS; i++) {
 		if (!cpu_online(i))
 			continue;
-		if (physical_balance && i > cpu_sibling_map[i])
+		if (i != CPU_TO_PACKAGEINDEX(i))
 			continue;
 		if (min_cpu_irq > CPU_IRQ(i)) {
 			min_cpu_irq = CPU_IRQ(i);
@@ -463,7 +463,7 @@ tryanothercpu:
 	for (i = 0; i < NR_CPUS; i++) {
 		if (!cpu_online(i))
 			continue;
-		if (physical_balance && i > cpu_sibling_map[i])
+		if (i != CPU_TO_PACKAGEINDEX(i))
 			continue;
 		if (max_cpu_irq <= CPU_IRQ(i)) 
 			continue;
@@ -543,9 +543,14 @@ tryanotherirq:
 	 * We seek the least loaded sibling by making the comparison
 	 * (A+B)/2 vs B
 	 */
-	if (physical_balance && (CPU_IRQ(min_loaded) >> 1) >
-					CPU_IRQ(cpu_sibling_map[min_loaded]))
-		min_loaded = cpu_sibling_map[min_loaded];
+	load = CPU_IRQ(min_loaded) >> 1;
+	for_each_cpu(j, cpu_sibling_map[min_loaded]) {
+		if (load > CPU_IRQ(j)) {
+			/* This won't change cpu_sibling_map[min_loaded] */
+			load = CPU_IRQ(j);
+			min_loaded = j;
+		}
+	}
 
 	cpus_and(allowed_mask, cpu_online_map, irq_affinity[selected_irq]);
 	target_cpu_mask = cpumask_of_cpu(min_loaded);
diff -puN arch/i386/kernel/cpu/cpufreq/p4-clockmod.c~sibling-map-to-cpumask arch/i386/kernel/cpu/cpufreq/p4-clockmod.c
--- linux-2.6/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c~sibling-map-to-cpumask	2003-12-08 14:42:28.000000000 +1100
+++ linux-2.6-npiggin/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c	2003-12-08 14:42:28.000000000 +1100
@@ -67,14 +67,7 @@ static int cpufreq_p4_setdc(unsigned int
 	cpus_allowed = current->cpus_allowed;
 
 	/* only run on CPU to be set, or on its sibling */
-       affected_cpu_map = cpumask_of_cpu(cpu);
-#ifdef CONFIG_X86_HT
-	hyperthreading = ((cpu_has_ht) && (smp_num_siblings == 2));
-	if (hyperthreading) {
-		sibling = cpu_sibling_map[cpu];
-                cpu_set(sibling, affected_cpu_map);
-	}
-#endif
+	affected_cpu_map = cpu_sibling_map[cpu];
 	set_cpus_allowed(current, affected_cpu_map);
         BUG_ON(!cpu_isset(smp_processor_id(), affected_cpu_map));
 
diff -puN arch/i386/oprofile/op_model_p4.c~sibling-map-to-cpumask arch/i386/oprofile/op_model_p4.c
--- linux-2.6/arch/i386/oprofile/op_model_p4.c~sibling-map-to-cpumask	2003-12-08 14:42:28.000000000 +1100
+++ linux-2.6-npiggin/arch/i386/oprofile/op_model_p4.c	2003-12-08 14:42:28.000000000 +1100
@@ -382,11 +382,8 @@ static struct p4_event_binding p4_events
 static unsigned int get_stagger(void)
 {
 #ifdef CONFIG_SMP
-	int cpu;
-	if (smp_num_siblings > 1) {
-		cpu = smp_processor_id();
-		return (cpu_sibling_map[cpu] > cpu) ? 0 : 1;
-	}
+	int cpu = smp_processor_id();
+	return (cpu != first_cpu(cpu_sibling_map[cpu]));
 #endif	
 	return 0;
 }

_

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH][RFC] make cpu_sibling_map a cpumask_t
  2003-12-08  4:25 [PATCH][RFC] make cpu_sibling_map a cpumask_t Nick Piggin
@ 2003-12-08 15:59 ` Anton Blanchard
  2003-12-08 23:08   ` Nick Piggin
  2003-12-08 19:44 ` [PATCH][RFC] make cpu_sibling_map a cpumask_t James Cleverdon
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 80+ messages in thread
From: Anton Blanchard @ 2003-12-08 15:59 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Ingo Molnar, Linus Torvalds, Andrew Morton, Rusty Russell,
	Zwane Mwaikambo, linux-kernel

 
> I'm not aware of any reason why the kernel should not become generally
> SMT aware. It is sufficiently different to SMP that it is worth
> specialising it, although I am only aware of P4 and POWER5 implementations.

I agree, SMT is likely to become more popular in the coming years.

> I have an alternative to Ingo's HT scheduler which basically does
> the same thing. It is showing a 20% elapsed time improvement with a
> make -j3 on a 2xP4 Xeon (4 logical CPUs).
> 
> Before Ingo's is merged, I would like to discuss the pros and cons of
> both approaches with those interested. If Ingo's is accepted I should
> still be able to port my other SMP/NUMA improvements on top of it.

Sounds good, have you got anything to test? I can throw it on a POWER5.

Anton

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH][RFC] make cpu_sibling_map a cpumask_t
  2003-12-08  4:25 [PATCH][RFC] make cpu_sibling_map a cpumask_t Nick Piggin
  2003-12-08 15:59 ` Anton Blanchard
@ 2003-12-08 19:44 ` James Cleverdon
  2003-12-08 20:38 ` Ingo Molnar
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 80+ messages in thread
From: James Cleverdon @ 2003-12-08 19:44 UTC (permalink / raw)
  To: Nick Piggin, Ingo Molnar, Anton Blanchard, Linus Torvalds,
	Andrew Morton, Rusty Russell, Zwane Mwaikambo
  Cc: linux-kernel

On Sunday 07 December 2003 8:25 pm, Nick Piggin wrote:
> Hi guys,
> This is rather a trivial change but is not for 2.6.0.
> The patch is actually on top of some of my HT stuff so one part is
> slightly different, but its purpose is to gather feedback.
>
> It turns the cpu_sibling_map array from an array of cpus to an array
> of cpumasks. This allows handling of more than 2 siblings, not that
> I know of any immediate need.

There will probably be a need, although I don't know how soon.  Some of the 
Intel folks have not been hinting that there will be more than two siblings 
in a future CPU.  No, they have not been hinting at all.   ;^)


> I think it generalises cpu_sibling_map sufficiently that it can become
> generic code. This would allow architecture specific code to build the
> sibling map, and then Ingo's or my HT implementations to build their
> scheduling descriptions in generic code.
>
> I'm not aware of any reason why the kernel should not become generally
> SMT aware. It is sufficiently different to SMP that it is worth
> specialising it, although I am only aware of P4 and POWER5 implementations.
>
> Best regards
> Nick
>
> P.S.
> I have an alternative to Ingo's HT scheduler which basically does
> the same thing. It is showing a 20% elapsed time improvement with a
> make -j3 on a 2xP4 Xeon (4 logical CPUs).
>
> Before Ingo's is merged, I would like to discuss the pros and cons of
> both approaches with those interested. If Ingo's is accepted I should
> still be able to port my other SMP/NUMA improvements on top of it.

-- 
James Cleverdon
IBM xSeries Linux Solutions
{jamesclv(Unix, preferred), cleverdj(Notes)} at us dot ibm dot comm

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH][RFC] make cpu_sibling_map a cpumask_t
  2003-12-08  4:25 [PATCH][RFC] make cpu_sibling_map a cpumask_t Nick Piggin
  2003-12-08 15:59 ` Anton Blanchard
  2003-12-08 19:44 ` [PATCH][RFC] make cpu_sibling_map a cpumask_t James Cleverdon
@ 2003-12-08 20:38 ` Ingo Molnar
  2003-12-08 20:51 ` Zwane Mwaikambo
  2003-12-08 23:46 ` Rusty Russell
  4 siblings, 0 replies; 80+ messages in thread
From: Ingo Molnar @ 2003-12-08 20:38 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Ingo Molnar, Anton Blanchard, Linus Torvalds, Andrew Morton,
	Rusty Russell, Zwane Mwaikambo, linux-kernel


On Mon, 8 Dec 2003, Nick Piggin wrote:

> P.S.
> I have an alternative to Ingo's HT scheduler which basically does the
> same thing. It is showing a 20% elapsed time improvement with a make -j3
> on a 2xP4 Xeon (4 logical CPUs).

if it gets close/equivalent (or better!) performance than the SMT-aware
scheduler i've posted then i'd prefer your patch because yours is
fundamentally simpler. (Btw., there exist a number of similar,
balancing-based HT patches - they are all quite simple.)

	Ingo

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH][RFC] make cpu_sibling_map a cpumask_t
  2003-12-08  4:25 [PATCH][RFC] make cpu_sibling_map a cpumask_t Nick Piggin
                   ` (2 preceding siblings ...)
  2003-12-08 20:38 ` Ingo Molnar
@ 2003-12-08 20:51 ` Zwane Mwaikambo
  2003-12-08 20:55   ` Ingo Molnar
  2003-12-08 23:46 ` Rusty Russell
  4 siblings, 1 reply; 80+ messages in thread
From: Zwane Mwaikambo @ 2003-12-08 20:51 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Ingo Molnar, Anton Blanchard, Linus Torvalds, Andrew Morton,
	Rusty Russell, linux-kernel

[-- Attachment #1: Type: TEXT/PLAIN, Size: 1078 bytes --]

On Mon, 8 Dec 2003, Nick Piggin wrote:

> It turns the cpu_sibling_map array from an array of cpus to an array
> of cpumasks. This allows handling of more than 2 siblings, not that
> I know of any immediate need.
>
> I think it generalises cpu_sibling_map sufficiently that it can become
> generic code. This would allow architecture specific code to build the
> sibling map, and then Ingo's or my HT implementations to build their
> scheduling descriptions in generic code.
>
> I'm not aware of any reason why the kernel should not become generally
> SMT aware. It is sufficiently different to SMP that it is worth
> specialising it, although I am only aware of P4 and POWER5 implementations.

Generally I agree; it's fairly unintrusive and appears to clean up some
things. Perhaps Andrew will take it a bit later, after the release.

> P.S.
> I have an alternative to Ingo's HT scheduler which basically does
> the same thing. It is showing a 20% elapsed time improvement with a
> make -j3 on a 2xP4 Xeon (4 logical CPUs).

-j3 is an odd number; what do -j4, -j8, -j16 look like?

[-- Attachment #2: Type: TEXT/PLAIN, Size: 7229 bytes --]


^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH][RFC] make cpu_sibling_map a cpumask_t
  2003-12-08 20:51 ` Zwane Mwaikambo
@ 2003-12-08 20:55   ` Ingo Molnar
  2003-12-08 23:17     ` Nick Piggin
  0 siblings, 1 reply; 80+ messages in thread
From: Ingo Molnar @ 2003-12-08 20:55 UTC (permalink / raw)
  To: Zwane Mwaikambo
  Cc: Nick Piggin, Anton Blanchard, Linus Torvalds, Andrew Morton,
	Rusty Russell, linux-kernel


On Mon, 8 Dec 2003, Zwane Mwaikambo wrote:

> > P.S.
> > I have an alternative to Ingo's HT scheduler which basically does
> > the same thing. It is showing a 20% elapsed time improvement with a
> > make -j3 on a 2xP4 Xeon (4 logical CPUs).
> 
> -j3 is an odd number, what does -j4, -j8, -j16 look like?

let me guess: -j3 is the one that gives the highest performance boost on a 
2-way/4-logical P4 box?

for the SMT/HT scheduler to give any benefits there has to be idle time -
so that SMT decisions actually make a difference.

the higher -j values are only useful to make sure that SMT scheduling does
not introduce regressions - performance should be the same as the pure
SMP/NUMA scheduler's performance.

	Ingo

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH][RFC] make cpu_sibling_map a cpumask_t
  2003-12-08 15:59 ` Anton Blanchard
@ 2003-12-08 23:08   ` Nick Piggin
  2003-12-09  0:14     ` Anton Blanchard
  0 siblings, 1 reply; 80+ messages in thread
From: Nick Piggin @ 2003-12-08 23:08 UTC (permalink / raw)
  To: Anton Blanchard
  Cc: Ingo Molnar, Linus Torvalds, Andrew Morton, Rusty Russell,
	Zwane Mwaikambo, linux-kernel



Anton Blanchard wrote:

> 
>
>>I'm not aware of any reason why the kernel should not become generally
>>SMT aware. It is sufficiently different to SMP that it is worth
>>specialising it, although I am only aware of P4 and POWER5 implementations.
>>
>
>I agree, SMT is likely to become more popular in the coming years.
>
>
>>I have an alternative to Ingo's HT scheduler which basically does
>>the same thing. It is showing a 20% elapsed time improvement with a
>>make -j3 on a 2xP4 Xeon (4 logical CPUs).
>>
>>Before Ingo's is merged, I would like to discuss the pros and cons of
>>both approaches with those interested. If Ingo's is accepted I should
>>still be able to port my other SMP/NUMA improvements on top of it.
>>
>
>Sounds good, have you got anything to test? I can throw it on a POWER5.
>

It would be great to get some testing on another architecture.

I don't have an architecture-independent way to build SMT scheduling
descriptions, although with the cpu_sibling_map change, you can copy
and paste the code for the P4 if you are able to build a cpu_sibling_map.

I'll just have to add some bits so SMT and NUMA work together, which
I will be unable to test. I'll get back to you with some code.



^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH][RFC] make cpu_sibling_map a cpumask_t
  2003-12-08 20:55   ` Ingo Molnar
@ 2003-12-08 23:17     ` Nick Piggin
  2003-12-08 23:36       ` Ingo Molnar
  0 siblings, 1 reply; 80+ messages in thread
From: Nick Piggin @ 2003-12-08 23:17 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Zwane Mwaikambo, Anton Blanchard, Linus Torvalds, Andrew Morton,
	Rusty Russell, linux-kernel



Ingo Molnar wrote:

>On Mon, 8 Dec 2003, Zwane Mwaikambo wrote:
>
>
>>>P.S.
>>>I have an alternative to Ingo's HT scheduler which basically does
>>>the same thing. It is showing a 20% elapsed time improvement with a
>>>make -j3 on a 2xP4 Xeon (4 logical CPUs).
>>>
>>-j3 is an odd number, what does -j4, -j8, -j16 look like?
>>
>
>let me guess: -j3 is the one that gives the highest performance boost on a 
>2-way/4-logical P4 box?
>
Yep! Note: -j3 now gives you 2 compute "threads" in the build. Basically
(as you know Ingo), you'll see the best improvement when there is the
most opportunity for one physical CPU to be saturated and the other idle.

[piggin@ragnarok piggin]$ grep user compile-test11
319.56user 30.04system 2:53.61elapsed 201%CPU
320.50user 30.33system 2:54.04elapsed 201%CPU
318.44user 30.94system 2:53.44elapsed 201%CPU
[piggin@ragnarok piggin]$ grep user compile-w26p20
268.04user 27.48system 2:26.94elapsed 201%CPU
267.36user 27.69system 2:26.10elapsed 201%CPU
266.66user 27.63system 2:25.68elapsed 202%CPU

Note here that both sets of results saturate 2 CPUs. My changes (and
Ingo's too, of course) try to ensure that those 2 CPUs are different
physical ones, which is why they are getting more work done.

>
>for the SMT/HT scheduler to give any benefits there has to be idle time -
>so that SMT decisions actually make a difference.
>
>the higher -j values are only useful to make sure that SMT scheduling does
>not introduce regressions - performance should be the same as the pure
>SMP/NUMA scheduler's performance.
>

I haven't run your HT patch Ingo, but of course I'll have to get more
comprehensive benchmarks before we go anywhere with this.



^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH][RFC] make cpu_sibling_map a cpumask_t
  2003-12-08 23:17     ` Nick Piggin
@ 2003-12-08 23:36       ` Ingo Molnar
  2003-12-08 23:58         ` Nick Piggin
  0 siblings, 1 reply; 80+ messages in thread
From: Ingo Molnar @ 2003-12-08 23:36 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Zwane Mwaikambo, Anton Blanchard, Linus Torvalds, Andrew Morton,
	Rusty Russell, linux-kernel


the thing that makes balancing-only driven SMT possible with current 2.6.0
is the overbalancing we do (to have cross-CPU fairness). Previous
iterations of the O(1) scheduler (all the 2.4 backports) didn't do this, so
all the add-on SMT schedulers tended to have a problem achieving good SMT
distribution. Now that we more aggressively balance, this isn't such a big
issue anymore.

so i tend to lean towards your SMT patch, it's much easier to maintain
than my runqueue-sharing approach. The performance is equivalent as far as
i can see (around 20%, and a stabilization of the distribution of
runtimes) - but please also boot my patch and repeat the exact same
measurements you did.

	Ingo

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH][RFC] make cpu_sibling_map a cpumask_t
  2003-12-08  4:25 [PATCH][RFC] make cpu_sibling_map a cpumask_t Nick Piggin
                   ` (3 preceding siblings ...)
  2003-12-08 20:51 ` Zwane Mwaikambo
@ 2003-12-08 23:46 ` Rusty Russell
  2003-12-09 13:36   ` Nick Piggin
  4 siblings, 1 reply; 80+ messages in thread
From: Rusty Russell @ 2003-12-08 23:46 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Ingo Molnar, Anton Blanchard, Linus Torvalds, Andrew Morton,
	Zwane Mwaikambo, linux-kernel

In message <3FD3FD52.7020001@cyberone.com.au> you write:
> I'm not aware of any reason why the kernel should not become generally
> SMT aware. It is sufficiently different to SMP that it is worth
> specialising it, although I am only aware of P4 and POWER5 implementations.

To do it properly, it should be done within the NUMA framework.  That
would allow generic slab cache optimizations, etc.  We'd really need a
multi-level NUMA framework for this though.

But patch looks fine.

> I have an alternative to Ingo's HT scheduler which basically does
> the same thing. It is showing a 20% elapsed time improvement with a
> make -j3 on a 2xP4 Xeon (4 logical CPUs).

Me too.

My main argument with Ingo's patch (last I looked) was technical: the
code becomes clearer if the structures are explicitly split into the
"per-runqueue stuff" and the "per-cpu stuff" (containing a my_runqueue
pointer).
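
Roughly this kind of split (a sketch with invented names, not code from
either patch):

/* per-runqueue stuff: possibly shared by HT siblings */
struct runqueue {
	spinlock_t lock;
	struct prio_array *active, *expired;
	/* ... */
};

/* per-cpu stuff: never shared */
struct cpu_sched {
	struct runqueue *my_runqueue;	/* may point at a shared runqueue */
	/* ... */
};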

I'd be very interested in your patch though, Nick.

Rusty.
--
  Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH][RFC] make cpu_sibling_map a cpumask_t
  2003-12-08 23:36       ` Ingo Molnar
@ 2003-12-08 23:58         ` Nick Piggin
  0 siblings, 0 replies; 80+ messages in thread
From: Nick Piggin @ 2003-12-08 23:58 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Zwane Mwaikambo, Anton Blanchard, Linus Torvalds, Andrew Morton,
	Rusty Russell, linux-kernel



Ingo Molnar wrote:

>the thing that makes balancing-only driven SMT possible with current 2.6.0
>is the overbalancing we do (to have cross-CPU fairness). Previous
>iterations of the O(1) scheduler (all the 2.4 backports) didnt do this so
>all the add-on SMT schedulers tended to have a problem achieving good SMT
>distribution. Now that we more agressively balance, this isnt such a big
>issue anymore.
>

I'm glad you like my idea. I do like your shared runqueue approach;
it's conceptually very elegant IMO, but the implementation looks difficult.

You'll have to have a look at my patch, it is the "scheduling domains"
thing. Basically there are no ifdefs for NUMA or SMT in the scheduler,
the balancing depends on SMP and behaves according to a structure
describing desired properties.

I have to document it a little more though.
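
Roughly, each CPU gets a description of what to balance across at each
level; the shape is something like this (field names taken from the
arch_init_sched_domains() hunk in the sibling-map patch, the parent link
is my own shorthand for the level above, and the real structure has more
in it):

struct sched_domain {
	cpumask_t span;			/* CPUs balanced over at this level */
	struct sched_domain *parent;	/* next level: physical package, node */
	unsigned int flags;		/* e.g. SD_FLAG_WAKE | SD_FLAG_IDLE */
	unsigned long cache_hot_time;	/* task considered cache hot below this */
	int cache_nice_tries;
};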

>
>so i tend to lean towards your SMT patch, it's much easier to maintain
>than my runqueue-sharing approach. The performance is equivalent as far as
>i can see (around 20%, and a stabilization of the distribution of
>runtimes) - but please also boot my patch and repeat the exact same
>measurements you did.
>

I will. I might not get time today, but I'll test various old
"favourites" like kernbench, hackbench, [dt]bench with both versions.



^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH][RFC] make cpu_sibling_map a cpumask_t
  2003-12-08 23:08   ` Nick Piggin
@ 2003-12-09  0:14     ` Anton Blanchard
  2003-12-11  4:25       ` [CFT][RFC] HT scheduler Nick Piggin
  0 siblings, 1 reply; 80+ messages in thread
From: Anton Blanchard @ 2003-12-09  0:14 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Ingo Molnar, Linus Torvalds, Andrew Morton, Rusty Russell,
	Zwane Mwaikambo, linux-kernel


> I don't have an architecture independant way to build SMT scheduling
> descriptions, although with the cpu_sibling_map change, you can copy
> and paste the code for the P4 if you are able to build a cpu_sibling_map.

I can build a suitable table, I did that for Ingos HT scheduler.

> I'll just have to add a some bits so SMT and NUMA work together which
> I will be unable to test. I'll get back to you with some code.

Thanks.

Anton

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH][RFC] make cpu_sibling_map a cpumask_t
  2003-12-08 23:46 ` Rusty Russell
@ 2003-12-09 13:36   ` Nick Piggin
  2003-12-11 21:41     ` bill davidsen
  0 siblings, 1 reply; 80+ messages in thread
From: Nick Piggin @ 2003-12-09 13:36 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Ingo Molnar, Anton Blanchard, Linus Torvalds, Andrew Morton,
	Zwane Mwaikambo, linux-kernel



Rusty Russell wrote:

>In message <3FD3FD52.7020001@cyberone.com.au> you write:
>
>>I'm not aware of any reason why the kernel should not become generally
>>SMT aware. It is sufficiently different to SMP that it is worth
>>specialising it, although I am only aware of P4 and POWER5 implementations.
>>
>
>To do it properly, it should be done within the NUMA framework.  That
>would allow generic slab cache optimizations, etc.  We'd really need a
>multi-level NUMA framework for this though.
>
>But patch looks fine.
>

Well, if (something like) cpu_sibling_map is to become
architecture-independent code, it should probably get cpu_to_package and
package_to_cpumask helpers, much like the ones NUMA has.
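
For instance (illustrative only, following the CPU_TO_PACKAGEINDEX
convention in the io_apic.c hunk, where a package is named by its first
CPU):

/* hypothetical helpers, not in the patch */
#define cpu_to_package(cpu)	first_cpu(cpu_sibling_map[cpu])
#define package_to_cpumask(pkg)	(cpu_sibling_map[pkg])	/* pkg == its first cpu */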

I don't know if SMT should become just another level of NUMA because
it's not NUMA, of course. For the scheduler it seems to make sense because
SMT/SMP/NUMA basically want slight variations of the same thing.

Possibly, as you say, the slab cache could be SMTed, but I'm not convinced
that SMT and NUMA should become inseparable. Anyway, I doubt 2.6 will
see a general multi-level NUMA framework.

>
>>I have an alternative to Ingo's HT scheduler which basically does
>>the same thing. It is showing a 20% elapsed time improvement with a
>>make -j3 on a 2xP4 Xeon (4 logical CPUs).
>>
>
>Me too.
>
>My main argument with Ingo's patch (last I looked) was technical: the
>code becomes clearer if the structures are explicitly split into the
>"per-runqueue stuff" and the "per-cpu stuff" (containing a my_runqueue
>pointer).
>

Ahh so yours isn't an earlier/later version of the same implementation..

>
>I'd be very interested in your patch though, Nick.
>

I've just had a hard-to-trace bug report with it, but there is some
interest in it now, so I'll work on sorting it out and getting
something out for testing and review. Sorry for the delay.



^ permalink raw reply	[flat|nested] 80+ messages in thread

* [CFT][RFC] HT scheduler
  2003-12-09  0:14     ` Anton Blanchard
@ 2003-12-11  4:25       ` Nick Piggin
  2003-12-11  7:24         ` Nick Piggin
                           ` (2 more replies)
  0 siblings, 3 replies; 80+ messages in thread
From: Nick Piggin @ 2003-12-11  4:25 UTC (permalink / raw)
  To: linux-kernel
  Cc: Anton Blanchard, Ingo Molnar, Rusty Russell, Martin J. Bligh,
	Nakajima, Jun, Mark Wong

http://www.kerneltrap.org/~npiggin/w26/
Against 2.6.0-test11

This includes the SMT description for P4. Initial results shows comparable
performance to Ingo's shared runqueue's patch on a dual P4 Xeon.

It also includes the cpu_sibling_map to cpumask patch, which should apply
by itself. There should be very little if any left to make it work with > 2
siblings per package.

A bug causing rare crashes and hangs has been fixed. It's pretty stable on
the NUMAQ and the dual Xeon. Anyone testing should be using this version,
please.




^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-11  4:25       ` [CFT][RFC] HT scheduler Nick Piggin
@ 2003-12-11  7:24         ` Nick Piggin
  2003-12-11  8:57           ` Nick Piggin
  2003-12-11 10:01           ` Rhino
  2003-12-11 16:28         ` Kevin P. Fleming
  2003-12-12  2:24         ` Rusty Russell
  2 siblings, 2 replies; 80+ messages in thread
From: Nick Piggin @ 2003-12-11  7:24 UTC (permalink / raw)
  To: linux-kernel
  Cc: Anton Blanchard, Ingo Molnar, Rusty Russell, Martin J. Bligh,
	Nakajima, Jun, Mark Wong



Nick Piggin wrote:

> http://www.kerneltrap.org/~npiggin/w26/
> Against 2.6.0-test11


Oh, this patchset also (mostly) cures my pet hate for the
last few months: VolanoMark on the NUMA.

http://www.kerneltrap.org/~npiggin/w26/vmark.html

The "average" plot for w26 I think is a little misleading because
it got an unlucky result on the second last point making it look
like its has a downward curve. It is usually more linear with a
sharp downward spike at 150 rooms like the "maximum" plot.

Don't ask me why it runs out of steam at 150 rooms. hackbench does
something similar. I think it might be due to some resource running
short, or a scalability problem somewhere else.


^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-11 10:01           ` Rhino
@ 2003-12-11  8:14             ` Nick Piggin
  2003-12-11 16:49               ` Rhino
  2003-12-11 11:40             ` William Lee Irwin III
  1 sibling, 1 reply; 80+ messages in thread
From: Nick Piggin @ 2003-12-11  8:14 UTC (permalink / raw)
  To: Rhino
  Cc: linux-kernel, Anton Blanchard, Ingo Molnar, Rusty Russell,
	Martin J. Bligh, Nakajima, Jun, Mark Wong, wli



Rhino wrote:

>On Thu, 11 Dec 2003 18:24:20 +1100
>Nick Piggin <piggin@cyberone.com.au> wrote:
>
>
>>
>>Nick Piggin wrote:
>>
>>
>>>http://www.kerneltrap.org/~npiggin/w26/
>>>Against 2.6.0-test11
>>>
>>
>>Oh, this patchset also (mostly) cures my pet hate for the
>>last few months: VolanoMark on the NUMA.
>>
>>http://www.kerneltrap.org/~npiggin/w26/vmark.html
>>
>>The "average" plot for w26 I think is a little misleading because
>>it got an unlucky result on the second last point making it look
>>like its has a downward curve. It is usually more linear with a
>>sharp downward spike at 150 rooms like the "maximum" plot.
>>
>>Don't ask me why it runs out of steam at 150 rooms. hackbench does
>>something similar. I think it might be due to some resource running
>>short, or a scalability problem somewhere else.
>>
>
>i didn't had the time to apply the patches (w26 and C1 from ingo ) 
>on a vanilla t11, but i merged them with the wli-2,btw this one has really put 
>my box on steroids ;) .
>

You won't be able to merge mine with Ingo's. Which one put the box on
steroids? :)

>
>none of them finished a hackbench 320 run, the OOM killed all of my
>agetty's logging me out. the box is a 1way p4(HT) 1gb of ram 
>and no swap heh.
>
>
>	hackbench:
>
>	sched-rollup-w26		sched-SMT-2.6.0-test11-C1
>		
>	 50	 4.839			 50	5.200
>	100	 9.415			100	10.090
>	150	14.469			150	14.764
>
>	
>
>	time tar -xvjpf linux-2.6.0-test11.tar.bz2:
>
>	sched-rollup-w26		sched-SMT-2.6.0-test11-C1
>	
>	real		43.396		real		23.136
>
                           ^^^^^^

>
>	user		27.608		user		20.700
>	sys		 4.039		sys		 4.344
>

Wonder what's going on here? Is this my patch vs Ingo's with nothing else
applied?
How does plain 2.6.0-test11 go?

Thanks for testing.



^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-11  7:24         ` Nick Piggin
@ 2003-12-11  8:57           ` Nick Piggin
  2003-12-11 11:52             ` William Lee Irwin III
  2003-12-12  0:58             ` [CFT][RFC] HT scheduler Rusty Russell
  2003-12-11 10:01           ` Rhino
  1 sibling, 2 replies; 80+ messages in thread
From: Nick Piggin @ 2003-12-11  8:57 UTC (permalink / raw)
  To: linux-kernel, Ingo Molnar, Rusty Russell
  Cc: Anton Blanchard, Martin J. Bligh, Nakajima, Jun, Mark Wong



Nick Piggin wrote:

>
> Don't ask me why it runs out of steam at 150 rooms. hackbench does
> something similar. I think it might be due to some resource running
> short, or a scalability problem somewhere else.


OK, it is spinning on .text.lock.futex. The following results are
top 10 profiles from a 120 rooms run and a 150 rooms run. The 150
room run managed only 24.8% the throughput of the 120 room run.

Might this be a JVM problem?
I'm using Sun Java HotSpot(TM) Server VM (build 1.4.2_01-b06, mixed mode)

            ROOMS          120             150
PROFILES
total                   100.0%          100.0%
default_idle             81.0%           66.8%
.text.lock.rwsem          4.6%            1.3%
schedule                  1.9%            1.4%
.text.lock.futex          1.5%           19.1%
__wake_up                 1.1%            1.3%
futex_wait                0.7%            2.8%
futex_wake                0.7%            0.5%
.text.lock.dev            0.6%            0.2%
rwsem_down_read_failed    0.5%
unqueue_me                                3.2%



^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-11  7:24         ` Nick Piggin
  2003-12-11  8:57           ` Nick Piggin
@ 2003-12-11 10:01           ` Rhino
  2003-12-11  8:14             ` Nick Piggin
  2003-12-11 11:40             ` William Lee Irwin III
  1 sibling, 2 replies; 80+ messages in thread
From: Rhino @ 2003-12-11 10:01 UTC (permalink / raw)
  To: Nick Piggin
  Cc: linux-kernel, Anton Blanchard, Ingo Molnar, Rusty Russell,
	Martin J. Bligh, Nakajima, Jun, Mark Wong, wli

[-- Attachment #1: Type: text/plain, Size: 1461 bytes --]

On Thu, 11 Dec 2003 18:24:20 +1100
Nick Piggin <piggin@cyberone.com.au> wrote:

> 
> 
> Nick Piggin wrote:
> 
> > http://www.kerneltrap.org/~npiggin/w26/
> > Against 2.6.0-test11
> 
> 
> Oh, this patchset also (mostly) cures my pet hate for the
> last few months: VolanoMark on the NUMA.
> 
> http://www.kerneltrap.org/~npiggin/w26/vmark.html
> 
> The "average" plot for w26 I think is a little misleading because
> it got an unlucky result on the second last point making it look
> like its has a downward curve. It is usually more linear with a
> sharp downward spike at 150 rooms like the "maximum" plot.
> 
> Don't ask me why it runs out of steam at 150 rooms. hackbench does
> something similar. I think it might be due to some resource running
> short, or a scalability problem somewhere else.

I didn't have the time to apply the patches (w26 and C1 from Ingo)
on a vanilla test11, but I merged them with wli-2; btw, this one has really put
my box on steroids ;).

None of them finished a hackbench 320 run; the OOM killer killed all of my
agetty's, logging me out. The box is a 1-way P4 (HT) with 1 GB of RAM
and no swap, heh.


	hackbench:

	sched-rollup-w26					sched-SMT-2.6.0-test11-C1
		
	 50	 4.839						 50	5.200
	100	 9.415						100	10.090
	150	14.469						150	14.764

	

	time tar -xvjpf linux-2.6.0-test11.tar.bz2:

	sched-rollup-w26					sched-SMT-2.6.0-test11-C1
	
	real		43.396					real		23.136
	user		27.608					user		20.700
	sys		 4.039					sys		 4.344

[-- Attachment #2: Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-11 10:01           ` Rhino
  2003-12-11  8:14             ` Nick Piggin
@ 2003-12-11 11:40             ` William Lee Irwin III
  2003-12-11 17:05               ` Rhino
  1 sibling, 1 reply; 80+ messages in thread
From: William Lee Irwin III @ 2003-12-11 11:40 UTC (permalink / raw)
  To: Rhino
  Cc: Nick Piggin, linux-kernel, Anton Blanchard, Ingo Molnar,
	Rusty Russell, Martin J. Bligh, Nakajima, Jun, Mark Wong

On Thu, 11 Dec 2003 18:24:20 +1100 Nick Piggin wrote:
>> The "average" plot for w26 I think is a little misleading because
>> it got an unlucky result on the second last point making it look
>> like its has a downward curve. It is usually more linear with a
>> sharp downward spike at 150 rooms like the "maximum" plot.
>> Don't ask me why it runs out of steam at 150 rooms. hackbench does
>> something similar. I think it might be due to some resource running
>> short, or a scalability problem somewhere else.

On Thu, Dec 11, 2003 at 06:01:20AM -0400, Rhino wrote:
> i didn't had the time to apply the patches (w26 and C1 from ingo ) 
> on a vanilla t11, but i merged them with the wli-2,btw this one has
> really put my box on steroids ;) .
> none of them finished a hackbench 320 run, the OOM killed all of my
> agetty's logging me out. the box is a 1way p4(HT) 1gb of ram 
> and no swap heh.

It might help to check how many processes and/or threads are involved.
I've got process scalability stuff in there (I'm not sure how to read
your comments though they seem encouraging).


-- wli

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-11  8:57           ` Nick Piggin
@ 2003-12-11 11:52             ` William Lee Irwin III
  2003-12-11 13:09               ` Nick Piggin
  2003-12-12  0:58             ` [CFT][RFC] HT scheduler Rusty Russell
  1 sibling, 1 reply; 80+ messages in thread
From: William Lee Irwin III @ 2003-12-11 11:52 UTC (permalink / raw)
  To: Nick Piggin
  Cc: linux-kernel, Ingo Molnar, Rusty Russell, Anton Blanchard,
	Martin J. Bligh, Nakajima, Jun, Mark Wong

On Thu, Dec 11, 2003 at 07:57:31PM +1100, Nick Piggin wrote:
> OK, it is spinning on .text.lock.futex. The following results are
> top 10 profiles from a 120 rooms run and a 150 rooms run. The 150
> room run managed only 24.8% the throughput of the 120 room run.
> Might this be a JVM problem?
> I'm using Sun Java HotSpot(TM) Server VM (build 1.4.2_01-b06, mixed mode)
>            ROOMS          120             150
> PROFILES
> total                   100.0%          100.0%
> default_idle             81.0%           66.8%
> .text.lock.rwsem          4.6%            1.3%
> schedule                  1.9%            1.4%
> .text.lock.futex          1.5%           19.1%
> __wake_up                 1.1%            1.3%
> futex_wait                0.7%            2.8%
> futex_wake                0.7%            0.5%
> .text.lock.dev            0.6%            0.2%
> rwsem_down_read_failed    0.5%
> unqueue_me                                3.2%

If this thing is heavily threaded, it could be mm->page_table_lock.

-- wli

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-11 11:52             ` William Lee Irwin III
@ 2003-12-11 13:09               ` Nick Piggin
  2003-12-11 13:23                 ` William Lee Irwin III
  0 siblings, 1 reply; 80+ messages in thread
From: Nick Piggin @ 2003-12-11 13:09 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: linux-kernel, Ingo Molnar, Rusty Russell, Anton Blanchard,
	Martin J. Bligh, Nakajima, Jun, Mark Wong



William Lee Irwin III wrote:

>On Thu, Dec 11, 2003 at 07:57:31PM +1100, Nick Piggin wrote:
>
>>OK, it is spinning on .text.lock.futex. The following results are
>>top 10 profiles from a 120 rooms run and a 150 rooms run. The 150
>>room run managed only 24.8% the throughput of the 120 room run.
>>Might this be a JVM problem?
>>I'm using Sun Java HotSpot(TM) Server VM (build 1.4.2_01-b06, mixed mode)
>>           ROOMS          120             150
>>PROFILES
>>total                   100.0%          100.0%
>>default_idle             81.0%           66.8%
>>.text.lock.rwsem          4.6%            1.3%
>>schedule                  1.9%            1.4%
>>.text.lock.futex          1.5%           19.1%
>>__wake_up                 1.1%            1.3%
>>futex_wait                0.7%            2.8%
>>futex_wake                0.7%            0.5%
>>.text.lock.dev            0.6%            0.2%
>>rwsem_down_read_failed    0.5%
>>unqueue_me                                3.2%
>>
>
>If this thing is heavily threaded, it could be mm->page_table_lock.
>

I'm not sure how threaded it is, probably very. Would inline spinlocks
help show up mm->page_table_lock?

It really looks like .text.lock.futex though, doesn't it? Would that be
the hashed futex locks? I wonder why it suddenly goes downhill past about
140 rooms though.




^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-11 13:09               ` Nick Piggin
@ 2003-12-11 13:23                 ` William Lee Irwin III
  2003-12-11 13:30                   ` Nick Piggin
  0 siblings, 1 reply; 80+ messages in thread
From: William Lee Irwin III @ 2003-12-11 13:23 UTC (permalink / raw)
  To: Nick Piggin
  Cc: linux-kernel, Ingo Molnar, Rusty Russell, Anton Blanchard,
	Martin J. Bligh, Nakajima, Jun, Mark Wong

William Lee Irwin III wrote:
>> If this thing is heavily threaded, it could be mm->page_table_lock.

On Fri, Dec 12, 2003 at 12:09:04AM +1100, Nick Piggin wrote:
> I'm not sure how threaded it is, probably very. Would inline spinlocks
> help show up mm->page_table_lock?
> It really looks like .text.lock.futex though, doesn't it? Would that be
> the hashed futex locks? I wonder why it suddenly goes downhill past about
> 140 rooms though.

It will get contention anyway if they're all pounding on the same futex.
OTOH, if they're all threads in the same process, they can hit other
problems. I'll try to find out more about hackbench.


-- wli

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-11 13:23                 ` William Lee Irwin III
@ 2003-12-11 13:30                   ` Nick Piggin
  2003-12-11 13:32                     ` William Lee Irwin III
  0 siblings, 1 reply; 80+ messages in thread
From: Nick Piggin @ 2003-12-11 13:30 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: linux-kernel, Ingo Molnar, Rusty Russell, Anton Blanchard,
	Martin J. Bligh, Nakajima, Jun, Mark Wong



William Lee Irwin III wrote:

>William Lee Irwin III wrote:
>
>>>If this thing is heavily threaded, it could be mm->page_table_lock.
>>>
>
>On Fri, Dec 12, 2003 at 12:09:04AM +1100, Nick Piggin wrote:
>
>>I'm not sure how threaded it is, probably very. Would inline spinlocks
>>help show up mm->page_table_lock?
>>It really looks like .text.lock.futex though, doesn't it? Would that be
>>the hashed futex locks? I wonder why it suddenly goes downhill past about
>>140 rooms though.
>>
>
>It will get contention anyway if they're all pounding on the same futex.
>OTOH, if they're all threads in the same process, they can hit other
>problems. I'll try to find out more about hackbench.
>


Oh, sorry I was talking about volanomark. hackbench AFAIK doesn't use
futexes at all, just pipes, and is not threaded at all, so it looks like
a different problem to the volanomark one.

hackbench runs into trouble at large numbers of tasks too though.
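
(Roughly, hackbench is groups of sender/receiver processes blasting small
messages at each other over pipes; the sketch below is a simplification of
that pattern, not the actual benchmark source.)

/* one grossly simplified hackbench-style pair: a forked sender writes
 * small messages down a pipe and the parent reads them back */
#include <unistd.h>
#include <stdlib.h>

static void pipe_pair(int loops)
{
	int fds[2], i;
	char buf[100] = "";

	if (pipe(fds) < 0)
		exit(1);
	if (fork() == 0) {
		for (i = 0; i < loops; i++)
			write(fds[1], buf, sizeof(buf));
		_exit(0);
	}
	for (i = 0; i < loops; i++)
		read(fds[0], buf, sizeof(buf));
}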



^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-11 13:30                   ` Nick Piggin
@ 2003-12-11 13:32                     ` William Lee Irwin III
  2003-12-11 15:30                       ` Nick Piggin
  0 siblings, 1 reply; 80+ messages in thread
From: William Lee Irwin III @ 2003-12-11 13:32 UTC (permalink / raw)
  To: Nick Piggin
  Cc: linux-kernel, Ingo Molnar, Rusty Russell, Anton Blanchard,
	Martin J. Bligh, Nakajima, Jun, Mark Wong

William Lee Irwin III wrote:
>> It will get contention anyway if they're all pounding on the same futex.
>> OTOH, if they're all threads in the same process, they can hit other
>> problems. I'll try to find out more about hackbench.


On Fri, Dec 12, 2003 at 12:30:07AM +1100, Nick Piggin wrote:
> Oh, sorry I was talking about volanomark. hackbench AFAIK doesn't use
> futexes at all, just pipes, and is not threaded at all, so it looks like
> a different problem to the volanomark one.
> hackbench runs into trouble at large numbers of tasks too though.

Volano is all one process address space so it could be ->page_table_lock;
any chance you could find which spin_lock() call the pounded chunk of the
lock section jumps back to?


-- wli

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-11 16:49               ` Rhino
@ 2003-12-11 15:16                 ` Nick Piggin
  0 siblings, 0 replies; 80+ messages in thread
From: Nick Piggin @ 2003-12-11 15:16 UTC (permalink / raw)
  To: Rhino
  Cc: linux-kernel, Anton Blanchard, Ingo Molnar, Rusty Russell,
	Martin J. Bligh, Nakajima, Jun, Mark Wong, wli



Rhino wrote:

>On Thu, 11 Dec 2003 19:14:45 +1100
>Nick Piggin <piggin@cyberone.com.au> wrote:
>
>
>>You won't be able to merge mine with Ingo's. Which one put the box on
>>steroids? :)
>>
>
>sorry if I wasn't clear enough, but I merged each one of them separately on top of wli-2.
>that comment was for the wli changes. :)
>
>
>>Wonder whats going on here? Is this my patch vs Ingo's with nothing else 
>>applied?
>>How does plain 2.6.0-test11 go?
>>
>>Thanks for testing.
>>
>
>
>ok, this time on a plain test11.
>

I mean, what results does plain test11 get when running the benchmarks?

>
>hackbench:
>
>		w26						sched-SMT-C1
>
>	 50	 5.026				 50	 5.182
>	100	10.062				100	10.602
>	150	15.906				150	16.214
>
>
>time tar -xvjpf linux-2.6.0-test11.tar.bz2:
>
>		w26						sched-SMT-C1
>		
>	real	0m21.835s			real	0m21.827s
>	user	0m20.274s			user	0m20.412s
>	sys	0m4.178s				sys	0m4.260s
>
                  ^^^^^^^^
OK I guess the real 43s in your last set of results was a typo.



^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-11 17:05               ` Rhino
@ 2003-12-11 15:17                 ` William Lee Irwin III
  0 siblings, 0 replies; 80+ messages in thread
From: William Lee Irwin III @ 2003-12-11 15:17 UTC (permalink / raw)
  To: Rhino
  Cc: Nick Piggin, linux-kernel, Anton Blanchard, Ingo Molnar,
	Rusty Russell, Martin J. Bligh, Nakajima, Jun, Mark Wong

On Thu, 11 Dec 2003 03:40:18 -0800 William Lee Irwin III <wli@holomorphy.com> wrote:
>> It might help to check how many processes and/or threads are involved.
>> I've got process scalability stuff in there (I'm not sure how to read
>> your comments though they seem encouraging).

On Thu, Dec 11, 2003 at 01:05:36PM -0400, Rhino wrote:
> heh, they were supposed to. It really looks good.
> if you provide me a test path to address your changes,
> I'll happily put it on.

I don't have anything to swap or reclaim the relevant data structures
yet, which is probably what you'll need.


-- wli

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-11 13:32                     ` William Lee Irwin III
@ 2003-12-11 15:30                       ` Nick Piggin
  2003-12-11 15:38                         ` William Lee Irwin III
  2003-12-12  1:52                         ` [PATCH] improve rwsem scalability (was Re: [CFT][RFC] HT scheduler) Nick Piggin
  0 siblings, 2 replies; 80+ messages in thread
From: Nick Piggin @ 2003-12-11 15:30 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: linux-kernel, Ingo Molnar, Rusty Russell, Anton Blanchard,
	Martin J. Bligh, Nakajima, Jun, Mark Wong



William Lee Irwin III wrote:

>William Lee Irwin III wrote:
>
>>>It will get contention anyway if they're all pounding on the same futex.
>>>OTOH, if they're all threads in the same process, they can hit other
>>>problems. I'll try to find out more about hackbench.
>>>
>
>
>On Fri, Dec 12, 2003 at 12:30:07AM +1100, Nick Piggin wrote:
>
>>Oh, sorry I was talking about volanomark. hackbench AFAIK doesn't use
>>futexes at all, just pipes, and is not threaded at all, so it looks like
>>a different problem to the volanomark one.
>>hackbench runs into trouble at large numbers of tasks too though.
>>
>
>Volano is all one process address space so it could be ->page_table_lock;
>any chance you could find which spin_lock() call the pounded chunk of the
>lock section jumps back to?
>
>

OK, it's in futex_wait, up_read(&current->mm->mmap_sem) right after
out_release_sem (line 517).

So you get half points. Looks like it's waiting on the bus rather than
spinning on a lock. Or am I wrong?



^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-11 15:30                       ` Nick Piggin
@ 2003-12-11 15:38                         ` William Lee Irwin III
  2003-12-11 15:51                           ` Nick Piggin
  2003-12-12  1:52                         ` [PATCH] improve rwsem scalability (was Re: [CFT][RFC] HT scheduler) Nick Piggin
  1 sibling, 1 reply; 80+ messages in thread
From: William Lee Irwin III @ 2003-12-11 15:38 UTC (permalink / raw)
  To: Nick Piggin
  Cc: linux-kernel, Ingo Molnar, Rusty Russell, Anton Blanchard,
	Martin J. Bligh, Nakajima, Jun, Mark Wong

William Lee Irwin III wrote:
>> Volano is all one process address space so it could be ->page_table_lock;
>> any chance you could find which spin_lock() call the pounded chunk of the
>> lock section jumps back to?

On Fri, Dec 12, 2003 at 02:30:27AM +1100, Nick Piggin wrote:
> OK its in futex_wait, up_read(&current->mm->mmap_sem) right after
> out_release_sem (line 517).
> So you get half points. Looks like its waiting on the bus rather than
> spinning on a lock. Or am I'm wrong?

There is a spinloop in __up_read(), which is probably it.


-- wli

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-11 15:38                         ` William Lee Irwin III
@ 2003-12-11 15:51                           ` Nick Piggin
  2003-12-11 15:56                             ` William Lee Irwin III
  0 siblings, 1 reply; 80+ messages in thread
From: Nick Piggin @ 2003-12-11 15:51 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: linux-kernel, Ingo Molnar, Rusty Russell, Anton Blanchard,
	Martin J. Bligh, Nakajima, Jun, Mark Wong



William Lee Irwin III wrote:

>William Lee Irwin III wrote:
>
>>>Volano is all one process address space so it could be ->page_table_lock;
>>>any chance you could find which spin_lock() call the pounded chunk of the
>>>lock section jumps back to?
>>>
>
>On Fri, Dec 12, 2003 at 02:30:27AM +1100, Nick Piggin wrote:
>
>>OK its in futex_wait, up_read(&current->mm->mmap_sem) right after
>>out_release_sem (line 517).
>>So you get half points. Looks like its waiting on the bus rather than
>>spinning on a lock. Or am I'm wrong?
>>
>
>There is a spinloop in __up_read(), which is probably it.
>

Oh I see - in rwsem_wake?



^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-11 15:51                           ` Nick Piggin
@ 2003-12-11 15:56                             ` William Lee Irwin III
  2003-12-11 16:37                               ` Nick Piggin
  0 siblings, 1 reply; 80+ messages in thread
From: William Lee Irwin III @ 2003-12-11 15:56 UTC (permalink / raw)
  To: Nick Piggin
  Cc: linux-kernel, Ingo Molnar, Rusty Russell, Anton Blanchard,
	Martin J. Bligh, Nakajima, Jun, Mark Wong

William Lee Irwin III wrote:
>> There is a spinloop in __up_read(), which is probably it.

On Fri, Dec 12, 2003 at 02:51:32AM +1100, Nick Piggin wrote:
> Oh I see - in rwsem_wake?

Even before that:

static inline void __up_read(struct rw_semaphore *sem)
{
        __s32 tmp = -RWSEM_ACTIVE_READ_BIAS;
        __asm__ __volatile__(
                "# beginning __up_read\n\t"
LOCK_PREFIX     "  xadd      %%edx,(%%eax)\n\t" /* subtracts 1, returns the old value */
                "  js        2f\n\t" /* jump if the lock is being waited upon */
                "1:\n\t"
                LOCK_SECTION_START("")
                "2:\n\t"
                "  decw      %%dx\n\t" /* do nothing if still outstanding active readers
 */
                "  jnz       1b\n\t"
                "  pushl     %%ecx\n\t"
                "  call      rwsem_wake\n\t"
                "  popl      %%ecx\n\t"
                "  jmp       1b\n"
                LOCK_SECTION_END
                "# ending __up_read\n"
                : "=m"(sem->count), "=d"(tmp)
                : "a"(sem), "1"(tmp), "m"(sem->count)
                : "memory", "cc");
}
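
For anyone not fluent in AT&T syntax, here is a rough standalone C11
rendering of that fast-path/slow-path split (an editorial sketch only:
the struct and function names are invented, rwsem_wake_model() is a
stub for rwsem_wake(), and the constants mirror the i386 rwsem layout
of the era):

#include <stdatomic.h>
#include <stdint.h>

#define RWSEM_ACTIVE_READ_BIAS  0x00000001      /* one active locker               */
#define RWSEM_ACTIVE_MASK       0x0000ffff      /* low word = active locker count  */

struct rwsem_model {
        _Atomic int32_t count;                  /* goes negative while waiters are queued */
};

static void rwsem_wake_model(struct rwsem_model *sem)
{
        (void)sem;                              /* stub standing in for rwsem_wake() */
}

static void up_read_model(struct rwsem_model *sem)
{
        /* the lock'ed xadd: atomically drop our reader bias, fetching the old count */
        int32_t old = atomic_fetch_sub(&sem->count, RWSEM_ACTIVE_READ_BIAS);

        /* "js 2f": take the slow path only if the new count went negative (waiters queued) */
        if (old - RWSEM_ACTIVE_READ_BIAS < 0 &&
            /* "decw %dx; jnz 1b": only the last active locker does the wakeup */
            (old & RWSEM_ACTIVE_MASK) == RWSEM_ACTIVE_READ_BIAS)
                rwsem_wake_model(sem);
}

Either way, the lock'ed xadd itself is unconditional, so every
up_read() has to own the count cacheline exclusively even when there
is nobody to wake - which fits the "waiting on the bus" observation.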


-- wli

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-11  4:25       ` [CFT][RFC] HT scheduler Nick Piggin
  2003-12-11  7:24         ` Nick Piggin
@ 2003-12-11 16:28         ` Kevin P. Fleming
  2003-12-11 16:41           ` Nick Piggin
  2003-12-12  2:24         ` Rusty Russell
  2 siblings, 1 reply; 80+ messages in thread
From: Kevin P. Fleming @ 2003-12-11 16:28 UTC (permalink / raw)
  To: LKML

Nick Piggin wrote:

> http://www.kerneltrap.org/~npiggin/w26/
> Against 2.6.0-test11
> 
> This includes the SMT description for P4. Initial results shows comparable
> performance to Ingo's shared runqueue's patch on a dual P4 Xeon.
> 

Is there any value in testing/using this on a single CPU P4-HT system, 
or is it only targeted at multi-CPU systems?


^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-11 15:56                             ` William Lee Irwin III
@ 2003-12-11 16:37                               ` Nick Piggin
  2003-12-11 16:40                                 ` William Lee Irwin III
  0 siblings, 1 reply; 80+ messages in thread
From: Nick Piggin @ 2003-12-11 16:37 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: linux-kernel, Ingo Molnar, Rusty Russell, Anton Blanchard,
	Martin J. Bligh, Nakajima, Jun, Mark Wong



William Lee Irwin III wrote:

>William Lee Irwin III wrote:
>
>>>There is a spinloop in __up_read(), which is probably it.
>>>
>
>On Fri, Dec 12, 2003 at 02:51:32AM +1100, Nick Piggin wrote:
>
>>Oh I see - in rwsem_wake?
>>
>
>Even before that:
>
>static inline void __up_read(struct rw_semaphore *sem)
>{
>        __s32 tmp = -RWSEM_ACTIVE_READ_BIAS;
>        __asm__ __volatile__(
>                "# beginning __up_read\n\t"
>LOCK_PREFIX     "  xadd      %%edx,(%%eax)\n\t" /* subtracts 1, returns the old value */
>                "  js        2f\n\t" /* jump if the lock is being waited upon */
>                "1:\n\t"
>                LOCK_SECTION_START("")
>                "2:\n\t"
>                "  decw      %%dx\n\t" /* do nothing if still outstanding active readers
> */
>                "  jnz       1b\n\t"
>                "  pushl     %%ecx\n\t"
>                "  call      rwsem_wake\n\t"
>                "  popl      %%ecx\n\t"
>                "  jmp       1b\n"
>                LOCK_SECTION_END
>

I think there shouldn't be a loop - remember the lock section.



^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-11 16:37                               ` Nick Piggin
@ 2003-12-11 16:40                                 ` William Lee Irwin III
  0 siblings, 0 replies; 80+ messages in thread
From: William Lee Irwin III @ 2003-12-11 16:40 UTC (permalink / raw)
  To: Nick Piggin
  Cc: linux-kernel, Ingo Molnar, Rusty Russell, Anton Blanchard,
	Martin J. Bligh, Nakajima, Jun, Mark Wong

On Fri, Dec 12, 2003 at 03:37:23AM +1100, Nick Piggin wrote:
> I think there shouldn't be a loop - remember the lock section.

The lock section is irrelevant, but the 1: label isn't actually
behind a jump, so it must be rwsem_wake() or just frequently called.


-- wli

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-11 16:28         ` Kevin P. Fleming
@ 2003-12-11 16:41           ` Nick Piggin
  0 siblings, 0 replies; 80+ messages in thread
From: Nick Piggin @ 2003-12-11 16:41 UTC (permalink / raw)
  To: Kevin P. Fleming; +Cc: LKML



Kevin P. Fleming wrote:

> Nick Piggin wrote:
>
>> http://www.kerneltrap.org/~npiggin/w26/
>> Against 2.6.0-test11
>>
>> This includes the SMT description for P4. Initial results shows 
>> comparable
>> performance to Ingo's shared runqueue's patch on a dual P4 Xeon.
>>
>
> Is there any value in testing/using this on a single CPU P4-HT system, 
> or is it only targeted at multi-CPU systems?


Yes, hopefully there is value in it. It's probably very minor, but it
recognises there is no cache penalty when moving between virtual CPUs,
so it should be able to keep them busy more often.

As I said, it would be very minor.



^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-11  8:14             ` Nick Piggin
@ 2003-12-11 16:49               ` Rhino
  2003-12-11 15:16                 ` Nick Piggin
  0 siblings, 1 reply; 80+ messages in thread
From: Rhino @ 2003-12-11 16:49 UTC (permalink / raw)
  To: Nick Piggin
  Cc: linux-kernel, Anton Blanchard, Ingo Molnar, Rusty Russell,
	Martin J. Bligh, Nakajima, Jun, Mark Wong, wli

[-- Attachment #1: Type: text/plain, Size: 772 bytes --]

On Thu, 11 Dec 2003 19:14:45 +1100
Nick Piggin <piggin@cyberone.com.au> wrote:

> You won't be able to merge mine with Ingo's. Which one put the box on
> steroids? :)

Sorry if I wasn't clear enough, but I merged each one of them separately on top of a wli-2.
That comment was for the wli changes. :)

> Wonder whats going on here? Is this my patch vs Ingo's with nothing else 
> applied?
> How does plain 2.6.0-test11 go?
> 
> Thanks for testing.


ok, this time on a plain test11.

hackbench:

		w26						sched-SMT-C1

	 50	 5.026				 50	 5.182
	100	10.062				100	10.602
	150	15.906				150	16.214


time tar -xvjpf linux-2.6.0-test11.tar.bz2:

		w26						sched-SMT-C1
		
	real	0m21.835s			real	0m21.827s
	user	0m20.274s			user	0m20.412s
	sys	0m4.178s				sys	0m4.260s

[-- Attachment #2: Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-11 11:40             ` William Lee Irwin III
@ 2003-12-11 17:05               ` Rhino
  2003-12-11 15:17                 ` William Lee Irwin III
  0 siblings, 1 reply; 80+ messages in thread
From: Rhino @ 2003-12-11 17:05 UTC (permalink / raw)
  To: William Lee Irwin III
  Cc: Nick Piggin, linux-kernel, Anton Blanchard, Ingo Molnar,
	Rusty Russell, Martin J. Bligh, Nakajima, Jun, Mark Wong

[-- Attachment #1: Type: text/plain, Size: 413 bytes --]

On Thu, 11 Dec 2003 03:40:18 -0800
William Lee Irwin III <wli@holomorphy.com> wrote:


> It might help to check how many processes and/or threads are involved.
> I've got process scalability stuff in there (I'm not sure how to read
> your comments though they seem encouraging).
> 

Heh, they were supposed to be. It really looks good.
If you provide me a test patch for your changes,
I'll happily put it on.

[-- Attachment #2: Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH][RFC] make cpu_sibling_map a cpumask_t
  2003-12-09 13:36   ` Nick Piggin
@ 2003-12-11 21:41     ` bill davidsen
  0 siblings, 0 replies; 80+ messages in thread
From: bill davidsen @ 2003-12-11 21:41 UTC (permalink / raw)
  To: linux-kernel

In article <3FD5CFE1.8080800@cyberone.com.au>,
Nick Piggin  <piggin@cyberone.com.au> wrote:

| Well if (something like) cpu_sibling_map is to become architecture
| independant code, it should probably get something like cpu_to_package,
| package_to_cpumask sort of things like NUMA has.
| 
| I don't know if SMT should become just another level of NUMA because
| its not NUMA of course. For the scheduler it seems to make sense because
| SMT/SMP/NUMA basically want slight variations of the same thing.
| 
| Possibly as you say slab cache could be SMTed, but I'm not convinced
| that SMT and NUMA should become inseperable. Anyway I doubt 2.6 will
| see a general multi level NUMA framework.

It would seem that what concerns all of these levels is the process
migration cost, so perhaps it will be possible in 2.6 after all. Right
now a lot of clever people are actively working on scheduling, so now
is the time for a breakthrough which gives a big boost. Absent that I
predict all of the clever folks and most of us "build a bunch of
kernels and play" types will conclude the low hanging fruit have all
been picked and move on. Scheduling will be stable again.

Before that perhaps one of you will suddenly see some way to make this
all very simple and elegant, so everyone can look and say "of course"
and everything will work optimally.

What we have in all of these patches is a big gain over base, so the
efforts have produced visible gains.
-- 
bill davidsen <davidsen@tmr.com>
  CTO, TMR Associates, Inc
Doing interesting things with little computers since 1979.

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-11  8:57           ` Nick Piggin
  2003-12-11 11:52             ` William Lee Irwin III
@ 2003-12-12  0:58             ` Rusty Russell
  1 sibling, 0 replies; 80+ messages in thread
From: Rusty Russell @ 2003-12-12  0:58 UTC (permalink / raw)
  To: Nick Piggin
  Cc: linux-kernel, Ingo Molnar, Anton Blanchard, Martin J. Bligh,
	Nakajima, Jun, Mark Wong

In message <3FD8317B.4060207@cyberone.com.au> you write:
> 
> 
> Nick Piggin wrote:
> 
> >
> > Don't ask me why it runs out of steam at 150 rooms. hackbench does
> > something similar. I think it might be due to some resource running
> > short, or a scalability problem somewhere else.
> 
> 
> OK, it is spinning on .text.lock.futex. The following results are
> top 10 profiles from a 120 rooms run and a 150 rooms run. The 150
> room run managed only 24.8% the throughput of the 120 room run.
> 
> Might this be a JVM problem?

Not if hackbench is showing it.

> I'm using Sun Java HotSpot(TM) Server VM (build 1.4.2_01-b06, mixed mode)
> 
>             ROOMS          120             150
> PROFILES
> total                   100.0%          100.0%
> default_idle             81.0%           66.8%
> .text.lock.rwsem          4.6%            1.3%
> schedule                  1.9%            1.4%
> .text.lock.futex          1.5%           19.1%

Increase FUTEX_HASHBITS?

Rusty.
--
  Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

^ permalink raw reply	[flat|nested] 80+ messages in thread

* [PATCH] improve rwsem scalability (was Re: [CFT][RFC] HT scheduler)
  2003-12-11 15:30                       ` Nick Piggin
  2003-12-11 15:38                         ` William Lee Irwin III
@ 2003-12-12  1:52                         ` Nick Piggin
  2003-12-12  2:02                           ` Nick Piggin
  2003-12-12  9:41                           ` Ingo Molnar
  1 sibling, 2 replies; 80+ messages in thread
From: Nick Piggin @ 2003-12-12  1:52 UTC (permalink / raw)
  To: linux-kernel
  Cc: William Lee Irwin III, Ingo Molnar, Rusty Russell,
	Anton Blanchard, Martin J. Bligh, Nakajima, Jun, Mark Wong

[-- Attachment #1: Type: text/plain, Size: 1423 bytes --]



Nick Piggin wrote:

>
>
> William Lee Irwin III wrote:
>
>> William Lee Irwin III wrote:
>>
>>>> It will get contention anyway if they're all pounding on the same 
>>>> futex.
>>>> OTOH, if they're all threads in the same process, they can hit other
>>>> problems. I'll try to find out more about hackbench.
>>>>
>>
>>
>> On Fri, Dec 12, 2003 at 12:30:07AM +1100, Nick Piggin wrote:
>>
>>> Oh, sorry I was talking about volanomark. hackbench AFAIK doesn't use
>>> futexes at all, just pipes, and is not threaded at all, so it looks 
>>> like
>>> a different problem to the volanomark one.
>>> hackbench runs into trouble at large numbers of tasks too though.
>>>
>>
>> Volano is all one process address space so it could be 
>> ->page_table_lock;
>> any chance you could find which spin_lock() call the pounded chunk of 
>> the
>> lock section jumps back to?
>>
>>
>
> OK its in futex_wait, up_read(&current->mm->mmap_sem) right after
> out_release_sem (line 517).
>
> So you get half points. Looks like its waiting on the bus rather than
> spinning on a lock. Or am I'm wrong?
>

The following patch moves the rwsem's up_read wakeups out of the
wait_lock. Wakeups need to acquire a runqueue lock, which is probably
getting contended. The following graph is a best-of-3-runs average.
http://www.kerneltrap.org/~npiggin/rwsem.png

The part to look at is the tail. I need to do some more testing to
see if it's significant.
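
Reduced to a standalone sketch, the idea is simply to do nothing but
list surgery under wait_lock and to issue every wake_up_process() only
after the lock is dropped (illustrative only; pthreads stand in for the
kernel primitives here, and the real bookkeeping is in the patch below):

#include <pthread.h>

struct waiter_sketch {
        struct waiter_sketch *next;
        pthread_t             task;             /* stand-in for task_struct *        */
};

static pthread_mutex_t wait_lock = PTHREAD_MUTEX_INITIALIZER;
static struct waiter_sketch *waiters;           /* protected by wait_lock            */

static void wake_one(struct waiter_sketch *w)   /* stand-in for wake_up_process()    */
{
        (void)w;
}

static void wake_all_deferred(void)
{
        struct waiter_sketch *wake_list = 0, *w;

        pthread_mutex_lock(&wait_lock);
        while ((w = waiters) != 0) {            /* under wait_lock: only unlink them  */
                waiters = w->next;
                w->next = wake_list;
                wake_list = w;
        }
        pthread_mutex_unlock(&wait_lock);       /* drop wait_lock before any wakeup   */

        while ((w = wake_list) != 0) {          /* wakeups (which take runqueue locks) */
                wake_list = w->next;            /* now run with wait_lock released     */
                wake_one(w);
        }
}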


[-- Attachment #2: rwsem-scale.patch --]
[-- Type: text/plain, Size: 3744 bytes --]


Move rwsem's up_read wakeups out of the semaphore's wait_lock


 linux-2.6-npiggin/lib/rwsem.c |   39 +++++++++++++++++++++++++--------------
 1 files changed, 25 insertions(+), 14 deletions(-)

diff -puN lib/rwsem.c~rwsem-scale lib/rwsem.c
--- linux-2.6/lib/rwsem.c~rwsem-scale	2003-12-12 12:50:14.000000000 +1100
+++ linux-2.6-npiggin/lib/rwsem.c	2003-12-12 12:50:14.000000000 +1100
@@ -8,7 +8,7 @@
 #include <linux/module.h>
 
 struct rwsem_waiter {
-	struct list_head	list;
+	struct list_head	list, wake_list;
 	struct task_struct	*task;
 	unsigned int		flags;
 #define RWSEM_WAITING_FOR_READ	0x00000001
@@ -38,6 +38,7 @@ void rwsemtrace(struct rw_semaphore *sem
  */
 static inline struct rw_semaphore *__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
 {
+	LIST_HEAD(wake_list);
 	struct rwsem_waiter *waiter;
 	struct list_head *next;
 	signed long oldcount;
@@ -65,7 +66,8 @@ static inline struct rw_semaphore *__rws
 
 	list_del(&waiter->list);
 	waiter->flags = 0;
-	wake_up_process(waiter->task);
+	if (list_empty(&waiter->wake_list))
+		list_add_tail(&waiter->wake_list, &wake_list);
 	goto out;
 
 	/* don't want to wake any writers */
@@ -74,9 +76,10 @@ static inline struct rw_semaphore *__rws
 	if (waiter->flags & RWSEM_WAITING_FOR_WRITE)
 		goto out;
 
-	/* grant an infinite number of read locks to the readers at the front of the queue
-	 * - note we increment the 'active part' of the count by the number of readers (less one
-	 *   for the activity decrement we've already done) before waking any processes up
+	/* grant an infinite number of read locks to the readers at the front
+	 * of the queue - note we increment the 'active part' of the count by
+	 * the number of readers (less one for the activity decrement we've
+	 * already done) before waking any processes up
 	 */
  readers_only:
 	woken = 0;
@@ -100,13 +103,21 @@ static inline struct rw_semaphore *__rws
 		waiter = list_entry(next,struct rwsem_waiter,list);
 		next = waiter->list.next;
 		waiter->flags = 0;
-		wake_up_process(waiter->task);
+		if (list_empty(&waiter->wake_list))
+			list_add_tail(&waiter->wake_list, &wake_list);
 	}
 
 	sem->wait_list.next = next;
 	next->prev = &sem->wait_list;
 
  out:
+	spin_unlock(&sem->wait_lock);
+	while (!list_empty(&wake_list)) {
+		waiter = list_entry(wake_list.next,struct rwsem_waiter,wake_list);
+		list_del_init(&waiter->wake_list);
+		mb();
+		wake_up_process(waiter->task);
+	}
 	rwsemtrace(sem,"Leaving __rwsem_do_wake");
 	return sem;
 
@@ -130,9 +141,9 @@ static inline struct rw_semaphore *rwsem
 	set_task_state(tsk,TASK_UNINTERRUPTIBLE);
 
 	/* set up my own style of waitqueue */
-	spin_lock(&sem->wait_lock);
 	waiter->task = tsk;
-
+	INIT_LIST_HEAD(&waiter->wake_list);
+	spin_lock(&sem->wait_lock);
 	list_add_tail(&waiter->list,&sem->wait_list);
 
 	/* note that we're now waiting on the lock, but no longer actively read-locking */
@@ -143,8 +154,8 @@ static inline struct rw_semaphore *rwsem
 	 */
 	if (!(count & RWSEM_ACTIVE_MASK))
 		sem = __rwsem_do_wake(sem,1);
-
-	spin_unlock(&sem->wait_lock);
+	else
+		spin_unlock(&sem->wait_lock);
 
 	/* wait to be given the lock */
 	for (;;) {
@@ -204,8 +215,8 @@ struct rw_semaphore *rwsem_wake(struct r
 	/* do nothing if list empty */
 	if (!list_empty(&sem->wait_list))
 		sem = __rwsem_do_wake(sem,1);
-
-	spin_unlock(&sem->wait_lock);
+	else
+		spin_unlock(&sem->wait_lock);
 
 	rwsemtrace(sem,"Leaving rwsem_wake");
 
@@ -226,8 +237,8 @@ struct rw_semaphore *rwsem_downgrade_wak
 	/* do nothing if list empty */
 	if (!list_empty(&sem->wait_list))
 		sem = __rwsem_do_wake(sem,0);
-
-	spin_unlock(&sem->wait_lock);
+	else
+		spin_unlock(&sem->wait_lock);
 
 	rwsemtrace(sem,"Leaving rwsem_downgrade_wake");
 	return sem;

_

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] improve rwsem scalability (was Re: [CFT][RFC] HT scheduler)
  2003-12-12  1:52                         ` [PATCH] improve rwsem scalability (was Re: [CFT][RFC] HT scheduler) Nick Piggin
@ 2003-12-12  2:02                           ` Nick Piggin
  2003-12-12  9:41                           ` Ingo Molnar
  1 sibling, 0 replies; 80+ messages in thread
From: Nick Piggin @ 2003-12-12  2:02 UTC (permalink / raw)
  To: Nick Piggin
  Cc: linux-kernel, William Lee Irwin III, Ingo Molnar, Rusty Russell,
	Anton Blanchard, Martin J. Bligh, Nakajima, Jun, Mark Wong

[-- Attachment #1: Type: text/plain, Size: 435 bytes --]



Nick Piggin wrote:

>
> The following patch moves the rwsem's up_read wake ups out of the
> wait_lock. Wakeups need to aquire a runqueue lock which is probably
> getting contended. The following graph is a best of 3 runs average.
> http://www.kerneltrap.org/~npiggin/rwsem.png
>
> The part to look at is the tail. I need to do some more testing to
> see if its significant.


Err, here's one with a comment changed to match, sorry.


[-- Attachment #2: rwsem-scale.patch --]
[-- Type: text/plain, Size: 3984 bytes --]


Move rwsem's up_read wakeups out of the semaphore's wait_lock


 linux-2.6-npiggin/lib/rwsem.c |   41 +++++++++++++++++++++++++++--------------
 1 files changed, 27 insertions(+), 14 deletions(-)

diff -puN lib/rwsem.c~rwsem-scale lib/rwsem.c
--- linux-2.6/lib/rwsem.c~rwsem-scale	2003-12-12 13:01:28.000000000 +1100
+++ linux-2.6-npiggin/lib/rwsem.c	2003-12-12 13:01:45.000000000 +1100
@@ -8,7 +8,7 @@
 #include <linux/module.h>
 
 struct rwsem_waiter {
-	struct list_head	list;
+	struct list_head	list, wake_list;
 	struct task_struct	*task;
 	unsigned int		flags;
 #define RWSEM_WAITING_FOR_READ	0x00000001
@@ -35,9 +35,12 @@ void rwsemtrace(struct rw_semaphore *sem
  * - the spinlock must be held by the caller
  * - woken process blocks are discarded from the list after having flags zeroised
  * - writers are only woken if wakewrite is non-zero
+ *
+ * The spinlock will be dropped by this function
  */
 static inline struct rw_semaphore *__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
 {
+	LIST_HEAD(wake_list);
 	struct rwsem_waiter *waiter;
 	struct list_head *next;
 	signed long oldcount;
@@ -65,7 +68,8 @@ static inline struct rw_semaphore *__rws
 
 	list_del(&waiter->list);
 	waiter->flags = 0;
-	wake_up_process(waiter->task);
+	if (list_empty(&waiter->wake_list))
+		list_add_tail(&waiter->wake_list, &wake_list);
 	goto out;
 
 	/* don't want to wake any writers */
@@ -74,9 +78,10 @@ static inline struct rw_semaphore *__rws
 	if (waiter->flags & RWSEM_WAITING_FOR_WRITE)
 		goto out;
 
-	/* grant an infinite number of read locks to the readers at the front of the queue
-	 * - note we increment the 'active part' of the count by the number of readers (less one
-	 *   for the activity decrement we've already done) before waking any processes up
+	/* grant an infinite number of read locks to the readers at the front
+	 * of the queue - note we increment the 'active part' of the count by
+	 * the number of readers (less one for the activity decrement we've
+	 * already done) before waking any processes up
 	 */
  readers_only:
 	woken = 0;
@@ -100,13 +105,21 @@ static inline struct rw_semaphore *__rws
 		waiter = list_entry(next,struct rwsem_waiter,list);
 		next = waiter->list.next;
 		waiter->flags = 0;
-		wake_up_process(waiter->task);
+		if (list_empty(&waiter->wake_list))
+			list_add_tail(&waiter->wake_list, &wake_list);
 	}
 
 	sem->wait_list.next = next;
 	next->prev = &sem->wait_list;
 
  out:
+	spin_unlock(&sem->wait_lock);
+	while (!list_empty(&wake_list)) {
+		waiter = list_entry(wake_list.next,struct rwsem_waiter,wake_list);
+		list_del_init(&waiter->wake_list);
+		mb();
+		wake_up_process(waiter->task);
+	}
 	rwsemtrace(sem,"Leaving __rwsem_do_wake");
 	return sem;
 
@@ -130,9 +143,9 @@ static inline struct rw_semaphore *rwsem
 	set_task_state(tsk,TASK_UNINTERRUPTIBLE);
 
 	/* set up my own style of waitqueue */
-	spin_lock(&sem->wait_lock);
 	waiter->task = tsk;
-
+	INIT_LIST_HEAD(&waiter->wake_list);
+	spin_lock(&sem->wait_lock);
 	list_add_tail(&waiter->list,&sem->wait_list);
 
 	/* note that we're now waiting on the lock, but no longer actively read-locking */
@@ -143,8 +156,8 @@ static inline struct rw_semaphore *rwsem
 	 */
 	if (!(count & RWSEM_ACTIVE_MASK))
 		sem = __rwsem_do_wake(sem,1);
-
-	spin_unlock(&sem->wait_lock);
+	else
+		spin_unlock(&sem->wait_lock);
 
 	/* wait to be given the lock */
 	for (;;) {
@@ -204,8 +217,8 @@ struct rw_semaphore *rwsem_wake(struct r
 	/* do nothing if list empty */
 	if (!list_empty(&sem->wait_list))
 		sem = __rwsem_do_wake(sem,1);
-
-	spin_unlock(&sem->wait_lock);
+	else
+		spin_unlock(&sem->wait_lock);
 
 	rwsemtrace(sem,"Leaving rwsem_wake");
 
@@ -226,8 +239,8 @@ struct rw_semaphore *rwsem_downgrade_wak
 	/* do nothing if list empty */
 	if (!list_empty(&sem->wait_list))
 		sem = __rwsem_do_wake(sem,0);
-
-	spin_unlock(&sem->wait_lock);
+	else
+		spin_unlock(&sem->wait_lock);
 
 	rwsemtrace(sem,"Leaving rwsem_downgrade_wake");
 	return sem;

_

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-11  4:25       ` [CFT][RFC] HT scheduler Nick Piggin
  2003-12-11  7:24         ` Nick Piggin
  2003-12-11 16:28         ` Kevin P. Fleming
@ 2003-12-12  2:24         ` Rusty Russell
  2003-12-12  7:00           ` Nick Piggin
  2 siblings, 1 reply; 80+ messages in thread
From: Rusty Russell @ 2003-12-12  2:24 UTC (permalink / raw)
  To: Nick Piggin
  Cc: linux-kernel, Anton Blanchard, Ingo Molnar, Martin J. Bligh,
	Nakajima, Jun, Mark Wong

In message <3FD7F1B9.5080100@cyberone.com.au> you write:
> http://www.kerneltrap.org/~npiggin/w26/
> Against 2.6.0-test11
> 
> This includes the SMT description for P4. Initial results shows comparable
> performance to Ingo's shared runqueue's patch on a dual P4 Xeon.

I'm still not convinced.  Sharing runqueues is simple, and in fact
exactly what you want for HT: you want to balance *runqueues*, not
CPUs.  In fact, it can be done without a CONFIG_SCHED_SMT addition.
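
As a rough illustration of "balance runqueues, not CPUs" (an editorial
sketch with invented names -- this is neither Ingo's shared runqueue
patch nor the one promised below):

#define NR_CPUS_SKETCH 8                        /* illustrative */

struct runqueue_sketch {
        unsigned int nr_running;                /* plus the lock, arrays, ... in real life */
};

static struct runqueue_sketch queue_pool[NR_CPUS_SKETCH];
static struct runqueue_sketch *cpu_to_rq[NR_CPUS_SKETCH];

#define cpu_rq_sketch(cpu)      (cpu_to_rq[(cpu)])

/* boot-time hook: a logical CPU gets its own queue... */
static void init_cpu_rq_sketch(int cpu)
{
        cpu_to_rq[cpu] = &queue_pool[cpu];
}

/* ...unless it is an HT sibling, in which case it shares its partner's */
static void share_runqueue_sketch(int cpu, int sibling)
{
        cpu_to_rq[sibling] = cpu_to_rq[cpu];
}

Everything that walks runqueues then treats the two siblings as one
scheduling entity, which is the simplification being argued for.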

Your patch is more general, more complex, but doesn't actually seem to
buy anything.  It puts a general domain structure inside the
scheduler, without putting it anywhere else which wants it (eg. slab
cache balancing).  My opinion is either (1) produce a general NUMA
topology which can then be used by the scheduler, or (2) do the
minimal change in the scheduler which makes HT work well.

Note: some of your changes I really like, it's just that I think this
is overkill.

I'll produce a patch so we can have something solid to talk about.

Cheers,
Rusty.
--
  Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-12  2:24         ` Rusty Russell
@ 2003-12-12  7:00           ` Nick Piggin
  2003-12-12  7:23             ` Rusty Russell
  2003-12-12  8:59             ` Nick Piggin
  0 siblings, 2 replies; 80+ messages in thread
From: Nick Piggin @ 2003-12-12  7:00 UTC (permalink / raw)
  To: Rusty Russell
  Cc: linux-kernel, Anton Blanchard, Ingo Molnar, Martin J. Bligh,
	Nakajima, Jun, Mark Wong



Rusty Russell wrote:

>In message <3FD7F1B9.5080100@cyberone.com.au> you write:
>
>>http://www.kerneltrap.org/~npiggin/w26/
>>Against 2.6.0-test11
>>
>>This includes the SMT description for P4. Initial results shows comparable
>>performance to Ingo's shared runqueue's patch on a dual P4 Xeon.
>>
>
>I'm still not convinced.  Sharing runqueues is simple, and in fact
>exactly what you want for HT: you want to balance *runqueues*, not
>CPUs.  In fact, it can be done without a CONFIG_SCHED_SMT addition.
>
>Your patch is more general, more complex, but doesn't actually seem to
>buy anything.  It puts a general domain structure inside the
>scheduler, without putting it anywhere else which wants it (eg. slab
>cache balancing).  My opinion is either (1) produce a general NUMA
>topology which can then be used by the scheduler, or (2) do the
>minimal change in the scheduler which makes HT work well.
>
>Note: some of your changes I really like, it's just that I think this
>is overkill.
>
>I'll produce a patch so we can have something solid to talk about.
>

Thanks for having a look Rusty. I'll try to convince you :)

As you know, the domain classes are not just for HT: they can handle
multiple levels of NUMA, and they can be built by architecture-specific
code, which is good for Opteron, for example. They don't need
CONFIG_SCHED_SMT either, of course, or even CONFIG_NUMA: degenerate
domains can just be collapsed (the code isn't there to do that now).

Shared runqueues, I find, aren't so flexible. I think they perfectly
describe the P4 HT architecture, but what happens if (when) siblings
get separate L1 caches? What about SMT, CMP, SMP and NUMA levels in the
POWER5?

The large SGI (and, I imagine, IBM's POWER5) systems need things like
progressive balancing backoff and would probably benefit from a more
hierarchical balancing scheme, so all the balancing operations don't
kill the system.

w26 does ALL this, while sched.o is 3K smaller than Ingo's shared runqueue
patch on NUMA and SMP, and 1K smaller on UP (although sched.c is 90 lines
longer). kernbench system time is down nearly 10% on the NUMAQ, so it isn't
hurting performance either.

And finally, Linus also wanted the balancing code to be generalised to
handle SMT, and Ingo said he liked my patch from a first look.



^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-12  7:00           ` Nick Piggin
@ 2003-12-12  7:23             ` Rusty Russell
  2003-12-13  6:43               ` Nick Piggin
  2003-12-12  8:59             ` Nick Piggin
  1 sibling, 1 reply; 80+ messages in thread
From: Rusty Russell @ 2003-12-12  7:23 UTC (permalink / raw)
  To: Nick Piggin
  Cc: linux-kernel, Anton Blanchard, Ingo Molnar, Martin J. Bligh,
	Nakajima, Jun, Mark Wong

In message <3FD9679A.1020404@cyberone.com.au> you write:
> 
> Thanks for having a look Rusty. I'll try to convince you :)
> 
> As you know, the domain classes is not just for HT, but can do multi levels
> of NUMA, and it can be built by architecture specific code which is good
> for Opteron, for example. It doesn't need CONFIG_SCHED_SMT either, of 
> course,
> or CONFIG_NUMA even: degenerate domains can just be collapsed (code isn't
> there to do that now).

Yes, but this isn't what we really want.  I'm actually accusing you of
lacking ambition 8)

> Shared runqueues I find isn't so flexible. I think it perfectly describes
> the P4 HT architecture, but what happens if (when) siblings get seperate
> L1 caches? What about SMT, CMP, SMP and NUMA levels in the POWER5?

It describes every HyperThread implementation I am aware of today, so
it suits us fine for the moment.  Runqueues may still be worth sharing
even if L1 isn't, for example.

> The large SGI (and I imagine IBM's POWER5s) systems need things like
> progressive balancing backoff and would probably benefit with a more
> heirachical balancing scheme so all the balancing operations don't kill
> the system.

But this is my point.  Scheduling is one part of the problem.  I want
to be able to have the arch-specific code feed in a description of
memory and cpu distances, bandwidths and whatever, and have the
scheduler, slab allocator, per-cpu data allocation, page cache, page
migrator and anything else which cares adjust itself based on that.

Power 4 today has pairs of CPUs on a die, four dies on a board, and
four boards in a machine.  I want one infrastructure to describe it,
not have to program every infrastructure from arch-specific code.
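
As a sketch of what such an arch-fed description could look like
(purely illustrative: the struct, field names and cost numbers are
invented, not an existing kernel interface):

struct topo_level_sketch {
        const char   *name;                     /* "cpu", "die", "board", ...           */
        unsigned int  fanout;                   /* children per parent at this level    */
        unsigned int  rel_cost;                 /* relative cost of crossing this level */
};

/* the POWER4-style machine described above, expressed as such a table */
static const struct topo_level_sketch power4_sketch[] = {
        { "cpu on die",       2,  1 },
        { "die on board",     4,  4 },
        { "board in machine", 4, 16 },
};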

> w26 does ALL this, while sched.o is 3K smaller than Ingo's shared runqueue
> patch on NUMA and SMP, and 1K smaller on UP (although sched.c is 90 lines
> longer). kernbench system time is down nearly 10% on the NUMAQ, so it isn't
> hurting performance either.

Agreed, but Ingo's shared runqueue patch is a poor implementation of a
good idea: I've always disliked it.  I'm halfway through updating my
patch, and I really think you'll like it better.  It's not
incompatible with NUMA changes; in fact, it's fairly non-invasive.

> And finally, Linus also wanted the balancing code to be generalised to
> handle SMT, and Ingo said he liked my patch from a first look.

Oh, I like your patch too (except those #defines should really be an
enum).  I just think we can do better with less.

Cheers,
Rusty.
--
  Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-12  7:00           ` Nick Piggin
  2003-12-12  7:23             ` Rusty Russell
@ 2003-12-12  8:59             ` Nick Piggin
  2003-12-12 15:14               ` Martin J. Bligh
  1 sibling, 1 reply; 80+ messages in thread
From: Nick Piggin @ 2003-12-12  8:59 UTC (permalink / raw)
  To: Rusty Russell
  Cc: linux-kernel, Anton Blanchard, Ingo Molnar, Martin J. Bligh,
	Nakajima, Jun, Mark Wong



Nick Piggin wrote:

>
> w26 does ALL this, while sched.o is 3K smaller than Ingo's shared 
> runqueue
> patch on NUMA and SMP, and 1K smaller on UP (although sched.c is 90 lines
> longer). kernbench system time is down nearly 10% on the NUMAQ, so it 
> isn't
> hurting performance either.


Hackbench performance on the NUMAQ is improved by nearly 50% at large
numbers of tasks due to a better scaling factor (which I think is
slightly "more" linear too). It is also improved by nearly 25% (4.08 vs
3.15) on OSDL's 8-ways at small numbers of tasks, due to a better
constant factor.

http://www.kerneltrap.org/~npiggin/w26/hbench.png

And yeah hackbench kills the NUMAQ after about 350 rooms. This is due to
memory shortages. All the processes are getting stuck in shrink_caches,
get_free_pages, etc.



^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] improve rwsem scalability (was Re: [CFT][RFC] HT scheduler)
  2003-12-12  1:52                         ` [PATCH] improve rwsem scalability (was Re: [CFT][RFC] HT scheduler) Nick Piggin
  2003-12-12  2:02                           ` Nick Piggin
@ 2003-12-12  9:41                           ` Ingo Molnar
  2003-12-13  0:07                             ` Nick Piggin
  1 sibling, 1 reply; 80+ messages in thread
From: Ingo Molnar @ 2003-12-12  9:41 UTC (permalink / raw)
  To: Nick Piggin
  Cc: linux-kernel, William Lee Irwin III, Rusty Russell,
	Anton Blanchard, Martin J. Bligh, Nakajima, Jun, Mark Wong


On Fri, 12 Dec 2003, Nick Piggin wrote:

> getting contended. The following graph is a best of 3 runs average.
> http://www.kerneltrap.org/~npiggin/rwsem.png

the graphs are too noisy to be conclusive.

> The part to look at is the tail. I need to do some more testing to see
> if its significant.

yes, could you go from 150 to 300?

	Ingo

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-12  8:59             ` Nick Piggin
@ 2003-12-12 15:14               ` Martin J. Bligh
  0 siblings, 0 replies; 80+ messages in thread
From: Martin J. Bligh @ 2003-12-12 15:14 UTC (permalink / raw)
  To: Nick Piggin, Rusty Russell
  Cc: linux-kernel, Anton Blanchard, Ingo Molnar, Nakajima, Jun, Mark Wong

>> w26 does ALL this, while sched.o is 3K smaller than Ingo's shared 
>> runqueue
>> patch on NUMA and SMP, and 1K smaller on UP (although sched.c is 90 lines
>> longer). kernbench system time is down nearly 10% on the NUMAQ, so it 
>> isn't
>> hurting performance either.
> 
> 
> Hackbench performance on the NUMAQ is improved by nearly 50% at large
> numbers of tasks due to a better scaling factor (which I think is slightly
> "more" linear too). It is also improved by nearly 25% (4.08 vs 3.15) on
> OSDLs 8 ways at small number of tasks, due to a better constant factor.
> 
> http://www.kerneltrap.org/~npiggin/w26/hbench.png
> 
> And yeah hackbench kills the NUMAQ after about 350 rooms. This is due to
> memory shortages. All the processes are getting stuck in shrink_caches,
> get_free_pages, etc.

Can you dump out the values of /proc/meminfo and /proc/slabinfo at that
point, and we'll see what's killing her?

Thanks,

M.


^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] improve rwsem scalability (was Re: [CFT][RFC] HT scheduler)
  2003-12-12  9:41                           ` Ingo Molnar
@ 2003-12-13  0:07                             ` Nick Piggin
  2003-12-14  0:44                               ` Nick Piggin
  0 siblings, 1 reply; 80+ messages in thread
From: Nick Piggin @ 2003-12-13  0:07 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-kernel, William Lee Irwin III, Rusty Russell,
	Anton Blanchard, Martin J. Bligh, Nakajima, Jun, Mark Wong



Ingo Molnar wrote:

>On Fri, 12 Dec 2003, Nick Piggin wrote:
>
>
>>getting contended. The following graph is a best of 3 runs average.
>>http://www.kerneltrap.org/~npiggin/rwsem.png
>>
>
>the graphs are too noise to be conclusive.
>
>
>>The part to look at is the tail. I need to do some more testing to see
>>if its significant.
>>
>
>yes, could you go from 150 to 300?
>

The benchmark dies at 160 rooms unfortunately. Probably something in the 
JVM.

I'll do a larger number of runs around the 130-150 mark.



^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-12  7:23             ` Rusty Russell
@ 2003-12-13  6:43               ` Nick Piggin
  2003-12-14  1:35                 ` bill davidsen
                                   ` (2 more replies)
  0 siblings, 3 replies; 80+ messages in thread
From: Nick Piggin @ 2003-12-13  6:43 UTC (permalink / raw)
  To: Rusty Russell
  Cc: linux-kernel, Anton Blanchard, Ingo Molnar, Martin J. Bligh,
	Nakajima, Jun, Mark Wong



Rusty Russell wrote:

>In message <3FD9679A.1020404@cyberone.com.au> you write:
>
>>Thanks for having a look Rusty. I'll try to convince you :)
>>
>>As you know, the domain classes is not just for HT, but can do multi levels
>>of NUMA, and it can be built by architecture specific code which is good
>>for Opteron, for example. It doesn't need CONFIG_SCHED_SMT either, of 
>>course,
>>or CONFIG_NUMA even: degenerate domains can just be collapsed (code isn't
>>there to do that now).
>>
>
>Yes, but this isn't what we really want.  I'm actually accusing you of
>lacking ambition 8)
>
>
>>Shared runqueues I find isn't so flexible. I think it perfectly describes
>>the P4 HT architecture, but what happens if (when) siblings get seperate
>>L1 caches? What about SMT, CMP, SMP and NUMA levels in the POWER5?
>>
>
>It describes every HyperThread implementation I am aware of today, so
>it suits us fine for the moment.  Runqueues may still be worth sharing
>even if L1 isn't, for example.
>

Possibly. But it restricts your load balancing to a specific case, and
it eliminates any possibility of CPU affinity: with 4 running threads
on 1 HT CPU, for example, they'll happily ping-pong from one cpu to the
other.

I could get domains to do the same thing, but at the moment a CPU only
looks at its sibling's runqueue if they are unbalanced or it is about
to become idle. I'm pretty sure domains can do anything shared
runqueues can. I don't know whether you're disputing this or not.
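
A standalone model of that sibling-peek rule, just to make the
described behaviour concrete (editorial sketch: the names and the
imbalance threshold are invented, and this is not the w26 code):

#include <stdbool.h>

struct cpu_load_sketch {
        unsigned int nr_running;                /* runnable tasks on this logical CPU */
        bool         about_to_idle;             /* nothing left to run locally        */
};

static bool peek_sibling_queue(const struct cpu_load_sketch *self,
                               const struct cpu_load_sketch *sibling)
{
        if (self->about_to_idle && sibling->nr_running > 1)
                return true;                    /* newly idle: worth stealing a task  */

        return sibling->nr_running > self->nr_running + 1;  /* clearly unbalanced     */
}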

>
>>The large SGI (and I imagine IBM's POWER5s) systems need things like
>>progressive balancing backoff and would probably benefit with a more
>>heirachical balancing scheme so all the balancing operations don't kill
>>the system.
>>
>
>But this is my point.  Scheduling is one part of the problem.  I want
>to be able to have the arch-specific code feed in a description of
>memory and cpu distances, bandwidths and whatever, and have the
>scheduler, slab allocator, per-cpu data allocation, page cache, page
>migrator and anything else which cares adjust itself based on that.
>
>Power 4 today has pairs of CPUs on a die, four dies on a board, and
>four boards in a machine.  I want one infrastructure to descibe it,
>not have to do program every infrastructure from arch-specific code.
>

(Plus two threads / siblings per CPU, right?)

I agree with you here. You know, we could rename struct sched_domain,
add a few fields to it, and it becomes what you want. It's a
_hierarchical set_ of _sets of cpus sharing a certain property_
(underlining to aid grouping).

Uniform access to certain memory ranges could easily be one of these
properties. There is already some info about the amount of cache
shared; that could also be expanded on.

(Perhaps some exotic architecture would like scheduling and memory a
bit more decoupled, but designing for *that* before hitting it would be
over-engineering.)
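
A sketch of that "hierarchical set of sets of cpus sharing a certain
property" shape (field names invented for illustration; they do not
match the actual w26 struct sched_domain):

typedef unsigned long cpumask_sketch_t;         /* stand-in for cpumask_t              */

#define PROP_SHARE_CACHE        0x1             /* siblings: no cache penalty to move   */
#define PROP_UNIFORM_MEMORY     0x2             /* same memory access cost across span  */

struct domain_level_sketch {
        cpumask_sketch_t            span;       /* the set of CPUs this level covers    */
        unsigned long               properties; /* PROP_* shared across that span       */
        unsigned int                balance_interval; /* balancing backoff per level    */
        struct domain_level_sketch *parent;     /* sibling -> package -> node -> ...    */
};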

I'm not going to do that, because 2.6 doesn't need a generalised
topology: nothing makes use of it. Perhaps if something really good
came up in 2.7, there would be a case for backporting it. 2.6 does need
improvements to the scheduler, though.

>
>>w26 does ALL this, while sched.o is 3K smaller than Ingo's shared runqueue
>>patch on NUMA and SMP, and 1K smaller on UP (although sched.c is 90 lines
>>longer). kernbench system time is down nearly 10% on the NUMAQ, so it isn't
>>hurting performance either.
>>
>
>Agreed, but Ingo's shared runqueue patch is poor implementation of a
>good idea: I've always disliked it.  I'm halfway through updating my
>patch, and I really think you'll like it better.  It's not
>incompatible with NUMA changes, in fact it's fairly non-invasive.
>

But if sched domains are accepted, there is no need for shared
runqueues because, as I said, domains can do anything shared runqueues
can, so the code would just be a redundant specialisation - unless you
specifically wanted to share locks & data with siblings.

I must admit I didn't look at your implementation. I look forward to it.
I'm not against shared runqueues. If my stuff doesn't get taken then of
course I'd rather shrq gets in than nothing at all, as I told Ingo. I
obviously just like sched domains better ;)



^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] improve rwsem scalability (was Re: [CFT][RFC] HT scheduler)
  2003-12-13  0:07                             ` Nick Piggin
@ 2003-12-14  0:44                               ` Nick Piggin
  2003-12-17  5:27                                 ` Nick Piggin
  0 siblings, 1 reply; 80+ messages in thread
From: Nick Piggin @ 2003-12-14  0:44 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-kernel, William Lee Irwin III, Rusty Russell,
	Anton Blanchard, Martin J. Bligh, Nakajima, Jun, Mark Wong



Nick Piggin wrote:

>
>
> Ingo Molnar wrote:
>
>> On Fri, 12 Dec 2003, Nick Piggin wrote:
>>
>>
>>> getting contended. The following graph is a best of 3 runs average.
>>> http://www.kerneltrap.org/~npiggin/rwsem.png
>>>
>>
>> the graphs are too noise to be conclusive.
>>
>>
>>> The part to look at is the tail. I need to do some more testing to see
>>> if its significant.
>>>
>>
>> yes, could you go from 150 to 300?
>>
>
> The benchmark dies at 160 rooms unfortunately. Probably something in 
> the JVM.
>
> I'll do a larger number of runs around the 130-150 mark.
>

OK, this is an average of 5 runs at 145, 150, 155 rooms with my
scheduler patches, with and without my rwsem patch. It's all over the
place, but I think rwsem does give a small but significant improvement.

http://www.kerneltrap.org/~npiggin/rwsem2.png


^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-13  6:43               ` Nick Piggin
@ 2003-12-14  1:35                 ` bill davidsen
  2003-12-14  2:18                   ` Nick Piggin
  2003-12-15  5:53                 ` Rusty Russell
  2003-12-15 20:21                 ` Zwane Mwaikambo
  2 siblings, 1 reply; 80+ messages in thread
From: bill davidsen @ 2003-12-14  1:35 UTC (permalink / raw)
  To: linux-kernel

In article <3FDAB517.4000309@cyberone.com.au>,
Nick Piggin  <piggin@cyberone.com.au> wrote:

| Possibly. But it restricts your load balancing to a specific case, it
| eliminates any possibility of CPU affinity: 4 running threads on 1 HT
| CPU for example, they'll ping pong from one cpu to the other happily.

Huh? I hope you meant sibling and not CPU, here.

| I could get domains to do the same thing, but at the moment a CPU only looks
| at its sibling's runqueue if they are unbalanced or is about to become idle.
| I'm pretty sure domains can do anything shared runqueues can. I don't know
| if you're disputing this or not?

Shared runqueues sound like a simplification to describe execution
units which have shared resources and null cost of changing units. You
can do that by having a domain which behaves like that, but a shared
runqueue sounds better because it would eliminate the cost of even
considering moving a process from one sibling to another.

Sorry if I'm mixing features of Con/Nick/Ingo approaches here, I just
spent some time looking at the code for several approaches, thinking
about moving jobs from the end of the runqueue vs. the head, in terms of
fair vs. overall cache impact.

| >But this is my point.  Scheduling is one part of the problem.  I want
| >to be able to have the arch-specific code feed in a description of
| >memory and cpu distances, bandwidths and whatever, and have the
| >scheduler, slab allocator, per-cpu data allocation, page cache, page
| >migrator and anything else which cares adjust itself based on that.

And would shared runqueues not fit into that model?

| (Plus two threads / siblings per CPU, right?)
| 
| I agree with you here. You know, we could rename struct sched_domain, add
| a few fields to it and it becomes what you want. Its a _heirachical set_
| of _sets of cpus sharing a certian property_ (underlining to aid grouping.
| 
| Uniform access to certian memory ranges could easily be one of these
| properties. There is already some info about the amount of cache shared,
| that also could be expanded on.
| 
| (Perhaps some exotic architecture  would like scheduling and memory a bit
| more decoupled, but designing for *that* before hitting it would be over
| engineering).
| 
| I'm not going to that because 2.6 doesn't need a generalised topology
| because nothing makes use of it. Perhaps if something really good came up
| in 2.7, there would be a case for backporting it. 2.6 does need improvements
| to the scheduler though.

| But if sched domains are accepted, there is no need for shared runqueues,
| because as I said they can do anything sched domains can, so the code would
| just be a redundant specialisation - unless you specifically wanted to share
| locks & data with siblings.

I doubt the gain would be worth the complexity, but what do I know?

-- 
bill davidsen <davidsen@tmr.com>
  CTO, TMR Associates, Inc
Doing interesting things with little computers since 1979.

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-14  1:35                 ` bill davidsen
@ 2003-12-14  2:18                   ` Nick Piggin
  2003-12-14  4:32                     ` Jamie Lokier
  2003-12-16 17:34                     ` Bill Davidsen
  0 siblings, 2 replies; 80+ messages in thread
From: Nick Piggin @ 2003-12-14  2:18 UTC (permalink / raw)
  To: bill davidsen; +Cc: linux-kernel



bill davidsen wrote:

>In article <3FDAB517.4000309@cyberone.com.au>,
>Nick Piggin  <piggin@cyberone.com.au> wrote:
>
>| Possibly. But it restricts your load balancing to a specific case, it
>| eliminates any possibility of CPU affinity: 4 running threads on 1 HT
>| CPU for example, they'll ping pong from one cpu to the other happily.
>
>Huh? I hope you meant sibling and not CPU, here.
>

It was late when I wrote that! You are right, of course.

>
>
>| I could get domains to do the same thing, but at the moment a CPU only looks
>| at its sibling's runqueue if they are unbalanced or is about to become idle.
>| I'm pretty sure domains can do anything shared runqueues can. I don't know
>| if you're disputing this or not?
>
>Shared runqueues sound like a simplification to describe execution units
>which have shared resourses and null cost of changing units. You can do
>that by having a domain which behaved like that, but a shared runqueue
>sounds better because it would eliminate the cost of even considering
>moving a process from one sibling to another.
>

You are correct; however, it would be a minuscule cost advantage,
possibly outweighed by the shared lock and the overhead of tasks
changing CPUs more often (I'm sure there would be some cost).

>
>Sorry if I'm mixing features of Con/Nick/Ingo approaches here, I just
>spent some time looking at the code for several approaches, thinking
>about moving jobs from the end of the runqueue vs. the head, in terms of
>fair vs. overall cache impact.
>
>| >But this is my point.  Scheduling is one part of the problem.  I want
>| >to be able to have the arch-specific code feed in a description of
>| >memory and cpu distances, bandwidths and whatever, and have the
>| >scheduler, slab allocator, per-cpu data allocation, page cache, page
>| >migrator and anything else which cares adjust itself based on that.
>
>And would shared runqueues not fit into that model?
>

Shared runqueues could be built from that model.

>
>
>| (Plus two threads / siblings per CPU, right?)
>| 
>| I agree with you here. You know, we could rename struct sched_domain, add
>| a few fields to it and it becomes what you want. Its a _heirachical set_
>| of _sets of cpus sharing a certian property_ (underlining to aid grouping.
>| 
>| Uniform access to certian memory ranges could easily be one of these
>| properties. There is already some info about the amount of cache shared,
>| that also could be expanded on.
>| 
>| (Perhaps some exotic architecture  would like scheduling and memory a bit
>| more decoupled, but designing for *that* before hitting it would be over
>| engineering).
>| 
>| I'm not going to that because 2.6 doesn't need a generalised topology
>| because nothing makes use of it. Perhaps if something really good came up
>| in 2.7, there would be a case for backporting it. 2.6 does need improvements
>| to the scheduler though.
>
>| But if sched domains are accepted, there is no need for shared runqueues,
>| because as I said they can do anything sched domains can, so the code would
>| just be a redundant specialisation - unless you specifically wanted to share
>| locks & data with siblings.
>
>I doubt the gain would be worth the complexity, but what do I know?
>

Sorry, I didn't follow: gain and complexity of what?

Earlier in the thread Ingo thought my approach was simpler. Source size
is about the same, object size for my patch is significantly smaller,
and it does more. Benchmarks have so far shown that my patch is as
performant as shared runqueues.



^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-14  2:18                   ` Nick Piggin
@ 2003-12-14  4:32                     ` Jamie Lokier
  2003-12-14  9:40                       ` Nick Piggin
  2003-12-16 18:22                       ` Linus Torvalds
  2003-12-16 17:34                     ` Bill Davidsen
  1 sibling, 2 replies; 80+ messages in thread
From: Jamie Lokier @ 2003-12-14  4:32 UTC (permalink / raw)
  To: Nick Piggin; +Cc: bill davidsen, linux-kernel

Nick Piggin wrote:
> >Shared runqueues sound like a simplification to describe execution units
> >which have shared resourses and null cost of changing units. You can do
> >that by having a domain which behaved like that, but a shared runqueue
> >sounds better because it would eliminate the cost of even considering
> >moving a process from one sibling to another.
> 
> You are correct, however it would be a miniscule cost advantage,
> possibly outweighed by the shared lock, and overhead of more
> changing of CPUs (I'm sure there would be some cost).

Regarding the overhead of the shared runqueue lock:

Is the "lock" prefix actually required for locking between x86
siblings which share the same L1 cache?

-- Jamie

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-14  4:32                     ` Jamie Lokier
@ 2003-12-14  9:40                       ` Nick Piggin
  2003-12-14 10:46                         ` Arjan van de Ven
  2003-12-16 17:46                         ` Bill Davidsen
  2003-12-16 18:22                       ` Linus Torvalds
  1 sibling, 2 replies; 80+ messages in thread
From: Nick Piggin @ 2003-12-14  9:40 UTC (permalink / raw)
  To: Jamie Lokier; +Cc: bill davidsen, linux-kernel



Jamie Lokier wrote:

>Nick Piggin wrote:
>
>>>Shared runqueues sound like a simplification to describe execution units
>>>which have shared resourses and null cost of changing units. You can do
>>>that by having a domain which behaved like that, but a shared runqueue
>>>sounds better because it would eliminate the cost of even considering
>>>moving a process from one sibling to another.
>>>
>>You are correct, however it would be a miniscule cost advantage,
>>possibly outweighed by the shared lock, and overhead of more
>>changing of CPUs (I'm sure there would be some cost).
>>
>
>Regarding the overhead of the shared runqueue lock:
>
>Is the "lock" prefix actually required for locking between x86
>siblings which share the same L1 cache?
>

That lock is still taken by other CPUs as well, e.g. for wakeups,
balancing, and so forth. I guess it could be a very specific
optimisation for spinlocks in general if there were only one HT core.
Don't know if it would be worthwhile though.



^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-14  9:40                       ` Nick Piggin
@ 2003-12-14 10:46                         ` Arjan van de Ven
  2003-12-16 17:46                         ` Bill Davidsen
  1 sibling, 0 replies; 80+ messages in thread
From: Arjan van de Ven @ 2003-12-14 10:46 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Jamie Lokier, bill davidsen, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 642 bytes --]


> >Regarding the overhead of the shared runqueue lock:
> >
> >Is the "lock" prefix actually required for locking between x86
> >siblings which share the same L1 cache?
> >
> 
> That lock is still taken by other CPUs as well for eg. wakeups, balancing,
> and so forth. I guess it could be a very specific optimisation for
> spinlocks in general if there was only one HT core. Don't know if it
> would be worthwhile though.

also keep in mind that current x86 processors all will internally
optimize out the lock prefix in UP mode or when the cacheline is owned
exclusive.... If HT matters here let the cpu optimize it out.....

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-13  6:43               ` Nick Piggin
  2003-12-14  1:35                 ` bill davidsen
@ 2003-12-15  5:53                 ` Rusty Russell
  2003-12-15 23:08                   ` Nick Piggin
  2004-01-03 18:57                   ` Bill Davidsen
  2003-12-15 20:21                 ` Zwane Mwaikambo
  2 siblings, 2 replies; 80+ messages in thread
From: Rusty Russell @ 2003-12-15  5:53 UTC (permalink / raw)
  To: Nick Piggin
  Cc: linux-kernel, Anton Blanchard, Ingo Molnar, Martin J. Bligh,
	Nakajima, Jun, Mark Wong

In message <3FDAB517.4000309@cyberone.com.au> you write:
> Rusty Russell wrote:
> 
> >In message <3FD9679A.1020404@cyberone.com.au> you write:
> >
> >>Thanks for having a look Rusty. I'll try to convince you :)

Actually, having produced the patch, I've changed my mind.

While it was spiritually rewarding to separate "struct runqueue" into
the stuff which was to do with the runqueue, and the stuff which was
per-cpu but there because it was convenient, I'm not sure the churn is
worthwhile since we will want the rest of your stuff anyway.

It (and lots of other things) might become worthwhile if single
processors with HT become the de-facto standard.  For these, lots of
our assumptions about CONFIG_SMP, such as the desirability of per-cpu
data, become bogus.

A few things need work:

1) There's a race between sys_sched_setaffinity() and
   sched_migrate_task() (this is nothing to do with your patch).

2) Please change those #defines into an enum for idle (patch follows,
   untested but trivial)

3) conditional locking in load_balance is v. bad idea.

4) load_balance returns "(!failed && !balanced)", but callers stop
   calling it when it returns true.  Why not simply return "balanced",
   or at least "balanced && !failed"?

Untested patch for #2 below...
Rusty.
--
  Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

D: Change #defines to enum, and use it.

diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal working-2.6.0-test11-w26/kernel/sched.c working-2.6.0-test11-w26-enum/kernel/sched.c
--- working-2.6.0-test11-w26/kernel/sched.c	2003-12-15 16:31:15.000000000 +1100
+++ working-2.6.0-test11-w26-enum/kernel/sched.c	2003-12-15 16:38:49.000000000 +1100
@@ -1268,9 +1268,12 @@ out:
 	return pulled;
 }
 
-#define TYPE_NOIDLE	0
-#define TYPE_IDLE	1
-#define TYPE_NEWIDLE	2
+enum idle_type
+{
+	NOT_IDLE,
+	AM_IDLE,
+	NEWLY_IDLE,
+};
 
 /*
  * find_busiest_group finds and returns the busiest CPU group within the
@@ -1279,11 +1282,11 @@ out:
  */
 static struct sched_group *
 find_busiest_group(struct sched_domain *domain, int this_cpu,
-				unsigned long *imbalance, int idle)
+				unsigned long *imbalance, enum idle_type idle)
 {
 	unsigned long max_load, avg_load, total_load, this_load;
 	int modify, total_nr_cpus;
-	int package_idle = idle;
+	enum idle_type package_idle = idle;
 	struct sched_group *busiest = NULL, *group = domain->groups;
 
 	max_load = 0;
@@ -1299,7 +1302,7 @@ find_busiest_group(struct sched_domain *
 	 * statistics: its triggered by some value of nr_running (ie. 0).
 	 * Timer based balancing is a good statistic though.
 	 */
-	if (idle == TYPE_NEWIDLE)
+	if (idle == NEWLY_IDLE)
 		modify = 0;
 	else
 		modify = 1;
@@ -1318,7 +1321,7 @@ find_busiest_group(struct sched_domain *
 			if (local_group) {
 				load = get_high_cpu_load(i, modify);
 				if (!idle_cpu(i))
-					package_idle = 0;
+					package_idle = NOT_IDLE;
 			} else
 				load = get_low_cpu_load(i, modify);
 
@@ -1403,11 +1406,11 @@ static runqueue_t *find_busiest_queue(st
  * Check this_cpu to ensure it is balanced within domain. Attempt to move
  * tasks if there is an imbalance.
  *
- * Called with this_rq unlocked unless idle == TYPE_NEWIDLE, in which case
+ * Called with this_rq unlocked unless idle == NEWLY_IDLE, in which case
  * it is called with this_rq locked.
  */
 static int load_balance(int this_cpu, runqueue_t *this_rq,
-				struct sched_domain *domain, int idle)
+			struct sched_domain *domain, enum idle_type idle)
 {
 	struct sched_group *group;
 	runqueue_t *busiest = NULL;
@@ -1415,7 +1418,7 @@ static int load_balance(int this_cpu, ru
 	int balanced = 0, failed = 0;
 	int wake_mthread = 0;
 
-	if (idle != TYPE_NEWIDLE)
+	if (idle != NEWLY_IDLE)
 		spin_lock(&this_rq->lock);
 
 	group = find_busiest_group(domain, this_cpu, &imbalance, idle);
@@ -1456,7 +1459,7 @@ static int load_balance(int this_cpu, ru
 	spin_unlock(&busiest->lock);
 
 out:
-	if (idle != TYPE_NEWIDLE) {
+	if (idle != NEWLY_IDLE) {
 		spin_unlock(&this_rq->lock);
 
 		if (failed)
@@ -1495,7 +1498,7 @@ static inline void idle_balance(int this
 			break;
 
 		if (domain->flags & SD_FLAG_NEWIDLE) {
-			if (load_balance(this_cpu, this_rq, domain, TYPE_NEWIDLE)) {
+			if (load_balance(this_cpu, this_rq, domain, NEWLY_IDLE)) {
 				/* We've pulled tasks over so stop searching */
 				break;
 			}
@@ -1537,7 +1540,7 @@ static void active_load_balance(runqueue
 /* Don't have all balancing operations going off at once */
 #define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS)
 
-static void rebalance_tick(int this_cpu, runqueue_t *this_rq, int idle)
+static void rebalance_tick(int this_cpu, runqueue_t *this_rq, enum idle_type idle)
 {
 	unsigned long j = jiffies + CPU_OFFSET(this_cpu);
 	struct sched_domain *domain = this_sched_domain();
@@ -1556,7 +1559,7 @@ static void rebalance_tick(int this_cpu,
 		if (!(j % modulo)) {
 			if (load_balance(this_cpu, this_rq, domain, idle)) {
 				/* We've pulled tasks over so no longer idle */
-				idle = 0;
+				idle = NOT_IDLE;
 			}
 		}
 
@@ -1567,7 +1570,7 @@ static void rebalance_tick(int this_cpu,
 /*
  * on UP we do not need to balance between CPUs:
  */
-static inline void rebalance_tick(int this_cpu, runqueue_t *this_rq, int idle)
+static inline void rebalance_tick(int this_cpu, runqueue_t *this_rq, enum idle_type idle)
 {
 }
 #endif
@@ -1621,7 +1624,7 @@ void scheduler_tick(int user_ticks, int 
 			cpustat->iowait += sys_ticks;
 		else
 			cpustat->idle += sys_ticks;
-		rebalance_tick(cpu, rq, 1);
+		rebalance_tick(cpu, rq, AM_IDLE);
 		return;
 	}
 	if (TASK_NICE(p) > 0)
@@ -1703,7 +1706,7 @@ void scheduler_tick(int user_ticks, int 
 out_unlock:
 	spin_unlock(&rq->lock);
 out:
-	rebalance_tick(cpu, rq, 0);
+	rebalance_tick(cpu, rq, NOT_IDLE);
 }
 
 void scheduling_functions_start_here(void) { }




^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-13  6:43               ` Nick Piggin
  2003-12-14  1:35                 ` bill davidsen
  2003-12-15  5:53                 ` Rusty Russell
@ 2003-12-15 20:21                 ` Zwane Mwaikambo
  2003-12-15 23:20                   ` Nick Piggin
  2 siblings, 1 reply; 80+ messages in thread
From: Zwane Mwaikambo @ 2003-12-15 20:21 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Rusty Russell, linux-kernel, Anton Blanchard, Ingo Molnar,
	Martin J. Bligh, Nakajima, Jun, Mark Wong

Hello Nick,
	This is somewhat unrelated to the current thread topic direction,
but I noticed that your HT scheduler behaves really oddly when the box is
used interactively. The one application which really seemed to be
misbehaving was 'gaim': whenever someone typed a message, X11 would pause
for a second or so until the text displayed, causing the mouse to jump
around erratically. I can try to help with this, but unfortunately I can't
do too much rebooting this week due to work (main workstation); perhaps
we can discuss it over a weekend. The system is 4x logical 2.0GHz.

Thanks

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-15  5:53                 ` Rusty Russell
@ 2003-12-15 23:08                   ` Nick Piggin
  2003-12-19  4:57                     ` Nick Piggin
  2004-01-03 18:57                   ` Bill Davidsen
  1 sibling, 1 reply; 80+ messages in thread
From: Nick Piggin @ 2003-12-15 23:08 UTC (permalink / raw)
  To: Rusty Russell
  Cc: linux-kernel, Anton Blanchard, Ingo Molnar, Martin J. Bligh,
	Nakajima, Jun, Mark Wong, John Hawkes



Rusty Russell wrote:

>In message <3FDAB517.4000309@cyberone.com.au> you write:
>
>>Rusty Russell wrote:
>>
>>
>>>In message <3FD9679A.1020404@cyberone.com.au> you write:
>>>
>>>
>>>>Thanks for having a look Rusty. I'll try to convince you :)
>>>>
>
>Actually, having produced the patch, I've changed my mind.
>
>While it was spiritually rewarding to separate "struct runqueue" into
>the stuff which was to do with the runqueue, and the stuff which was
>per-cpu but there because it was convenient, I'm not sure the churn is
>worthwhile since we will want the rest of your stuff anyway.
>

OK nice, I haven't heard any other objections. I'll be trying to get
this included in 2.6, so if anyone doesn't like it please speak up.

>
>It (and lots of other things) might become worthwhile if single
>processors with HT become the de-facto standard.  For these, lots of
>our assumptions about CONFIG_SMP, such as the desirability of per-cpu
>data, become bogus.
>
>A few things need work:
>
>1) There's a race between sys_sched_setaffinity() and
>   sched_migrate_task() (this is nothing to do with your patch).
>

Yep. They should both take the task's runqueue lock.
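
Something along these lines should do it (a sketch only, using the existing
task_rq_lock/task_rq_unlock helpers; the body is illustrative, not the real
patch):

static void sched_migrate_task(task_t *p, int dest_cpu)
{
	unsigned long flags;
	runqueue_t *rq = task_rq_lock(p, &flags);

	if (cpu_isset(dest_cpu, p->cpus_allowed)) {
		/*
		 * Adjust cpus_allowed / queue the migration request here,
		 * while the mask cannot change underneath us.
		 */
	}
	task_rq_unlock(rq, &flags);
}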

>
>2) Please change those #defines into an enum for idle (patch follows,
>   untested but trivial)
>

Thanks, I'll take the patch.

>
>3) conditional locking in load_balance is v. bad idea.
>

Yeah... I'm thinking about this. I don't think it should be too hard
to break out the shared portion.

>
>4) load_balance returns "(!failed && !balanced)", but callers stop
>   calling it when it returns true.  Why not simply return "balanced",
>   or at least "balanced && !failed"?
>
>

No: the idle balancer stops calling it when it returns true, while the
periodic balancer sets idle to 0 when it returns true.

!balanced && !failed means it has moved a task.

I'll either comment that, or return it in a more direct way.
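
The more direct way would be for load_balance to return the number of tasks
it actually moved, so callers just test for non-zero. A sketch (signature as
in the posted patch, body elided):

static int load_balance(int this_cpu, runqueue_t *this_rq,
			struct sched_domain *domain, enum idle_type idle)
{
	int nr_moved = 0;

	/*
	 * ... find the busiest group/queue and pull tasks, bumping
	 * nr_moved for each task successfully moved ...
	 */

	return nr_moved;	/* 0 means nothing was pulled */
}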



^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-15 20:21                 ` Zwane Mwaikambo
@ 2003-12-15 23:20                   ` Nick Piggin
  2003-12-16  0:11                     ` Zwane Mwaikambo
  0 siblings, 1 reply; 80+ messages in thread
From: Nick Piggin @ 2003-12-15 23:20 UTC (permalink / raw)
  To: Zwane Mwaikambo
  Cc: Rusty Russell, linux-kernel, Anton Blanchard, Ingo Molnar,
	Martin J. Bligh, Nakajima, Jun, Mark Wong



Zwane Mwaikambo wrote:

>Hello Nick,
>	This is somewhat unrelated to the current thread topic direction,
>but i noticed that your HT scheduler really behaves oddly when the box is
>used interactively. The one application which really seemed to be
>misbehaving was 'gaim', whenever someone typed a message X11 would pause
>for a second or so until the text displayed causing the mouse to jump
>around erratically. I can try help with this but unfortunately i can't do
>too much rebooting this week due to work (main workstation), but perhaps
>we can discuss it over a weekend. The system is 4x logical 2.0GHz.
>

Hi Zwane,
That's weird. I assume you don't have problems with the regular SMP kernel.
Unfortunately I don't have an HT box I can do these sorts of tests with.
The first thing you could try is with CONFIG_SCHED_SMT set to 'N'.

Nick


^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-15 23:20                   ` Nick Piggin
@ 2003-12-16  0:11                     ` Zwane Mwaikambo
  0 siblings, 0 replies; 80+ messages in thread
From: Zwane Mwaikambo @ 2003-12-16  0:11 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Rusty Russell, linux-kernel, Anton Blanchard, Ingo Molnar,
	Martin J. Bligh, Nakajima, Jun, Mark Wong

On Tue, 16 Dec 2003, Nick Piggin wrote:

> Thats weird. I assume you don't have problems with the regular SMP kernel.
> Unfortunately I don't have an HT box I can do these sorts of tests with.
> The first thing you could try is with CONFIG_SCHED_SMT 'N'.

Yes, the regular SMP kernel is fine; the patched kernel is rather hard to
work with, so I had to back it out. I'll give it a go with CONFIG_SCHED_SMT
a bit later in the week and let you know.

Thanks

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-14  2:18                   ` Nick Piggin
  2003-12-14  4:32                     ` Jamie Lokier
@ 2003-12-16 17:34                     ` Bill Davidsen
  1 sibling, 0 replies; 80+ messages in thread
From: Bill Davidsen @ 2003-12-16 17:34 UTC (permalink / raw)
  To: Nick Piggin; +Cc: linux-kernel

On Sun, 14 Dec 2003, Nick Piggin wrote:

> 
> 
> bill davidsen wrote:
> 
> >In article <3FDAB517.4000309@cyberone.com.au>,
> >Nick Piggin  <piggin@cyberone.com.au> wrote:

> >Shared runqueues sound like a simplification to describe execution units
> >which have shared resourses and null cost of changing units. You can do
> >that by having a domain which behaved like that, but a shared runqueue
> >sounds better because it would eliminate the cost of even considering
> >moving a process from one sibling to another.
> >
> 
> You are correct, however it would be a miniscule cost advantage,
> possibly outweighed by the shared lock, and overhead of more
> changing of CPUs (I'm sure there would be some cost).


> >| But if sched domains are accepted, there is no need for shared runqueues,
> >| because as I said they can do anything sched domains can, so the code would
> >| just be a redundant specialisation - unless you specifically wanted to share
> >| locks & data with siblings.
> >
> >I doubt the gain would be worth the complexity, but what do I know?
> >
> 
> Sorry I didn't follow, gain and complexity of what?

Doing without shared runqueues is what I meant. A single runqueue appears
to avoid having to move processes between runqueues, or considering
*where* to move a process.

> 
> Earlier in the thread Ingo thought my approach is simpler. code size is the
> same size, object size for my patch is significantly smaller, and it does
> more. Benchmarks have so far shown that my patch is as performant as shared
> runqueues.

If Ingo is happy, I am happy. I find shared runqueues very easy to
understand as a way to send tasks to a single HT chip, which is the only
case which comes to mind in which a CPU change is free. Clearly it isn't
going to apply to CPUs which don't share cache and behave more like
individual packages.

-- 
bill davidsen <davidsen@tmr.com>
  CTO, TMR Associates, Inc
Doing interesting things with little computers since 1979.


^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-14  9:40                       ` Nick Piggin
  2003-12-14 10:46                         ` Arjan van de Ven
@ 2003-12-16 17:46                         ` Bill Davidsen
  1 sibling, 0 replies; 80+ messages in thread
From: Bill Davidsen @ 2003-12-16 17:46 UTC (permalink / raw)
  To: Nick Piggin; +Cc: Jamie Lokier, linux-kernel

On Sun, 14 Dec 2003, Nick Piggin wrote:

> 
> 
> Jamie Lokier wrote:
> 
> >Nick Piggin wrote:
> >
> >>>Shared runqueues sound like a simplification to describe execution units
> >>>which have shared resourses and null cost of changing units. You can do
> >>>that by having a domain which behaved like that, but a shared runqueue
> >>>sounds better because it would eliminate the cost of even considering
> >>>moving a process from one sibling to another.
> >>>
> >>You are correct, however it would be a miniscule cost advantage,
> >>possibly outweighed by the shared lock, and overhead of more
> >>changing of CPUs (I'm sure there would be some cost).
> >>
> >
> >Regarding the overhead of the shared runqueue lock:
> >
> >Is the "lock" prefix actually required for locking between x86
> >siblings which share the same L1 cache?
> >
> 
> That lock is still taken by other CPUs as well for eg. wakeups, balancing,
> and so forth. I guess it could be a very specific optimisation for
> spinlocks in general if there was only one HT core. Don't know if it
> would be worthwhile though.

Well, if you're going to generalize the model, I would think that you
would want some form of local locking within each level, so you don't do
excess locking in a NUMA config. It might be that you specify a lock and
lock type for each level, and the type of lock for shared L1 cache
siblings could be NONE.

I'm not sure what kind of generalized model you have in mind for the
future, so I may be totally off in the weeds, but I would assume that a
lock would ideally have some form of locality. In the same socket, on the
same board, on the same backplane, same rack, same room, same planet,
whatever.

-- 
bill davidsen <davidsen@tmr.com>
  CTO, TMR Associates, Inc
Doing interesting things with little computers since 1979.


^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-14  4:32                     ` Jamie Lokier
  2003-12-14  9:40                       ` Nick Piggin
@ 2003-12-16 18:22                       ` Linus Torvalds
  2003-12-17  0:24                         ` Davide Libenzi
  1 sibling, 1 reply; 80+ messages in thread
From: Linus Torvalds @ 2003-12-16 18:22 UTC (permalink / raw)
  To: Jamie Lokier; +Cc: Nick Piggin, bill davidsen, linux-kernel



On Sun, 14 Dec 2003, Jamie Lokier wrote:

> Nick Piggin wrote:
> > >Shared runqueues sound like a simplification to describe execution units
> > >which have shared resourses and null cost of changing units. You can do
> > >that by having a domain which behaved like that, but a shared runqueue
> > >sounds better because it would eliminate the cost of even considering
> > >moving a process from one sibling to another.
> >
> > You are correct, however it would be a miniscule cost advantage,
> > possibly outweighed by the shared lock, and overhead of more
> > changing of CPUs (I'm sure there would be some cost).
>
> Regarding the overhead of the shared runqueue lock:
>
> Is the "lock" prefix actually required for locking between x86
> siblings which share the same L1 cache?

I bet it is. In a big way.

The lock does two independent things:
 - it tells the core that it can't just crack up the load and store.
 - it also tells other memory ops that they can't re-order around it.

Neither of these have anything to do with the L1 cache.

In short, I'd be very very surprised if you didn't need a "lock" prefix
even between hyperthreaded cores. It might be true in some specific
implementation of HT, but quite frankly I'd doubt it, and I'd be willing
to guarantee that Intel would never make that architectural even if it was
true today (ie it would then break on future versions).

It should be easy enough to test in user space.

[ Time passes ]

Done. Check this program out with and without the "lock ;" prefix. With
the "lock" it will run forever on a HT CPU. Without the lock, it will show
errors pretty much immediately when the two threads start accessing "nr"
concurrently.

		Linus

----
#include <pthread.h>
#include <signal.h>
#include <unistd.h>
#include <stdio.h>

unsigned long nr;

#define LOCK "lock ;"

void * check_bit(int bit)
{
	int set, reset;
	do {
		asm(LOCK "btsl %1,%2; sbbl %0,%0": "=r" (set): "r" (bit), "m" (nr):"memory");
		asm(LOCK "btcl %1,%2; sbbl %0,%0": "=r" (reset): "r" (bit), "m" (nr):"memory");
	} while (reset && !set);
	fprintf(stderr, "bit %d: %d %d (%08x)\n", bit, set, reset, nr);
	return NULL;
}

static void * thread1(void* dummy)
{
	return check_bit(0);
}

static void * thread2(void *dummy)
{
	return check_bit(1);
}

int main(int argc, char ** argv)
{
	pthread_t p;

	pthread_create(&p, NULL, thread1, NULL);
	sleep(1);
	thread2(NULL);
	return 1;
}

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-16 18:22                       ` Linus Torvalds
@ 2003-12-17  0:24                         ` Davide Libenzi
  2003-12-17  0:41                           ` Linus Torvalds
  0 siblings, 1 reply; 80+ messages in thread
From: Davide Libenzi @ 2003-12-17  0:24 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Jamie Lokier, Nick Piggin, bill davidsen, Linux Kernel Mailing List

On Tue, 16 Dec 2003, Linus Torvalds wrote:

> I bet it is. In a big way.
> 
> The lock does two independent things:
>  - it tells the core that it can't just crack up the load and store.
>  - it also tells other memory ops that they can't re-order around it.

You forgot the one where no HT knowledge can be used for optimizations. 

- Asserting the CPU's #LOCK pin to take control of the system bus. That in
  an MP system translates into the signaling CPU taking full control of the
  system bus until the pin is deasserted.



- Davide



^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-17  0:24                         ` Davide Libenzi
@ 2003-12-17  0:41                           ` Linus Torvalds
  2003-12-17  0:54                             ` Davide Libenzi
  0 siblings, 1 reply; 80+ messages in thread
From: Linus Torvalds @ 2003-12-17  0:41 UTC (permalink / raw)
  To: Davide Libenzi
  Cc: Jamie Lokier, Nick Piggin, bill davidsen, Linux Kernel Mailing List



On Tue, 16 Dec 2003, Davide Libenzi wrote:
> > I bet it is. In a big way.
> >
> > The lock does two independent things:
> >  - it tells the core that it can't just crack up the load and store.
> >  - it also tells other memory ops that they can't re-order around it.
>
> You forgot the one where no HT knowledge can be used for optimizations.
>
> - Asserting the CPU's #LOCK pin to take control of the system bus. That in
>   MP system translate into the signaling CPU taking full control of the
>   system bus until the pin is deasserted.

I think this is historical and no longer true.

All modern CPU's do atomic accesses on a cache coherency (and write buffer
ordering) level, and it has nothing to do with the external bus any more.

I suspect locked accesses to memory-mapped IO _may_ still set an external
flag, but I seriously doubt anybody cares. I wouldn't be surprised if the
pin doesn't even exist any more.

			Linus

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-17  0:41                           ` Linus Torvalds
@ 2003-12-17  0:54                             ` Davide Libenzi
  0 siblings, 0 replies; 80+ messages in thread
From: Davide Libenzi @ 2003-12-17  0:54 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Jamie Lokier, Nick Piggin, bill davidsen, Linux Kernel Mailing List

On Tue, 16 Dec 2003, Linus Torvalds wrote:

> On Tue, 16 Dec 2003, Davide Libenzi wrote:
> > > I bet it is. In a big way.
> > >
> > > The lock does two independent things:
> > >  - it tells the core that it can't just crack up the load and store.
> > >  - it also tells other memory ops that they can't re-order around it.
> >
> > You forgot the one where no HT knowledge can be used for optimizations.
> >
> > - Asserting the CPU's #LOCK pin to take control of the system bus. That in
> >   MP system translate into the signaling CPU taking full control of the
> >   system bus until the pin is deasserted.
> 
> I think this is historical and no longer true.
> 
> All modern CPU's do atomic accesses on a cache coherency (and write buffer
> ordering) level, and it has nothing to do with the external bus any more.
> 
> I suspect locked accesses to memory-mapped IO _may_ still set an external
> flag, but I seriously doubt anybody cares. I wouldn't be surprised if the
> pin doesn't even exist any more.

I just checked the P6 stuff. It does exist, but it can in some cases
(a write-back access on data entirely in the cache, for example) be caught by
the cache controller, which then handles the following ops using the normal
cache coherency mechanisms.



- Davide



^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] improve rwsem scalability (was Re: [CFT][RFC] HT scheduler)
  2003-12-14  0:44                               ` Nick Piggin
@ 2003-12-17  5:27                                 ` Nick Piggin
  2003-12-19 11:52                                   ` Nick Piggin
  0 siblings, 1 reply; 80+ messages in thread
From: Nick Piggin @ 2003-12-17  5:27 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-kernel, William Lee Irwin III, Rusty Russell,
	Anton Blanchard, Martin J. Bligh, Nakajima, Jun, Mark Wong



Nick Piggin wrote:

>
>
> Nick Piggin wrote:
>
>>
>>
>>
>> The benchmark dies at 160 rooms unfortunately. Probably something in 
>> the JVM.
>>
>> I'll do a larger number of runs around the 130-150 mark.
>>
>
> OK, this is an average of 5 runs at 145, 150, 155 rooms with my scheduler
> patches, with and without my rwsem patch. Its all over the place, but 
> I think
> rwsem does give a small but significant improvement.
>
> http://www.kerneltrap.org/~npiggin/rwsem2.png


What do you think? Are there any other sorts of benchmarks I should try?
I think the improvement is significant, although volanomark is quite
erratic and doesn't show it well.

I don't see any problem with moving the wakeups out of the rwsem's spinlock.
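
The shape of the change is just to collect the waiters on a private list
while wait_lock is held and issue the wakeups after dropping it. A rough
sketch of the idea (not the actual patch; structures simplified):

#include <linux/list.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

struct waiter_sketch {
	struct list_head list;
	struct task_struct *task;
};

static void wake_waiters(spinlock_t *wait_lock, struct list_head *waiters)
{
	LIST_HEAD(wake_list);

	spin_lock(wait_lock);
	list_splice_init(waiters, &wake_list);	/* detach under the lock */
	spin_unlock(wait_lock);

	while (!list_empty(&wake_list)) {
		struct waiter_sketch *w =
			list_entry(wake_list.next, struct waiter_sketch, list);
		list_del(&w->list);
		wake_up_process(w->task);	/* no wait_lock held here */
	}
}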



^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-15 23:08                   ` Nick Piggin
@ 2003-12-19  4:57                     ` Nick Piggin
  2003-12-19  5:13                       ` Nick Piggin
  2003-12-20  2:43                       ` Rusty Russell
  0 siblings, 2 replies; 80+ messages in thread
From: Nick Piggin @ 2003-12-19  4:57 UTC (permalink / raw)
  To: Rusty Russell
  Cc: linux-kernel, Anton Blanchard, Ingo Molnar, Martin J. Bligh,
	Nakajima, Jun, Mark Wong, John Hawkes

[-- Attachment #1: Type: text/plain, Size: 1371 bytes --]



Nick Piggin wrote:

>
>
> Rusty Russell wrote:
>
>>
>> A few things need work:
>>
>> 1) There's a race between sys_sched_setaffinity() and
>>   sched_migrate_task() (this is nothing to do with your patch).
>>
>
> Yep. They should both take the task's runqueue lock. 


Easier said than done... anyway, how does this patch look?
It should also cure a possible and not entirely unlikely use
after free of the task_struct in sched_migrate_task on NUMA
AFAIKS.

The patch is on top of a few other changes so it might not apply; it's just
for review. I'll release a new rollup with this included soon.

>
>
>>
>> 2) Please change those #defines into an enum for idle (patch follows,
>>   untested but trivial)
>>
>
> Thanks, I'll take the patch.


done

>
>>
>> 3) conditional locking in load_balance is v. bad idea.
>>
>
> Yeah... I'm thinking about this. I don't think it should be too hard
> to break out the shared portion. 


done

>
>
>>
>> 4) load_balance returns "(!failed && !balanced)", but callers stop
>>   calling it when it returns true.  Why not simply return "balanced",
>>   or at least "balanced && !failed"?
>>
>>
>
> No, the idle balancer stops calling it when it returns true, the periodic
> balancer sets idle to 0 when it returns true.
>
> !balanced && !failed means it has moved a task.
>
> I'll either comment that, or return it in a more direct way.
>

done



[-- Attachment #2: sched-migrate-affinity-race.patch --]
[-- Type: text/plain, Size: 4423 bytes --]


Prevent sys_sched_setaffinity from racing with sched_migrate_task and
causing sched_migrate_task to restore an invalid cpu mask.


 linux-2.6-npiggin/kernel/sched.c |   89 +++++++++++++++++++++++++++++----------
 1 files changed, 68 insertions(+), 21 deletions(-)

diff -puN kernel/sched.c~sched-migrate-affinity-race kernel/sched.c
--- linux-2.6/kernel/sched.c~sched-migrate-affinity-race	2003-12-19 14:45:58.000000000 +1100
+++ linux-2.6-npiggin/kernel/sched.c	2003-12-19 15:19:30.000000000 +1100
@@ -947,6 +947,9 @@ static inline void double_rq_unlock(runq
 }
 
 #ifdef CONFIG_NUMA
+
+static inline int __set_cpus_allowed(task_t *p, cpumask_t new_mask, unsigned long *flags);
+
 /*
  * If dest_cpu is allowed for this process, migrate the task to it.
  * This is accomplished by forcing the cpu_allowed mask to only
@@ -955,16 +958,43 @@ static inline void double_rq_unlock(runq
  */
 static void sched_migrate_task(task_t *p, int dest_cpu)
 {
-	cpumask_t old_mask;
+	runqueue_t *rq;
+	unsigned long flags;
+	cpumask_t old_mask, new_mask = cpumask_of_cpu(dest_cpu);
 
+	rq = task_rq_lock(p, &flags);
 	old_mask = p->cpus_allowed;
-	if (!cpu_isset(dest_cpu, old_mask))
+	if (!cpu_isset(dest_cpu, old_mask)) {
+		task_rq_unlock(rq, &flags);
 		return;
+	}
+
+	get_task_struct(p);
+
 	/* force the process onto the specified CPU */
-	set_cpus_allowed(p, cpumask_of_cpu(dest_cpu));
+	if (__set_cpus_allowed(p, new_mask, &flags) < 0)
+		goto out;
+
+	/* __set_cpus_allowed unlocks the runqueue */
+	rq = task_rq_lock(p, &flags);
+	if (unlikely(p->cpus_allowed != new_mask)) {
+		/*
+		 * We have raced with another set_cpus_allowed.
+		 * old_mask is invalid and we needn't move the
+		 * task back.
+		 */
+		task_rq_unlock(rq, &flags);
+		goto out;
+	}
+
+	/*
+	 * restore the cpus allowed mask. old_mask must be valid because
+	 * p->cpus_allowed is a subset of old_mask.
+	 */
+	__set_cpus_allowed(p, old_mask, &flags);
 
-	/* restore the cpus allowed mask */
-	set_cpus_allowed(p, old_mask);
+out:
+	put_task_struct(p);
 }
 
 /*
@@ -2603,31 +2633,27 @@ typedef struct {
 } migration_req_t;
 
 /*
- * Change a given task's CPU affinity. Migrate the thread to a
- * proper CPU and schedule it away if the CPU it's executing on
- * is removed from the allowed bitmask.
- *
- * NOTE: the caller must have a valid reference to the task, the
- * task must not exit() & deallocate itself prematurely.  The
- * call is not atomic; no spinlocks may be held.
+ * See comment for set_cpus_allowed. calling rules are different:
+ * the task's runqueue lock must be held, and __set_cpus_allowed
+ * will return with the runqueue unlocked.
  */
-int set_cpus_allowed(task_t *p, cpumask_t new_mask)
+static inline int __set_cpus_allowed(task_t *p, cpumask_t new_mask, unsigned long *flags)
 {
-	unsigned long flags;
 	migration_req_t req;
-	runqueue_t *rq;
+	runqueue_t *rq = task_rq(p);
 
-	if (any_online_cpu(new_mask) == NR_CPUS)
+	if (any_online_cpu(new_mask) == NR_CPUS) {
+		task_rq_unlock(rq, flags);
 		return -EINVAL;
+	}
 
-	rq = task_rq_lock(p, &flags);
 	p->cpus_allowed = new_mask;
 	/*
 	 * Can the task run on the task's current CPU? If not then
 	 * migrate the thread off to a proper CPU.
 	 */
 	if (cpu_isset(task_cpu(p), new_mask)) {
-		task_rq_unlock(rq, &flags);
+		task_rq_unlock(rq, flags);
 		return 0;
 	}
 	/*
@@ -2636,18 +2662,39 @@ int set_cpus_allowed(task_t *p, cpumask_
 	 */
 	if (!p->array && !task_running(rq, p)) {
 		set_task_cpu(p, any_online_cpu(p->cpus_allowed));
-		task_rq_unlock(rq, &flags);
+		task_rq_unlock(rq, flags);
 		return 0;
 	}
+
 	init_completion(&req.done);
 	req.task = p;
 	list_add(&req.list, &rq->migration_queue);
-	task_rq_unlock(rq, &flags);
+	task_rq_unlock(rq, flags);
 
 	wake_up_process(rq->migration_thread);
-
 	wait_for_completion(&req.done);
+
 	return 0;
+
+}
+
+/*
+ * Change a given task's CPU affinity. Migrate the thread to a
+ * proper CPU and schedule it away if the CPU it's executing on
+ * is removed from the allowed bitmask.
+ *
+ * NOTE: the caller must have a valid reference to the task, the
+ * task must not exit() & deallocate itself prematurely.  The
+ * call is not atomic; no spinlocks may be held.
+ */
+int set_cpus_allowed(task_t *p, cpumask_t new_mask)
+{
+	unsigned long flags;
+	runqueue_t *rq;
+
+	rq = task_rq_lock(p, &flags);
+
+	return __set_cpus_allowed(p, new_mask, &flags);
 }
 
 EXPORT_SYMBOL_GPL(set_cpus_allowed);

_

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-19  4:57                     ` Nick Piggin
@ 2003-12-19  5:13                       ` Nick Piggin
  2003-12-20  2:43                       ` Rusty Russell
  1 sibling, 0 replies; 80+ messages in thread
From: Nick Piggin @ 2003-12-19  5:13 UTC (permalink / raw)
  To: Rusty Russell
  Cc: linux-kernel, Anton Blanchard, Ingo Molnar, Martin J. Bligh,
	Nakajima, Jun, Mark Wong, John Hawkes



Nick Piggin wrote:

>
> Easier said than done... anyway, how does this patch look?
> It should also cure a possible and not entirely unlikely use
> after free of the task_struct in sched_migrate_task on NUMA
> AFAIKS.


Err, no, of course it doesn't, because it's executed by said task.
So the get/put_task_struct can go.




^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] improve rwsem scalability (was Re: [CFT][RFC] HT scheduler)
  2003-12-17  5:27                                 ` Nick Piggin
@ 2003-12-19 11:52                                   ` Nick Piggin
  2003-12-19 15:06                                     ` Martin J. Bligh
  0 siblings, 1 reply; 80+ messages in thread
From: Nick Piggin @ 2003-12-19 11:52 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: linux-kernel, William Lee Irwin III, Rusty Russell,
	Anton Blanchard, Martin J. Bligh, Nakajima, Jun, Mark Wong



Nick Piggin wrote:

>
>
> Nick Piggin wrote:
>
>>
>> OK, this is an average of 5 runs at 145, 150, 155 rooms with my 
>> scheduler
>> patches, with and without my rwsem patch. Its all over the place, but 
>> I think
>> rwsem does give a small but significant improvement.
>>
>> http://www.kerneltrap.org/~npiggin/rwsem2.png
>
>
>
> What do you think? Is there any other sorts of benchmarks I should try?
> The improvement I think is significant, although volanomark is quite
> erratic and doesn't show it well.
>
> I don't see any problem with moving the wakeups out of the rwsem's 
> spinlock.
>

Actually my implementation does have a race because list_del_init isn't
atomic. Shouldn't be difficult to fix if anyone cares about it... otherwise
I won't bother.

Nick


^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] improve rwsem scalability (was Re: [CFT][RFC] HT scheduler)
  2003-12-19 11:52                                   ` Nick Piggin
@ 2003-12-19 15:06                                     ` Martin J. Bligh
  2003-12-20  0:08                                       ` Nick Piggin
  0 siblings, 1 reply; 80+ messages in thread
From: Martin J. Bligh @ 2003-12-19 15:06 UTC (permalink / raw)
  To: Nick Piggin, Ingo Molnar
  Cc: linux-kernel, William Lee Irwin III, Rusty Russell,
	Anton Blanchard, Nakajima, Jun, Mark Wong

>> What do you think? Is there any other sorts of benchmarks I should try?
>> The improvement I think is significant, although volanomark is quite
>> erratic and doesn't show it well.
>> 
>> I don't see any problem with moving the wakeups out of the rwsem's 
>> spinlock.
>> 
> 
> Actually my implementation does have a race because list_del_init isn't
> atomic. Shouldn't be difficult to fix if anyone cares about it... otherwise
> I won't bother.

If you can fix it up, I'll get people here to do some more testing on the
patch on other benchmarks, etc.

M.


^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [PATCH] improve rwsem scalability (was Re: [CFT][RFC] HT scheduler)
  2003-12-19 15:06                                     ` Martin J. Bligh
@ 2003-12-20  0:08                                       ` Nick Piggin
  0 siblings, 0 replies; 80+ messages in thread
From: Nick Piggin @ 2003-12-20  0:08 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Ingo Molnar, linux-kernel, William Lee Irwin III, Rusty Russell,
	Anton Blanchard, Nakajima, Jun, Mark Wong

[-- Attachment #1: Type: text/plain, Size: 925 bytes --]



Martin J. Bligh wrote:

>>>What do you think? Is there any other sorts of benchmarks I should try?
>>>The improvement I think is significant, although volanomark is quite
>>>erratic and doesn't show it well.
>>>
>>>I don't see any problem with moving the wakeups out of the rwsem's 
>>>spinlock.
>>>
>>>
>>Actually my implementation does have a race because list_del_init isn't
>>atomic. Shouldn't be difficult to fix if anyone cares about it... otherwise
>>I won't bother.
>>
>
>If you can fix it up, I'll get people here to do some more testing on the
>patch on other benchmarks, etc.
>

OK, this one should work. There isn't much that uses rwsems, but mmap_sem
is the obvious one.

So if the patch helps anywhere, it will be on something heavily threaded that
is taking the mmap_sem a lot (mostly for reading, sometimes for writing), with
a lot of CPUs and a lot of runqueue activity (ie. waking up, sleeping, tasks
running).


[-- Attachment #2: rwsem-scale.patch --]
[-- Type: text/plain, Size: 4056 bytes --]


Move rwsem's up_read wakeups out of the semaphore's wait_lock


 linux-2.6-npiggin/lib/rwsem.c |   42 ++++++++++++++++++++++++++++--------------
 1 files changed, 28 insertions(+), 14 deletions(-)

diff -puN lib/rwsem.c~rwsem-scale lib/rwsem.c
--- linux-2.6/lib/rwsem.c~rwsem-scale	2003-12-20 02:17:36.000000000 +1100
+++ linux-2.6-npiggin/lib/rwsem.c	2003-12-20 02:23:59.000000000 +1100
@@ -8,7 +8,7 @@
 #include <linux/module.h>
 
 struct rwsem_waiter {
-	struct list_head	list;
+	struct list_head	list, wake_list;
 	struct task_struct	*task;
 	unsigned int		flags;
 #define RWSEM_WAITING_FOR_READ	0x00000001
@@ -35,9 +35,12 @@ void rwsemtrace(struct rw_semaphore *sem
  * - the spinlock must be held by the caller
  * - woken process blocks are discarded from the list after having flags zeroised
  * - writers are only woken if wakewrite is non-zero
+ *
+ * The spinlock will be dropped by this function
  */
 static inline struct rw_semaphore *__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
 {
+	LIST_HEAD(wake_list);
 	struct rwsem_waiter *waiter;
 	struct list_head *next;
 	signed long oldcount;
@@ -65,7 +68,8 @@ static inline struct rw_semaphore *__rws
 
 	list_del(&waiter->list);
 	waiter->flags = 0;
-	wake_up_process(waiter->task);
+	if (list_empty(&waiter->wake_list))
+		list_add_tail(&waiter->wake_list, &wake_list);
 	goto out;
 
 	/* don't want to wake any writers */
@@ -74,9 +78,10 @@ static inline struct rw_semaphore *__rws
 	if (waiter->flags & RWSEM_WAITING_FOR_WRITE)
 		goto out;
 
-	/* grant an infinite number of read locks to the readers at the front of the queue
-	 * - note we increment the 'active part' of the count by the number of readers (less one
-	 *   for the activity decrement we've already done) before waking any processes up
+	/* grant an infinite number of read locks to the readers at the front
+	 * of the queue - note we increment the 'active part' of the count by
+	 * the number of readers (less one for the activity decrement we've
+	 * already done) before waking any processes up
 	 */
  readers_only:
 	woken = 0;
@@ -100,13 +105,22 @@ static inline struct rw_semaphore *__rws
 		waiter = list_entry(next,struct rwsem_waiter,list);
 		next = waiter->list.next;
 		waiter->flags = 0;
-		wake_up_process(waiter->task);
+		if (list_empty(&waiter->wake_list))
+			list_add_tail(&waiter->wake_list, &wake_list);
 	}
 
 	sem->wait_list.next = next;
 	next->prev = &sem->wait_list;
 
  out:
+	spin_unlock(&sem->wait_lock);
+	while (!list_empty(&wake_list)) {
+		waiter = list_entry(wake_list.next,struct rwsem_waiter,wake_list);
+		list_del(&waiter->wake_list);
+		waiter->wake_list.next = &waiter->wake_list;
+		wmb(); /* Mustn't lose wakeups */
+		wake_up_process(waiter->task);
+	}
 	rwsemtrace(sem,"Leaving __rwsem_do_wake");
 	return sem;
 
@@ -130,9 +144,9 @@ static inline struct rw_semaphore *rwsem
 	set_task_state(tsk,TASK_UNINTERRUPTIBLE);
 
 	/* set up my own style of waitqueue */
-	spin_lock(&sem->wait_lock);
 	waiter->task = tsk;
-
+	INIT_LIST_HEAD(&waiter->wake_list);
+	spin_lock(&sem->wait_lock);
 	list_add_tail(&waiter->list,&sem->wait_list);
 
 	/* note that we're now waiting on the lock, but no longer actively read-locking */
@@ -143,8 +157,8 @@ static inline struct rw_semaphore *rwsem
 	 */
 	if (!(count & RWSEM_ACTIVE_MASK))
 		sem = __rwsem_do_wake(sem,1);
-
-	spin_unlock(&sem->wait_lock);
+	else
+		spin_unlock(&sem->wait_lock);
 
 	/* wait to be given the lock */
 	for (;;) {
@@ -204,8 +218,8 @@ struct rw_semaphore *rwsem_wake(struct r
 	/* do nothing if list empty */
 	if (!list_empty(&sem->wait_list))
 		sem = __rwsem_do_wake(sem,1);
-
-	spin_unlock(&sem->wait_lock);
+	else
+		spin_unlock(&sem->wait_lock);
 
 	rwsemtrace(sem,"Leaving rwsem_wake");
 
@@ -226,8 +240,8 @@ struct rw_semaphore *rwsem_downgrade_wak
 	/* do nothing if list empty */
 	if (!list_empty(&sem->wait_list))
 		sem = __rwsem_do_wake(sem,0);
-
-	spin_unlock(&sem->wait_lock);
+	else
+		spin_unlock(&sem->wait_lock);
 
 	rwsemtrace(sem,"Leaving rwsem_downgrade_wake");
 	return sem;

_

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-19  4:57                     ` Nick Piggin
  2003-12-19  5:13                       ` Nick Piggin
@ 2003-12-20  2:43                       ` Rusty Russell
  2003-12-21  2:56                         ` Nick Piggin
  1 sibling, 1 reply; 80+ messages in thread
From: Rusty Russell @ 2003-12-20  2:43 UTC (permalink / raw)
  To: Nick Piggin
  Cc: linux-kernel, Anton Blanchard, Ingo Molnar, Martin J. Bligh,
	Nakajima, Jun, Mark Wong, John Hawkes

In message <3FE28529.1010003@cyberone.com.au> you write:
> + * See comment for set_cpus_allowed. calling rules are different:
> + * the task's runqueue lock must be held, and __set_cpus_allowed
> + * will return with the runqueue unlocked.

Please, never *ever* do this.

Locking is probably the hardest thing to get right, and code like this
makes it harder.

Fortunately, there is a simple solution coming with the hotplug CPU
code: we need to hold the cpucontrol semaphore here anyway, against
cpus vanishing.

Perhaps we should just use that in both places.
Rusty.
--
  Anyone who quotes me in their sig is an idiot. -- Rusty Russell.

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-20  2:43                       ` Rusty Russell
@ 2003-12-21  2:56                         ` Nick Piggin
  0 siblings, 0 replies; 80+ messages in thread
From: Nick Piggin @ 2003-12-21  2:56 UTC (permalink / raw)
  To: Rusty Russell
  Cc: linux-kernel, Anton Blanchard, Ingo Molnar, Martin J. Bligh,
	Nakajima, Jun, Mark Wong, John Hawkes



Rusty Russell wrote:

>In message <3FE28529.1010003@cyberone.com.au> you write:
>
>>+ * See comment for set_cpus_allowed. calling rules are different:
>>+ * the task's runqueue lock must be held, and __set_cpus_allowed
>>+ * will return with the runqueue unlocked.
>>
>
>Please, never *ever* do this.
>
>Locking is probably the hardest thing to get right, and code like this
>makes it harder.
>

Although in this case it is only one lock, and it's only used in
one place, with comments. But yeah, it's far more complex than a
blocking semaphore would be.

>
>Fortunately, there is a simple solution coming with the hotplug CPU
>code: we need to hold the cpucontrol semaphore here anyway, against
>cpus vanishing.
>
>Perhaps we should just use that in both places.
>

We could just use a private semaphore until the hotplug code is in place.



^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-15  5:53                 ` Rusty Russell
  2003-12-15 23:08                   ` Nick Piggin
@ 2004-01-03 18:57                   ` Bill Davidsen
  1 sibling, 0 replies; 80+ messages in thread
From: Bill Davidsen @ 2004-01-03 18:57 UTC (permalink / raw)
  To: linux-kernel

Rusty Russell wrote:

> Actually, having produced the patch, I've changed my mind.
> 
> While it was spiritually rewarding to separate "struct runqueue" into
> the stuff which was to do with the runqueue, and the stuff which was
> per-cpu but there because it was convenient, I'm not sure the churn is
> worthwhile since we will want the rest of your stuff anyway.
> 
> It (and lots of other things) might become worthwhile if single
> processors with HT become the de-facto standard.  For these, lots of
> our assumptions about CONFIG_SMP, such as the desirability of per-cpu
> data, become bogus.

Now that Intel is shipping inexpensive CPUs with HT and a faster memory
bus, I think that's the direction of the mass market. It would be very
desirable to have HT help rather than hinder. However, I admit I'm
willing to take the 1-2% penalty on light load to get the bonus on heavy
load.

If someone has measured the effect of HT on interrupt latency or server
transaction response, I haven't seen it; but based on a server I just
built using WBEL and the RHEL scheduler, the first numbers look as if the
response is better. This is just based on notably less data sitting in
the incoming sockets, but it's encouraging.

"netstat -t | sort +1nr" shows a LOT fewer sockets with unread bytes.

-- 
bill davidsen <davidsen@tmr.com>
   CTO TMR Associates, Inc
   Doing interesting things with small computers since 1979

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-16 17:37 ` Andrew Theurer
@ 2003-12-17  2:41   ` Nick Piggin
  0 siblings, 0 replies; 80+ messages in thread
From: Nick Piggin @ 2003-12-17  2:41 UTC (permalink / raw)
  To: habanero; +Cc: rusty, linux-kernel



Andrew Theurer wrote:

>>In message <3FD7F1B9.5080100@cyberone.com.au> you write:
>>
>>>http://www.kerneltrap.org/~npiggin/w26/
>>>Against 2.6.0-test11
>>>
>>>This includes the SMT description for P4. Initial results shows
>>>comparable performance to Ingo's shared runqueue's patch on a dual P4
>>>Xeon.
>>>
>>I'm still not convinced.  Sharing runqueues is simple, and in fact
>>exactly what you want for HT: you want to balance *runqueues*, not
>>CPUs.  In fact, it can be done without a CONFIG_SCHED_SMT addition.
>>
>>Your patch is more general, more complex, but doesn't actually seem to
>>buy anything.  It puts a general domain structure inside the
>>scheduler, without putting it anywhere else which wants it (eg. slab
>>cache balancing).  My opinion is either (1) produce a general NUMA
>>topology which can then be used by the scheduler, or (2) do the
>>minimal change in the scheduler which makes HT work well.
>>
>
>FWIW, here is a patch I was working on a while back, to have multilevel NUMA 
>hierarchies (based on arch-specific NUMA topology) and more importantly, a 
>runqueue centric point of view for all the load balance routines.  This patch 
>is quite rough, and I have not looked at this patch in a while, but maybe it 
>could help someone?
>
>Also, with runqueue centric approach, shared runqueues should just "work", so 
>adding that to this patch should be fairly clean.  
>
>One more thing, we are missing some stuff in the NUMA topology, which Rusty 
>mentions in another email, like core arrangement, arch states, cache 
>locations/types -all that stuff eventually should make it into some sort of 
>topology, be it NUMA topology stuff or a more generic thing like sysfs.  
>Right now we are a bit limited at what the scheduler looks at, just 
>cpu_to_node type stuff...
>

Hi Andrew,
sched domains can do all this. It is currently set up using just the
simple NUMA topology, so it's much the same, but the potential is there.

It implements a structure to describe topology in detail which is
used to drive scheduling choices. It could quite easily be extended
to include more memory information and become a general description
for topology.
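
Concretely, the kind of structure involved looks something like this. Only
->groups, ->flags and SD_FLAG_NEWIDLE actually appear in the snippets posted
in this thread; the other fields here are illustrative:

/* one balancing level: SMT siblings, a package, a NUMA node, ... */
struct sched_domain {
	struct sched_domain *parent;	/* next level up, NULL at the top */
	struct sched_group *groups;	/* units balanced at this level */
	cpumask_t span;			/* all CPUs covered by this domain */
	unsigned int busy_rebalance_tick;
	unsigned int idle_rebalance_tick;
	int flags;			/* e.g. SD_FLAG_NEWIDLE */
};

struct sched_group {
	cpumask_t cpumask;		/* CPUs in this group */
	struct sched_group *next;	/* circular list of sibling groups */
};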



^ permalink raw reply	[flat|nested] 80+ messages in thread

* RE: [CFT][RFC] HT scheduler
@ 2003-12-17  0:38 Nakajima, Jun
  0 siblings, 0 replies; 80+ messages in thread
From: Nakajima, Jun @ 2003-12-17  0:38 UTC (permalink / raw)
  To: Davide Libenzi, Linus Torvalds
  Cc: Jamie Lokier, Nick Piggin, bill davidsen, Linux Kernel Mailing List

> You forgot the one where no HT knowledge can be used for optimizations.
> 
> - Asserting the CPU's #LOCK pin to take control of the system bus. That in
>   an MP system translates into the signaling CPU taking full control of the
>   system bus until the pin is deasserted.
>

That is not the case with UP (single socket) using HT, where some
optimization for locks could be available. But you still need locks between
the logical processors.

	Jun

> -----Original Message-----
> From: linux-kernel-owner@vger.kernel.org [mailto:linux-kernel-
> owner@vger.kernel.org] On Behalf Of Davide Libenzi
> Sent: Tuesday, December 16, 2003 4:25 PM
> To: Linus Torvalds
> Cc: Jamie Lokier; Nick Piggin; bill davidsen; Linux Kernel Mailing List
> Subject: Re: [CFT][RFC] HT scheduler
> 
> On Tue, 16 Dec 2003, Linus Torvalds wrote:
> 
> > I bet it is. In a big way.
> >
> > The lock does two independent things:
> >  - it tells the core that it can't just crack up the load and store.
> >  - it also tells other memory ops that they can't re-order around it.
> 
> You forgot the one where no HT knowledge can be used for optimizations.
> 
> - Asserting the CPU's #LOCK pin to take control of the system bus. That in
>   an MP system translates into the signaling CPU taking full control of the
>   system bus until the pin is deasserted.
> 
> 
> 
> - Davide
> 
> 
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 80+ messages in thread

* RE: [CFT][RFC] HT scheduler
@ 2003-12-16 19:03 Nakajima, Jun
  0 siblings, 0 replies; 80+ messages in thread
From: Nakajima, Jun @ 2003-12-16 19:03 UTC (permalink / raw)
  To: Linus Torvalds, Jamie Lokier; +Cc: Nick Piggin, bill davidsen, linux-kernel


> > Regarding the overhead of the shared runqueue lock:
> >
> > Is the "lock" prefix actually required for locking between x86
> > siblings which share the same L1 cache?
> 
> I bet it is. In a big way.

Of course it is.

> 
> The lock does two independent things:
>  - it tells the core that it can't just crack up the load and store.
>  - it also tells other memory ops that they can't re-order around it.
> 
> Neither of these have anything to do with the L1 cache.
> 
> In short, I'd be very very surprised if you didn't need a "lock" prefix
> even between hyperthreaded cores. It might be true in some specific
> implementation of HT, but quite frankly I'd doubt it, and I'd be willing
> to guarantee that Intel would never make that architectural even if it was
> true today (ie it would then break on future versions).

Correct. If such code happens to work today, it could break at any time.


> 
> It should be easy enough to test in user space.
> 
> [ Time passes ]
> 
> Done. Check this program out with and without the "lock ;" prefix. With
> the "lock" it will run forever on a HT CPU. Without the lock, it will show
> errors pretty much immediately when the two threads start accessing "nr"
> concurrently.
> 
> 		Linus
> 
> ----
> #include <pthread.h>
> #include <signal.h>
> #include <unistd.h>
> #include <stdio.h>
> 
> unsigned long nr;
> 
> #define LOCK "lock ;"
> 
> void * check_bit(int bit)
> {
> 	int set, reset;
> 	do {
> 		asm(LOCK "btsl %1,%2; sbbl %0,%0": "=r" (set): "r" (bit), "m" (nr):"memory");
> 		asm(LOCK "btcl %1,%2; sbbl %0,%0": "=r" (reset): "r" (bit), "m" (nr):"memory");
> 	} while (reset && !set);
> 	fprintf(stderr, "bit %d: %d %d (%08x)\n", bit, set, reset, nr);
> 	return NULL;
> }
> 
> static void * thread1(void* dummy)
> {
> 	return check_bit(0);
> }
> 
> static void * thread2(void *dummy)
> {
> 	return check_bit(1);
> }
> 
> int main(int argc, char ** argv)
> {
> 	pthread_t p;
> 
> 	pthread_create(&p, NULL, thread1, NULL);
> 	sleep(1);
> 	thread2(NULL);
> 	return 1;
> }
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
       [not found] <200312161127.13691.habanero@us.ibm.com>
@ 2003-12-16 17:37 ` Andrew Theurer
  2003-12-17  2:41   ` Nick Piggin
  0 siblings, 1 reply; 80+ messages in thread
From: Andrew Theurer @ 2003-12-16 17:37 UTC (permalink / raw)
  To: rusty, piggin, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1772 bytes --]

> In message <3FD7F1B9.5080100@cyberone.com.au> you write:
> > http://www.kerneltrap.org/~npiggin/w26/
> > Against 2.6.0-test11
> >
> > This includes the SMT description for P4. Initial results shows
> > comparable performance to Ingo's shared runqueue's patch on a dual P4
> > Xeon.
>
> I'm still not convinced.  Sharing runqueues is simple, and in fact
> exactly what you want for HT: you want to balance *runqueues*, not
> CPUs.  In fact, it can be done without a CONFIG_SCHED_SMT addition.
>
> Your patch is more general, more complex, but doesn't actually seem to
> buy anything.  It puts a general domain structure inside the
> scheduler, without putting it anywhere else which wants it (eg. slab
> cache balancing).  My opinion is either (1) produce a general NUMA
> topology which can then be used by the scheduler, or (2) do the
> minimal change in the scheduler which makes HT work well.

FWIW, here is a patch I was working on a while back, to have multilevel NUMA 
hierarchies (based on arch-specific NUMA topology) and more importantly, a 
runqueue centric point of view for all the load balance routines.  This patch 
is quite rough, and I have not looked at this patch in a while, but maybe it 
could help someone?

Also, with a runqueue-centric approach, shared runqueues should just "work", so 
adding that to this patch should be fairly clean.  

One more thing, we are missing some stuff in the NUMA topology, which Rusty 
mentions in another email, like core arrangement, arch states, cache 
locations/types -all that stuff eventually should make it into some sort of 
topology, be it NUMA topology stuff or a more generic thing like sysfs.  
Right now we are a bit limited at what the scheduler looks at, just 
cpu_to_node type stuff...

-Andrew Theurer

[-- Attachment #2: patch-numasched.260-test3-bk8 --]
[-- Type: text/x-diff, Size: 1396 bytes --]

diff -Naurp linux-2.6.0-test3-bk8-numasched/kernel/sched.c linux-2.6.0-test3-bk8-numasched2/kernel/sched.c
--- linux-2.6.0-test3-bk8-numasched/kernel/sched.c	2003-08-22 12:03:30.000000000 -0500
+++ linux-2.6.0-test3-bk8-numasched2/kernel/sched.c	2003-08-22 12:14:09.000000000 -0500
@@ -1261,7 +1261,7 @@ static void rebalance_tick(runqueue_t *t
 
 	if (idle) {
 		while (node) {
-			if (!(j % node->idle_rebalance_tick))
+			if (!(j % node->idle_rebalance_tick) && (node->nr_cpus > 1))
 				balance_node(this_rq, idle, last_node, node);
 			last_node = node;
 			node = node->parent_node;
@@ -1269,7 +1269,7 @@ static void rebalance_tick(runqueue_t *t
 		return;
 	}
 	while (node) {
-		if (!(j % node->busy_rebalance_tick))
+		if (!(j % node->busy_rebalance_tick) && (node->nr_cpus > 1))
 			balance_node(this_rq, idle, last_node, node);
 		last_node = node;
 		node = node->parent_node;
@@ -1405,6 +1405,7 @@ asmlinkage void schedule(void)
 	runqueue_t *rq;
 	prio_array_t *array;
 	struct list_head *queue;
+	struct node_t *node;
 	int idx;
 
 	/*
@@ -1449,8 +1450,10 @@ need_resched:
 pick_next_task:
 	if (unlikely(!rq->nr_running)) {
 #ifdef CONFIG_SMP
-		if (rq->node)
-			load_balance(rq, 1, rq->node);
+		node = rq->node;
+		while (node->nr_cpus < 2 && node != top_node)
+			node = node->parent_node;
+		load_balance(rq, 1, node);
 		if (rq->nr_running)
 			goto pick_next_task;
 #endif

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
  2003-12-14 16:26             ` [CFT][RFC] HT scheduler Andi Kleen
@ 2003-12-14 16:54               ` Arjan van de Ven
  0 siblings, 0 replies; 80+ messages in thread
From: Arjan van de Ven @ 2003-12-14 16:54 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Arjan van de Ven, linux-kernel

On Sun, Dec 14, 2003 at 05:26:23PM +0100, Andi Kleen wrote:
> Arjan van de Ven <arjanv@redhat.com> writes:
> > 
> > also keep in mind that current x86 processors all will internally
> > optimize out the lock prefix in UP mode or when the cacheline is owned
> > exclusive.... If HT matters here let the cpu optimize it out.....
> 
> Are you sure they optimize it out in UP mode? IMHO this would break
> device drivers that use locked cycles to communicate with bus master
> devices (and which are not necessarily mapped uncachable/write combining) 

uncachable regions are different obviously....

^ permalink raw reply	[flat|nested] 80+ messages in thread

* Re: [CFT][RFC] HT scheduler
       [not found]           ` <1071398761.5233.1.camel@laptop.fenrus.com.suse.lists.linux.kernel>
@ 2003-12-14 16:26             ` Andi Kleen
  2003-12-14 16:54               ` Arjan van de Ven
  0 siblings, 1 reply; 80+ messages in thread
From: Andi Kleen @ 2003-12-14 16:26 UTC (permalink / raw)
  To: Arjan van de Ven; +Cc: linux-kernel

Arjan van de Ven <arjanv@redhat.com> writes:
> 
> also keep in mind that current x86 processors all will internally
> optimize out the lock prefix in UP mode or when the cacheline is owned
> exclusive.... If HT matters here let the cpu optimize it out.....

Are you sure they optimize it out in UP mode? IMHO this would break
device drivers that use locked cycles to communicate with bus master
devices (and which are not necessarily mapped uncachable/write combining) 

-Andi

^ permalink raw reply	[flat|nested] 80+ messages in thread

end of thread, other threads:[~2004-01-03 18:58 UTC | newest]

Thread overview: 80+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2003-12-08  4:25 [PATCH][RFC] make cpu_sibling_map a cpumask_t Nick Piggin
2003-12-08 15:59 ` Anton Blanchard
2003-12-08 23:08   ` Nick Piggin
2003-12-09  0:14     ` Anton Blanchard
2003-12-11  4:25       ` [CFT][RFC] HT scheduler Nick Piggin
2003-12-11  7:24         ` Nick Piggin
2003-12-11  8:57           ` Nick Piggin
2003-12-11 11:52             ` William Lee Irwin III
2003-12-11 13:09               ` Nick Piggin
2003-12-11 13:23                 ` William Lee Irwin III
2003-12-11 13:30                   ` Nick Piggin
2003-12-11 13:32                     ` William Lee Irwin III
2003-12-11 15:30                       ` Nick Piggin
2003-12-11 15:38                         ` William Lee Irwin III
2003-12-11 15:51                           ` Nick Piggin
2003-12-11 15:56                             ` William Lee Irwin III
2003-12-11 16:37                               ` Nick Piggin
2003-12-11 16:40                                 ` William Lee Irwin III
2003-12-12  1:52                         ` [PATCH] improve rwsem scalability (was Re: [CFT][RFC] HT scheduler) Nick Piggin
2003-12-12  2:02                           ` Nick Piggin
2003-12-12  9:41                           ` Ingo Molnar
2003-12-13  0:07                             ` Nick Piggin
2003-12-14  0:44                               ` Nick Piggin
2003-12-17  5:27                                 ` Nick Piggin
2003-12-19 11:52                                   ` Nick Piggin
2003-12-19 15:06                                     ` Martin J. Bligh
2003-12-20  0:08                                       ` Nick Piggin
2003-12-12  0:58             ` [CFT][RFC] HT scheduler Rusty Russell
2003-12-11 10:01           ` Rhino
2003-12-11  8:14             ` Nick Piggin
2003-12-11 16:49               ` Rhino
2003-12-11 15:16                 ` Nick Piggin
2003-12-11 11:40             ` William Lee Irwin III
2003-12-11 17:05               ` Rhino
2003-12-11 15:17                 ` William Lee Irwin III
2003-12-11 16:28         ` Kevin P. Fleming
2003-12-11 16:41           ` Nick Piggin
2003-12-12  2:24         ` Rusty Russell
2003-12-12  7:00           ` Nick Piggin
2003-12-12  7:23             ` Rusty Russell
2003-12-13  6:43               ` Nick Piggin
2003-12-14  1:35                 ` bill davidsen
2003-12-14  2:18                   ` Nick Piggin
2003-12-14  4:32                     ` Jamie Lokier
2003-12-14  9:40                       ` Nick Piggin
2003-12-14 10:46                         ` Arjan van de Ven
2003-12-16 17:46                         ` Bill Davidsen
2003-12-16 18:22                       ` Linus Torvalds
2003-12-17  0:24                         ` Davide Libenzi
2003-12-17  0:41                           ` Linus Torvalds
2003-12-17  0:54                             ` Davide Libenzi
2003-12-16 17:34                     ` Bill Davidsen
2003-12-15  5:53                 ` Rusty Russell
2003-12-15 23:08                   ` Nick Piggin
2003-12-19  4:57                     ` Nick Piggin
2003-12-19  5:13                       ` Nick Piggin
2003-12-20  2:43                       ` Rusty Russell
2003-12-21  2:56                         ` Nick Piggin
2004-01-03 18:57                   ` Bill Davidsen
2003-12-15 20:21                 ` Zwane Mwaikambo
2003-12-15 23:20                   ` Nick Piggin
2003-12-16  0:11                     ` Zwane Mwaikambo
2003-12-12  8:59             ` Nick Piggin
2003-12-12 15:14               ` Martin J. Bligh
2003-12-08 19:44 ` [PATCH][RFC] make cpu_sibling_map a cpumask_t James Cleverdon
2003-12-08 20:38 ` Ingo Molnar
2003-12-08 20:51 ` Zwane Mwaikambo
2003-12-08 20:55   ` Ingo Molnar
2003-12-08 23:17     ` Nick Piggin
2003-12-08 23:36       ` Ingo Molnar
2003-12-08 23:58         ` Nick Piggin
2003-12-08 23:46 ` Rusty Russell
2003-12-09 13:36   ` Nick Piggin
2003-12-11 21:41     ` bill davidsen
     [not found] <20031213022038.300B22C2C1@lists.samba.org.suse.lists.linux.kernel>
     [not found] ` <3FDAB517.4000309@cyberone.com.au.suse.lists.linux.kernel>
     [not found]   ` <brgeo7$huv$1@gatekeeper.tmr.com.suse.lists.linux.kernel>
     [not found]     ` <3FDBC876.3020603@cyberone.com.au.suse.lists.linux.kernel>
     [not found]       ` <20031214043245.GC21241@mail.shareable.org.suse.lists.linux.kernel>
     [not found]         ` <3FDC3023.9030708@cyberone.com.au.suse.lists.linux.kernel>
     [not found]           ` <1071398761.5233.1.camel@laptop.fenrus.com.suse.lists.linux.kernel>
2003-12-14 16:26             ` [CFT][RFC] HT scheduler Andi Kleen
2003-12-14 16:54               ` Arjan van de Ven
     [not found] <200312161127.13691.habanero@us.ibm.com>
2003-12-16 17:37 ` Andrew Theurer
2003-12-17  2:41   ` Nick Piggin
2003-12-16 19:03 Nakajima, Jun
2003-12-17  0:38 Nakajima, Jun

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).