linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [patch 1/3] x86_64: Node local pda take 2 -- early cpu_to_node
@ 2005-12-15  2:33 Ravikiran G Thirumalai
  2005-12-15  2:35 ` [patch 2/3] x86_64: Node local pda take 2 -- cpu_pda_prep Ravikiran G Thirumalai
                   ` (2 more replies)
  0 siblings, 3 replies; 13+ messages in thread
From: Ravikiran G Thirumalai @ 2005-12-15  2:33 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel, discuss, Andrew Morton

Here is take 2 on x86_64 node local pda allocation.

This patchset does away with the extra memory reference for non CONFIG_NUMA
case.  The early cpu_to_node helps AMD and EM64T systems which work well
with CONFIG_ACPI_NUMA.  cpu_to_node is not inited early for AMD systems
which work only with old style K8_NUMA. (Tested on EM64 NUMA and Tyan K8
dual core 4 cpu boxes)

Andi, I could not eliminate the need for an initial static pda array, since
sched_init needs the static per-cpu offset array for NR_CPUS early.  Hope
this is OK.

Thanks,
Kiran

---

Patch enables early initialization of cpu_to_node. apicid_to_node is built by reading
the SRAT table, from acpi_numa_init, and x86_cpu_to_apicid is built by parsing the ACPI
MADT table, from acpi_boot_init. We combine these two tables and set up cpu_to_node.

Early initialization helps the static per_cpu_areas in getting pages from the correct node.

Tested on EM64T NUMA and Tyan K8 dual core board (with CONFIG_ACPI_NUMA + K8)

Signed-off-by: Alok N Kataria <alokk@calsoftinc.com>
Signed-off-by: Ravikiran Thirumalai <kiran@scalex86.org>

Index: linux-2.6.15-rc4/arch/x86_64/kernel/setup.c
===================================================================
--- linux-2.6.15-rc4.orig/arch/x86_64/kernel/setup.c	2005-12-02 16:25:19.000000000 -0800
+++ linux-2.6.15-rc4/arch/x86_64/kernel/setup.c	2005-12-12 01:49:00.000000000 -0800
@@ -669,6 +669,8 @@
 	acpi_boot_init();
 #endif
 
+	init_cpu_to_node();
+
 #ifdef CONFIG_X86_LOCAL_APIC
 	/*
 	 * get boot-time SMP configuration:
Index: linux-2.6.15-rc4/arch/x86_64/mm/srat.c
===================================================================
--- linux-2.6.15-rc4.orig/arch/x86_64/mm/srat.c	2005-12-01 17:09:51.000000000 -0800
+++ linux-2.6.15-rc4/arch/x86_64/mm/srat.c	2005-12-12 01:19:00.000000000 -0800
@@ -226,4 +226,15 @@
 	return acpi_slit->entry[index + node_to_pxm(b)];
 }
 
+/*
+ * Setup cpu_to_node using the SRAT lapcis & ACPI MADT table
+ * info.
+ */
+void __init init_cpu_to_node(void)
+{
+	int i;	
+ 	for (i = 0; i < NR_CPUS; i++)
+ 		cpu_to_node[i] = apicid_to_node[x86_cpu_to_apicid[i]];
+}
+
 EXPORT_SYMBOL(__node_distance);
Index: linux-2.6.15-rc4/include/linux/acpi.h
===================================================================
--- linux-2.6.15-rc4.orig/include/linux/acpi.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.15-rc4/include/linux/acpi.h	2005-12-12 01:52:28.000000000 -0800
@@ -519,11 +519,16 @@
 
 #ifdef CONFIG_ACPI_NUMA
 int acpi_get_pxm(acpi_handle handle);
+void __init init_cpu_to_node();
 #else
 static inline int acpi_get_pxm(acpi_handle handle)
 {
 	return 0;
 }
+
+static inline void init_cpu_to_node(void)
+{
+}
 #endif
 
 extern int pnpacpi_disabled;

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [patch 2/3] x86_64: Node local pda take 2 -- cpu_pda_prep
  2005-12-15  2:33 [patch 1/3] x86_64: Node local pda take 2 -- early cpu_to_node Ravikiran G Thirumalai
@ 2005-12-15  2:35 ` Ravikiran G Thirumalai
  2005-12-15  2:37 ` [patch 3/3] x86_64: Node local pda take 2 -- node local pda allocation Ravikiran G Thirumalai
  2005-12-15  9:44 ` [discuss] [patch 1/3] x86_64: Node local pda take 2 -- early cpu_to_node Andi Kleen
  2 siblings, 0 replies; 13+ messages in thread
From: Ravikiran G Thirumalai @ 2005-12-15  2:35 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel, discuss, Andrew Morton

Helper patch to change cpu_pda users to use macros to access cpu_pda
instead of the cpu_pda[] array.

Signed-off-by: Ravikiran Thirumalai <kiran@scalex86.org>
Signed-off-by: Shai Fultheim <shai@scalex86.org>

Index: linux-2.6.15-rc1git/arch/x86_64/kernel/irq.c
===================================================================
--- linux-2.6.15-rc1git.orig/arch/x86_64/kernel/irq.c	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.15-rc1git/arch/x86_64/kernel/irq.c	2005-11-16 14:08:14.000000000 -0800
@@ -69,13 +69,13 @@
 		seq_printf(p, "NMI: ");
 		for (j = 0; j < NR_CPUS; j++)
 			if (cpu_online(j))
-				seq_printf(p, "%10u ", cpu_pda[j].__nmi_count);
+				seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count);
 		seq_putc(p, '\n');
 #ifdef CONFIG_X86_LOCAL_APIC
 		seq_printf(p, "LOC: ");
 		for (j = 0; j < NR_CPUS; j++)
 			if (cpu_online(j))
-				seq_printf(p, "%10u ", cpu_pda[j].apic_timer_irqs);
+				seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs);
 		seq_putc(p, '\n');
 #endif
 		seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
Index: linux-2.6.15-rc1git/arch/x86_64/kernel/nmi.c
===================================================================
--- linux-2.6.15-rc1git.orig/arch/x86_64/kernel/nmi.c	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.15-rc1git/arch/x86_64/kernel/nmi.c	2005-11-16 14:08:14.000000000 -0800
@@ -155,19 +155,19 @@
 		smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0);
 
 	for (cpu = 0; cpu < NR_CPUS; cpu++)
-		counts[cpu] = cpu_pda[cpu].__nmi_count; 
+		counts[cpu] = cpu_pda(cpu)->__nmi_count; 
 	local_irq_enable();
 	mdelay((10*1000)/nmi_hz); // wait 10 ticks
 
 	for (cpu = 0; cpu < NR_CPUS; cpu++) {
 		if (!cpu_online(cpu))
 			continue;
-		if (cpu_pda[cpu].__nmi_count - counts[cpu] <= 5) {
+		if (cpu_pda(cpu)->__nmi_count - counts[cpu] <= 5) {
 			endflag = 1;
 			printk("CPU#%d: NMI appears to be stuck (%d->%d)!\n",
 			       cpu,
 			       counts[cpu],
-			       cpu_pda[cpu].__nmi_count);
+			       cpu_pda(cpu)->__nmi_count);
 			nmi_active = 0;
 			lapic_nmi_owner &= ~LAPIC_NMI_WATCHDOG;
 			nmi_perfctr_msr = 0;
Index: linux-2.6.15-rc1git/arch/x86_64/kernel/setup64.c
===================================================================
--- linux-2.6.15-rc1git.orig/arch/x86_64/kernel/setup64.c	2005-11-16 12:13:40.000000000 -0800
+++ linux-2.6.15-rc1git/arch/x86_64/kernel/setup64.c	2005-11-16 14:08:14.000000000 -0800
@@ -30,7 +30,7 @@
 
 cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
 
-struct x8664_pda cpu_pda[NR_CPUS] __cacheline_aligned; 
+struct x8664_pda _cpu_pda[NR_CPUS] __cacheline_aligned; 
 
 struct desc_ptr idt_descr = { 256 * 16, (unsigned long) idt_table }; 
 
@@ -110,18 +110,18 @@
 		}
 		if (!ptr)
 			panic("Cannot allocate cpu data for CPU %d\n", i);
-		cpu_pda[i].data_offset = ptr - __per_cpu_start;
+		cpu_pda(i)->data_offset = ptr - __per_cpu_start;
 		memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
 	}
 } 
 
 void pda_init(int cpu)
 { 
-	struct x8664_pda *pda = &cpu_pda[cpu];
+	struct x8664_pda *pda = cpu_pda(cpu);
 
 	/* Setup up data that may be needed in __get_free_pages early */
 	asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); 
-	wrmsrl(MSR_GS_BASE, cpu_pda + cpu);
+	wrmsrl(MSR_GS_BASE, pda);
 
 	pda->cpunumber = cpu; 
 	pda->irqcount = -1;
Index: linux-2.6.15-rc1git/arch/x86_64/kernel/smpboot.c
===================================================================
--- linux-2.6.15-rc1git.orig/arch/x86_64/kernel/smpboot.c	2005-11-16 12:13:40.000000000 -0800
+++ linux-2.6.15-rc1git/arch/x86_64/kernel/smpboot.c	2005-11-16 14:08:14.000000000 -0800
@@ -778,7 +778,7 @@
 
 do_rest:
 
-	cpu_pda[cpu].pcurrent = c_idle.idle;
+	cpu_pda(cpu)->pcurrent = c_idle.idle;
 
 	start_rip = setup_trampoline();
 
Index: linux-2.6.15-rc1git/arch/x86_64/kernel/traps.c
===================================================================
--- linux-2.6.15-rc1git.orig/arch/x86_64/kernel/traps.c	2005-11-16 12:13:40.000000000 -0800
+++ linux-2.6.15-rc1git/arch/x86_64/kernel/traps.c	2005-11-16 14:08:14.000000000 -0800
@@ -158,7 +158,7 @@
 {
 	unsigned long addr;
 	const unsigned cpu = safe_smp_processor_id();
-	unsigned long *irqstack_end = (unsigned long *)cpu_pda[cpu].irqstackptr;
+	unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
 	int i;
 	unsigned used = 0;
 
@@ -226,8 +226,8 @@
 	unsigned long *stack;
 	int i;
 	const int cpu = safe_smp_processor_id();
-	unsigned long *irqstack_end = (unsigned long *) (cpu_pda[cpu].irqstackptr);
-	unsigned long *irqstack = (unsigned long *) (cpu_pda[cpu].irqstackptr - IRQSTACKSIZE);    
+	unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr);
+	unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);    
 
 	// debugging aid: "show_stack(NULL, NULL);" prints the
 	// back trace for this cpu.
@@ -275,7 +275,7 @@
 	int in_kernel = !user_mode(regs);
 	unsigned long rsp;
 	const int cpu = safe_smp_processor_id(); 
-	struct task_struct *cur = cpu_pda[cpu].pcurrent; 
+	struct task_struct *cur = cpu_pda(cpu)->pcurrent; 
 
 		rsp = regs->rsp;
 
Index: linux-2.6.15-rc1git/arch/x86_64/kernel/x8664_ksyms.c
===================================================================
--- linux-2.6.15-rc1git.orig/arch/x86_64/kernel/x8664_ksyms.c	2005-11-16 12:13:40.000000000 -0800
+++ linux-2.6.15-rc1git/arch/x86_64/kernel/x8664_ksyms.c	2005-11-16 14:08:14.000000000 -0800
@@ -109,7 +109,7 @@
 EXPORT_SYMBOL(copy_page);
 EXPORT_SYMBOL(clear_page);
 
-EXPORT_SYMBOL(cpu_pda);
+EXPORT_SYMBOL(_cpu_pda);
 #ifdef CONFIG_SMP
 EXPORT_SYMBOL(cpu_data);
 EXPORT_SYMBOL(cpu_online_map);
Index: linux-2.6.15-rc1git/arch/x86_64/mm/numa.c
===================================================================
--- linux-2.6.15-rc1git.orig/arch/x86_64/mm/numa.c	2005-11-16 12:13:40.000000000 -0800
+++ linux-2.6.15-rc1git/arch/x86_64/mm/numa.c	2005-11-16 14:11:41.000000000 -0800
@@ -270,7 +270,7 @@
 
 void __cpuinit numa_set_node(int cpu, int node)
 {
-	cpu_pda[cpu].nodenumber = node;
+	cpu_pda(cpu)->nodenumber = node;
 	cpu_to_node[cpu] = node;
 }
 
Index: linux-2.6.15-rc1git/include/asm-x86_64/pda.h
===================================================================
--- linux-2.6.15-rc1git.orig/include/asm-x86_64/pda.h	2005-11-16 12:13:40.000000000 -0800
+++ linux-2.6.15-rc1git/include/asm-x86_64/pda.h	2005-11-16 14:08:14.000000000 -0800
@@ -27,7 +27,9 @@
 #define IRQSTACK_ORDER 2
 #define IRQSTACKSIZE (PAGE_SIZE << IRQSTACK_ORDER) 
 
-extern struct x8664_pda cpu_pda[];
+extern struct x8664_pda _cpu_pda[];
+
+#define cpu_pda(i) (&_cpu_pda[i])
 
 /* 
  * There is no fast way to get the base address of the PDA, all the accesses
Index: linux-2.6.15-rc1git/include/asm-x86_64/percpu.h
===================================================================
--- linux-2.6.15-rc1git.orig/include/asm-x86_64/percpu.h	2005-10-27 17:02:08.000000000 -0700
+++ linux-2.6.15-rc1git/include/asm-x86_64/percpu.h	2005-11-16 14:08:14.000000000 -0800
@@ -11,7 +11,7 @@
 
 #include <asm/pda.h>
 
-#define __per_cpu_offset(cpu) (cpu_pda[cpu].data_offset)
+#define __per_cpu_offset(cpu) (cpu_pda(cpu)->data_offset)
 #define __my_cpu_offset() read_pda(data_offset)
 
 /* Separate out the type, so (int[3], foo) works. */

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [patch 3/3] x86_64: Node local pda take 2 -- node local pda allocation
  2005-12-15  2:33 [patch 1/3] x86_64: Node local pda take 2 -- early cpu_to_node Ravikiran G Thirumalai
  2005-12-15  2:35 ` [patch 2/3] x86_64: Node local pda take 2 -- cpu_pda_prep Ravikiran G Thirumalai
@ 2005-12-15  2:37 ` Ravikiran G Thirumalai
  2005-12-15  8:22   ` Eric Dumazet
  2005-12-15  9:42   ` [discuss] " Andi Kleen
  2005-12-15  9:44 ` [discuss] [patch 1/3] x86_64: Node local pda take 2 -- early cpu_to_node Andi Kleen
  2 siblings, 2 replies; 13+ messages in thread
From: Ravikiran G Thirumalai @ 2005-12-15  2:37 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel, discuss, Andrew Morton, dada1

Patch uses a static PDA array early at boot and reallocates processor PDA
with node local memory when kmalloc is ready, just before pda_init.
The boot_cpu_pda is needed since the cpu_pda is used even before pda_init for
that cpu is called.   
(pda_init is called when APs are brought on at rest_init().  But
setup_per_cpu_areas is called early in start_kernel and 
sched_init uses the per-cpu offset table early)

Signed-off-by: Ravikiran Thirumalai <kiran@scalex86.org>
Signed-off-by: Shai Fultheim <shai@scalex86.org>

Index: linux-2.6.15-rc4/arch/x86_64/kernel/head64.c
===================================================================
--- linux-2.6.15-rc4.orig/arch/x86_64/kernel/head64.c	2005-12-12 01:11:01.000000000 -0800
+++ linux-2.6.15-rc4/arch/x86_64/kernel/head64.c	2005-12-12 02:24:02.000000000 -0800
@@ -92,6 +92,11 @@
 	memcpy(init_level4_pgt, boot_level4_pgt, PTRS_PER_PGD*sizeof(pgd_t));
 	asm volatile("movq %0,%%cr3" :: "r" (__pa_symbol(&init_level4_pgt)));
 
+#ifdef CONFIG_NUMA
+ 	for (i = 0; i < NR_CPUS; i++)
+ 		cpu_pda(i) = &boot_cpu_pda[i];
+#endif
+
 	pda_init(0);
 	copy_bootdata(real_mode_data);
 #ifdef CONFIG_SMP
Index: linux-2.6.15-rc4/arch/x86_64/kernel/setup64.c
===================================================================
--- linux-2.6.15-rc4.orig/arch/x86_64/kernel/setup64.c	2005-12-12 02:24:00.000000000 -0800
+++ linux-2.6.15-rc4/arch/x86_64/kernel/setup64.c	2005-12-12 02:24:02.000000000 -0800
@@ -30,7 +30,12 @@
 
 cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
 
-struct x8664_pda _cpu_pda[NR_CPUS] __cacheline_aligned; 
+#ifdef CONFIG_NUMA
+struct x8664_pda *_cpu_pda[NR_CPUS] __read_mostly; 
+struct x8664_pda boot_cpu_pda[NR_CPUS] __cacheline_aligned;
+#else
+struct x8664_pda _cpu_pda[NR_CPUS] __read_mostly;
+#endif
 
 struct desc_ptr idt_descr = { 256 * 16, (unsigned long) idt_table }; 
 
@@ -119,6 +124,25 @@
 { 
 	struct x8664_pda *pda = cpu_pda(cpu);
 
+#ifdef CONFIG_NUMA
+	/* Allocate node local memory for AP pdas */
+	if (cpu) {
+		struct x8664_pda *newpda;
+		newpda = kmalloc_node(sizeof (struct x8664_pda), GFP_ATOMIC,
+				      cpu_to_node(cpu));
+		if (newpda) {
+			printk("Allocating node local PDA for cpu %d at 0x%lx\n",
+				cpu, (unsigned long) newpda);
+			memcpy(newpda, pda, sizeof (struct x8664_pda));
+			pda = newpda;
+			cpu_pda(cpu) = pda;
+		}
+		else
+			printk("Could not allocate node local PDA for cpu %d\n",
+				cpu);
+	}
+#endif
+
 	/* Setup up data that may be needed in __get_free_pages early */
 	asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); 
 	wrmsrl(MSR_GS_BASE, pda);
Index: linux-2.6.15-rc4/include/asm-x86_64/pda.h
===================================================================
--- linux-2.6.15-rc4.orig/include/asm-x86_64/pda.h	2005-12-12 02:24:00.000000000 -0800
+++ linux-2.6.15-rc4/include/asm-x86_64/pda.h	2005-12-12 02:24:02.000000000 -0800
@@ -27,9 +27,14 @@
 #define IRQSTACK_ORDER 2
 #define IRQSTACKSIZE (PAGE_SIZE << IRQSTACK_ORDER) 
 
+#ifdef CONFIG_NUMA
+extern struct x8664_pda *_cpu_pda[];
+extern struct x8664_pda boot_cpu_pda[];
+#define cpu_pda(i) (_cpu_pda[i])
+#else
 extern struct x8664_pda _cpu_pda[];
-
 #define cpu_pda(i) (&_cpu_pda[i])
+#endif
 
 /* 
  * There is no fast way to get the base address of the PDA, all the accesses

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch 3/3] x86_64: Node local pda take 2 -- node local pda allocation
  2005-12-15  2:37 ` [patch 3/3] x86_64: Node local pda take 2 -- node local pda allocation Ravikiran G Thirumalai
@ 2005-12-15  8:22   ` Eric Dumazet
  2005-12-15  9:36     ` Andi Kleen
  2005-12-15  9:42   ` [discuss] " Andi Kleen
  1 sibling, 1 reply; 13+ messages in thread
From: Eric Dumazet @ 2005-12-15  8:22 UTC (permalink / raw)
  To: Ravikiran G Thirumalai
  Cc: Andi Kleen, linux-kernel, discuss, Andrew Morton, dada1

Ravikiran G Thirumalai a écrit :
> Patch uses a static PDA array early at boot and reallocates processor PDA
> with node local memory when kmalloc is ready, just before pda_init.
> The boot_cpu_pda is needed since the cpu_pda is used even before pda_init for
> that cpu is called.   
> (pda_init is called when APs are brought on at rest_init().  But
> setup_per_cpu_areas is called early in start_kernel and 
> sched_init uses the per-cpu offset table early)

That seems good, thank you !

Do you have an idea of the performance gain we could expect from this node 
local pda allocation ?

Say a CPU is on Node 1,  was a change in the pda (allocated on Node 0) immediately
mirrored on the remote node or not?

Eric

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [patch 3/3] x86_64: Node local pda take 2 -- node local pda allocation
  2005-12-15  8:22   ` Eric Dumazet
@ 2005-12-15  9:36     ` Andi Kleen
  0 siblings, 0 replies; 13+ messages in thread
From: Andi Kleen @ 2005-12-15  9:36 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Ravikiran G Thirumalai, Andi Kleen, linux-kernel, discuss,
	Andrew Morton, dada1

> Do you have an idea of the performance gain we could expect from this node 
> local pda allocation ?

I wouldn't expect very much.

> Say a CPU is on Node 1,  was a change in pda (allocated on Node 0) 
> immediatly mirrored on remote node or not ?

The Opteron caches are write back afaik - this means data only leaves
the L2 cache when other data pushes it out.
But the additional traffic on the interconnect was likely negligible. 

If anything I would expect the reduced latency when a user space program eats up all
the cache and the PDA is needed on the next kernel entry to be helpful.

But it's not very much at least on an Opteron because the NUMA factor
isn't that bad. On Kiran's machines which likely have a higher NUMA 
factor I guess it helps more.

-Andi

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [discuss] [patch 3/3] x86_64: Node local pda take 2 -- node local pda allocation
  2005-12-15  2:37 ` [patch 3/3] x86_64: Node local pda take 2 -- node local pda allocation Ravikiran G Thirumalai
  2005-12-15  8:22   ` Eric Dumazet
@ 2005-12-15  9:42   ` Andi Kleen
  2005-12-15 18:47     ` Ravikiran G Thirumalai
  1 sibling, 1 reply; 13+ messages in thread
From: Andi Kleen @ 2005-12-15  9:42 UTC (permalink / raw)
  To: Ravikiran G Thirumalai
  Cc: Andi Kleen, linux-kernel, discuss, Andrew Morton, dada1

On Wed, Dec 14, 2005 at 06:37:48PM -0800, Ravikiran G Thirumalai wrote:
> Patch uses a static PDA array early at boot and reallocates processor PDA
> with node local memory when kmalloc is ready, just before pda_init.
> The boot_cpu_pda is needed since the cpu_pda is used even before pda_init for
> that cpu is called.   
> (pda_init is called when APs are brought on at rest_init().  But
> setup_per_cpu_areas is called early in start_kernel and 
> sched_init uses the per-cpu offset table early)
> 

That is why I suggested to allocate it in smpboot.c in advance before
starting the AP.  Can you please do that change? 

-Andi


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [discuss] [patch 1/3] x86_64: Node local pda take 2 -- early cpu_to_node
  2005-12-15  2:33 [patch 1/3] x86_64: Node local pda take 2 -- early cpu_to_node Ravikiran G Thirumalai
  2005-12-15  2:35 ` [patch 2/3] x86_64: Node local pda take 2 -- cpu_pda_prep Ravikiran G Thirumalai
  2005-12-15  2:37 ` [patch 3/3] x86_64: Node local pda take 2 -- node local pda allocation Ravikiran G Thirumalai
@ 2005-12-15  9:44 ` Andi Kleen
  2005-12-15 19:01   ` Ravikiran G Thirumalai
  2 siblings, 1 reply; 13+ messages in thread
From: Andi Kleen @ 2005-12-15  9:44 UTC (permalink / raw)
  To: Ravikiran G Thirumalai; +Cc: Andi Kleen, linux-kernel, discuss, Andrew Morton

On Wed, Dec 14, 2005 at 06:33:45PM -0800, Ravikiran G Thirumalai wrote:
> Here is take 2 on x86_64 node local pda allocation.
> 
> This patchset does away with the extra memory reference for non CONFIG_NUMA
> case.  The early cpu_to_node helps AMD and EM64T systems which work well
> with CONFIG_ACPI_NUMA.  cpu_to_node is not inited early for AMD systems
> which work only with old style K8_NUMA. (Tested on EM64 NUMA and Tyan K8
> dual core 4 cpu boxes)

Thanks for now testing on AMD too - that makes me more confident in your
patches.

> Andi, I could not eliminate the need for a initial static pda array, since
> sched_init needs the static per-cpu offset array for NR_CPUS early.  Hope
> this is OK.

See my comment.

> + * Setup cpu_to_node using the SRAT lapcis & ACPI MADT table
> + * info.
> + */
> +void __init init_cpu_to_node(void)
> +{
> +	int i;	
> + 	for (i = 0; i < NR_CPUS; i++)
> + 		cpu_to_node[i] = apicid_to_node[x86_cpu_to_apicid[i]];
> +}

I would prefer it if you moved that to numa.c and run always 
(even for the k8topology case). Otherwise k8topology will behave
differently whether CONFIG_ACPI_NUMA is set or not, and I don't like
that.

-Andi


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [discuss] [patch 3/3] x86_64: Node local pda take 2 -- node local pda allocation
  2005-12-15  9:42   ` [discuss] " Andi Kleen
@ 2005-12-15 18:47     ` Ravikiran G Thirumalai
  2005-12-16  0:19       ` Andi Kleen
  0 siblings, 1 reply; 13+ messages in thread
From: Ravikiran G Thirumalai @ 2005-12-15 18:47 UTC (permalink / raw)
  To: Andi Kleen
  Cc: linux-kernel, discuss, Andrew Morton, dada1,
	Shai Fultheim (Shai@scalex86.org)

On Thu, Dec 15, 2005 at 10:42:32AM +0100, Andi Kleen wrote:
> On Wed, Dec 14, 2005 at 06:37:48PM -0800, Ravikiran G Thirumalai wrote:
> > Patch uses a static PDA array early at boot and reallocates processor PDA
> > with node local memory when kmalloc is ready, just before pda_init.
> > The boot_cpu_pda is needed since the cpu_pda is used even before pda_init for
> > that cpu is called.   
> > (pda_init is called when APs are brought on at rest_init().  But
> > setup_per_cpu_areas is called early in start_kernel and 
> > sched_init uses the per-cpu offset table early)
> > 
> 
> That is why I suggested to allocate it in smpboot.c in advance before
> starting the AP.  Can you please do that change? 

Maybe I am missing something, or not getting what you are suggesting;
As I see it,

asmlinkage void __init start_kernel(void)
{
	...
	...
	...
	setup_arch(&command_line);  --> (1)
	setup_per_cpu_areas();	    --> (2)
	...
	sched_init();		    --> (3)
	...
        vfs_caches_init_early();
        mem_init();
        kmem_cache_init();	    --> (4)
	...
	rest_init()		    --> (5)
}
	

I could allocate memory for pda somewhere in setup_arch after cpu_to_node is
initialized, but I would have to use alloc_bootmem_node and allocate for 
NR_CPUS, which could be wasteful.  I cannot use kmalloc_node until after (4) 
above, and sched_init refers to the per-cpu offset table before that.

So are you suggesting I use alloc_bootmem_node and allocate PDA for
NR_CPUS?


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [discuss] [patch 1/3] x86_64: Node local pda take 2 -- early cpu_to_node
  2005-12-15  9:44 ` [discuss] [patch 1/3] x86_64: Node local pda take 2 -- early cpu_to_node Andi Kleen
@ 2005-12-15 19:01   ` Ravikiran G Thirumalai
  2005-12-16  0:20     ` Andi Kleen
  0 siblings, 1 reply; 13+ messages in thread
From: Ravikiran G Thirumalai @ 2005-12-15 19:01 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel, discuss, Andrew Morton

On Thu, Dec 15, 2005 at 10:44:37AM +0100, Andi Kleen wrote:
> On Wed, Dec 14, 2005 at 06:33:45PM -0800, Ravikiran G Thirumalai wrote:
> > + * info.
> > + */
> > +void __init init_cpu_to_node(void)
> > +{
> > +	int i;	
> > + 	for (i = 0; i < NR_CPUS; i++)
> > + 		cpu_to_node[i] = apicid_to_node[x86_cpu_to_apicid[i]];
> > +}
> 
> I would prefer it if you moved that to numa.c and run always 
> (even for the k8topology case). Otherwise k8topology will behave
> differently whether CONFIG_ACPI_NUMA is set or not, and I don't like
> that.

Sure!  I moved it to srat.c based on your suggestion to my earlier post.  
I will move this to numa.c.

Thanks,
Kiran

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [discuss] [patch 3/3] x86_64: Node local pda take 2 -- node local pda allocation
  2005-12-15 18:47     ` Ravikiran G Thirumalai
@ 2005-12-16  0:19       ` Andi Kleen
  2005-12-16  3:55         ` Ravikiran G Thirumalai
  0 siblings, 1 reply; 13+ messages in thread
From: Andi Kleen @ 2005-12-16  0:19 UTC (permalink / raw)
  To: Ravikiran G Thirumalai
  Cc: Andi Kleen, linux-kernel, discuss, Andrew Morton, dada1,
	Shai Fultheim (Shai@scalex86.org)

> So are you suggesting I use alloc_bootmem_node and allocate PDA for
> NR_CPUS?

Continue to allocate the boot PDA of the BP statically - this should
be ok because BP should be always on node 0 (or if you're paranoid
about it you could also reallocate, but it's probably not needed) 

And for the APs you allocate the PDA in smpboot.c before actually sending
the startup IPI to the AP. 

-Andi


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [discuss] [patch 1/3] x86_64: Node local pda take 2 -- early cpu_to_node
  2005-12-15 19:01   ` Ravikiran G Thirumalai
@ 2005-12-16  0:20     ` Andi Kleen
  2005-12-16  8:11       ` Ravikiran G Thirumalai
  0 siblings, 1 reply; 13+ messages in thread
From: Andi Kleen @ 2005-12-16  0:20 UTC (permalink / raw)
  To: Ravikiran G Thirumalai; +Cc: Andi Kleen, linux-kernel, discuss, Andrew Morton

On Thu, Dec 15, 2005 at 11:01:42AM -0800, Ravikiran G Thirumalai wrote:
> On Thu, Dec 15, 2005 at 10:44:37AM +0100, Andi Kleen wrote:
> > On Wed, Dec 14, 2005 at 06:33:45PM -0800, Ravikiran G Thirumalai wrote:
> > > + * info.
> > > + */
> > > +void __init init_cpu_to_node(void)
> > > +{
> > > +	int i;	
> > > + 	for (i = 0; i < NR_CPUS; i++)
> > > + 		cpu_to_node[i] = apicid_to_node[x86_cpu_to_apicid[i]];
> > > +}
> > 
> > I would prefer it if you moved that to numa.c and run always 
> > (even for the k8topology case). Otherwise k8topology will behave
> > differently whether CONFIG_ACPI_NUMA is set or not, and I don't like
> > that.
> 
> Sure!  I moved it to srat.c based on your suggestion to my earlier post.  
> I will move this to numa.c.

Sorry for changing my mind on this. I hope you can bear with me.

-Andi

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [discuss] [patch 3/3] x86_64: Node local pda take 2 -- node local pda allocation
  2005-12-16  0:19       ` Andi Kleen
@ 2005-12-16  3:55         ` Ravikiran G Thirumalai
  0 siblings, 0 replies; 13+ messages in thread
From: Ravikiran G Thirumalai @ 2005-12-16  3:55 UTC (permalink / raw)
  To: Andi Kleen
  Cc: linux-kernel, discuss, Andrew Morton, dada1,
	Shai Fultheim (Shai@scalex86.org)

On Fri, Dec 16, 2005 at 01:19:34AM +0100, Andi Kleen wrote:
> 
> And for the APs you allocate the PDA in smpboot.c before actually sending
> the startup IPI to the AP. 

You mean wakeup_secondary_via_INIT, called by do_boot_cpu?
That is too late. sched_init happens much earlier, and the per-cpu offset
table for all AP cpus not present is referenced, and I hit an early exception.
sched_init is executed on the BP very early and sched_init does this:

        for (i = 0; i < NR_CPUS; i++) {
                prio_array_t *array;

                rq = cpu_rq(i); 

The cpu_rq macro ends up needing per-cpu offset table stored in cpu_pda of
the AP cpus, even before we hit the code to send startup IPIs.
(#define __per_cpu_offset(cpu) (cpu_pda[cpu].data_offset))
This is way before slab is ready.  So I either use alloc_bootmem before
sched_init in setup_arch, or keep the static boot_cpu_pda.

Am I missing something?

Thanks,
Kiran

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [discuss] [patch 1/3] x86_64: Node local pda take 2 -- early cpu_to_node
  2005-12-16  0:20     ` Andi Kleen
@ 2005-12-16  8:11       ` Ravikiran G Thirumalai
  0 siblings, 0 replies; 13+ messages in thread
From: Ravikiran G Thirumalai @ 2005-12-16  8:11 UTC (permalink / raw)
  To: Andi Kleen; +Cc: linux-kernel, discuss, Andrew Morton

On Fri, Dec 16, 2005 at 01:20:01AM +0100, Andi Kleen wrote:
> On Thu, Dec 15, 2005 at 11:01:42AM -0800, Ravikiran G Thirumalai wrote:
> > On Thu, Dec 15, 2005 at 10:44:37AM +0100, Andi Kleen wrote:
> > > On Wed, Dec 14, 2005 at 06:33:45PM -0800, Ravikiran G Thirumalai wrote:
> > 
> > Sure!  I moved it to srat.c based on your suggestion to my earlier post.  
> > I will move this to numa.c.
> 
> Sorry for changing my mind on this. I hope you can bear with me.

No problem.  I hadn't done this earlier 'cause I didn't have a K8 box to
test.  Here is the modified patch.

Thanks,
Kiran

---
Patch enables early initialization of cpu_to_node.
apicid_to_node is built by reading the SRAT table, from acpi_numa_init with 
ACPI_NUMA and k8_scan_nodes with K8_NUMA.
x86_cpu_to_apicid is built by parsing the ACPI MADT table, from acpi_boot_init. We combine these two tables and set up cpu_to_node.

Early initialization helps the static per_cpu_areas in getting pages from 
the correct node.

Patch tested on TYAN dual core 4P board with K8 only and then ACPI_NUMA.
Tested on EM64T NUMA too.
 
Signed-off-by: Alok N Kataria <alokk@calsoftinc.com>
Signed-off-by: Ravikiran Thirumalai <kiran@scalex86.org>

Index: linux-2.6.15-rc5/arch/x86_64/kernel/setup.c
===================================================================
--- linux-2.6.15-rc5.orig/arch/x86_64/kernel/setup.c	2005-12-14 17:02:14.000000000 -0800
+++ linux-2.6.15-rc5/arch/x86_64/kernel/setup.c	2005-12-14 17:16:07.000000000 -0800
@@ -669,6 +669,8 @@
 	acpi_boot_init();
 #endif
 
+	init_cpu_to_node();
+
 #ifdef CONFIG_X86_LOCAL_APIC
 	/*
 	 * get boot-time SMP configuration:
Index: linux-2.6.15-rc5/arch/x86_64/mm/numa.c
===================================================================
--- linux-2.6.15-rc5.orig/arch/x86_64/mm/numa.c	2005-12-15 12:44:39.000000000 -0800
+++ linux-2.6.15-rc5/arch/x86_64/mm/numa.c	2005-12-15 23:03:07.000000000 -0800
@@ -330,6 +330,16 @@
 	return 1;
 } 
 
+/*
+ * Setup early cpu_to_node.
+ */
+void __init init_cpu_to_node(void)
+{
+	int i;	
+ 	for (i = 0; i < NR_CPUS; i++)
+ 		cpu_to_node[i] = apicid_to_node[x86_cpu_to_apicid[i]];
+}
+
 EXPORT_SYMBOL(cpu_to_node);
 EXPORT_SYMBOL(node_to_cpumask);
 EXPORT_SYMBOL(memnode_shift);
Index: linux-2.6.15-rc5/include/asm-x86_64/numa.h
===================================================================
--- linux-2.6.15-rc5.orig/include/asm-x86_64/numa.h	2005-12-14 15:33:35.000000000 -0800
+++ linux-2.6.15-rc5/include/asm-x86_64/numa.h	2005-12-15 23:11:35.000000000 -0800
@@ -21,6 +21,11 @@
 
 extern unsigned char apicid_to_node[256];
 
+#ifdef CONFIG_NUMA
+extern void __init init_cpu_to_node(void);
+#else
+#define init_cpu_to_node() do {} while (0)
+#endif
 #define NUMA_NO_NODE 0xff
 
 #endif

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2005-12-16  8:11 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2005-12-15  2:33 [patch 1/3] x86_64: Node local pda take 2 -- early cpu_to_node Ravikiran G Thirumalai
2005-12-15  2:35 ` [patch 2/3] x86_64: Node local pda take 2 -- cpu_pda_prep Ravikiran G Thirumalai
2005-12-15  2:37 ` [patch 3/3] x86_64: Node local pda take 2 -- node local pda allocation Ravikiran G Thirumalai
2005-12-15  8:22   ` Eric Dumazet
2005-12-15  9:36     ` Andi Kleen
2005-12-15  9:42   ` [discuss] " Andi Kleen
2005-12-15 18:47     ` Ravikiran G Thirumalai
2005-12-16  0:19       ` Andi Kleen
2005-12-16  3:55         ` Ravikiran G Thirumalai
2005-12-15  9:44 ` [discuss] [patch 1/3] x86_64: Node local pda take 2 -- early cpu_to_node Andi Kleen
2005-12-15 19:01   ` Ravikiran G Thirumalai
2005-12-16  0:20     ` Andi Kleen
2005-12-16  8:11       ` Ravikiran G Thirumalai

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).