* [PATCH] Allow per-cpu variables to be page-aligned
@ 2007-03-21  6:10 Rusty Russell
  2007-03-21  6:30 ` [PATCH 0/4] i386 GDT cleanups Rusty Russell
  2007-03-21  9:21 ` [PATCH] Allow per-cpu variables to be page-aligned Eric W. Biederman
  0 siblings, 2 replies; 19+ messages in thread
From: Rusty Russell @ 2007-03-21  6:10 UTC (permalink / raw)
  To: Andrew Morton, Ingo Molnar, Andi Kleen; +Cc: lkml - Kernel Mailing List

[This was part of the GDT cleanups and per-cpu -> pda changes, which I
have revised, but this stands on its own.  The only change is catching
the x86-64 per-cpu allocator too].
==
Let's allow page-alignment in general for per-cpu data (wanted by Xen,
and Ingo suggested KVM as well).

Because larger alignments can use more room, we increase the max
per-cpu memory to 64k rather than 32k: it's getting a little tight.
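
For illustration only (a sketch, not part of this patch; "example_page"
and "example_area" are made-up names), a user such as Xen could then
declare a page-aligned per-cpu object and rely on every CPU's copy
really being page-aligned:

	#include <linux/percpu.h>	/* DEFINE_PER_CPU, per_cpu */
	#include <asm/page.h>		/* PAGE_SIZE */

	/* One page of per-cpu data, aligned to a page boundary. */
	struct example_page {
		char data[PAGE_SIZE];
	} __attribute__((__aligned__(PAGE_SIZE)));

	DEFINE_PER_CPU(struct example_page, example_area);

	/* per_cpu(example_area, cpu) is now page-aligned on every cpu. */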

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Ingo Molnar <mingo@elte.hu>
---
 arch/alpha/kernel/vmlinux.lds.S   |    2 +-
 arch/arm/kernel/vmlinux.lds.S     |    2 +-
 arch/cris/arch-v32/vmlinux.lds.S  |    1 +
 arch/frv/kernel/vmlinux.lds.S     |    1 +
 arch/i386/kernel/vmlinux.lds.S    |    2 +-
 arch/m32r/kernel/vmlinux.lds.S    |    2 +-
 arch/mips/kernel/vmlinux.lds.S    |    2 +-
 arch/parisc/kernel/vmlinux.lds.S  |    2 +-
 arch/powerpc/kernel/setup_64.c    |    4 ++--
 arch/powerpc/kernel/vmlinux.lds.S |    6 +-----
 arch/ppc/kernel/vmlinux.lds.S     |    2 +-
 arch/s390/kernel/vmlinux.lds.S    |    2 +-
 arch/sh/kernel/vmlinux.lds.S      |    2 +-
 arch/sh64/kernel/vmlinux.lds.S    |    2 +-
 arch/sparc/kernel/vmlinux.lds.S   |    2 +-
 arch/sparc64/kernel/smp.c         |    6 +++---
 arch/x86_64/kernel/setup64.c      |    4 ++--
 arch/x86_64/kernel/vmlinux.lds.S  |    2 +-
 arch/xtensa/kernel/vmlinux.lds.S  |    2 +-
 include/linux/percpu.h            |    2 +-
 init/main.c                       |    4 ++--
 kernel/module.c                   |   10 +++++-----
 22 files changed, 31 insertions(+), 33 deletions(-)

===================================================================
--- a/arch/alpha/kernel/vmlinux.lds.S
+++ b/arch/alpha/kernel/vmlinux.lds.S
@@ -69,7 +69,7 @@ SECTIONS
   . = ALIGN(8);
   SECURITY_INIT
 
-  . = ALIGN(64);
+  . = ALIGN(8192);
   __per_cpu_start = .;
   .data.percpu : { *(.data.percpu) }
   __per_cpu_end = .;
===================================================================
--- a/arch/arm/kernel/vmlinux.lds.S
+++ b/arch/arm/kernel/vmlinux.lds.S
@@ -59,7 +59,7 @@ SECTIONS
 			usr/built-in.o(.init.ramfs)
 		__initramfs_end = .;
 #endif
-		. = ALIGN(64);
+		. = ALIGN(4096);
 		__per_cpu_start = .;
 			*(.data.percpu)
 		__per_cpu_end = .;
===================================================================
--- a/arch/cris/arch-v32/vmlinux.lds.S
+++ b/arch/cris/arch-v32/vmlinux.lds.S
@@ -91,6 +91,7 @@ SECTIONS
 	}
 	SECURITY_INIT
 
+	. =  ALIGN (8192);
 	__per_cpu_start = .;
 	.data.percpu  : { *(.data.percpu) }
 	__per_cpu_end = .;
===================================================================
--- a/arch/frv/kernel/vmlinux.lds.S
+++ b/arch/frv/kernel/vmlinux.lds.S
@@ -57,6 +57,7 @@ SECTIONS
   __alt_instructions_end = .;
  .altinstr_replacement : { *(.altinstr_replacement) }
 
+  . = ALIGN(4096);
   __per_cpu_start = .;
   .data.percpu  : { *(.data.percpu) }
   __per_cpu_end = .;
===================================================================
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -195,7 +195,7 @@ SECTIONS
 	__initramfs_end = .;
   }
 #endif
-  . = ALIGN(L1_CACHE_BYTES);
+  . = ALIGN(4096);
   .data.percpu  : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
 	__per_cpu_start = .;
 	*(.data.percpu)
===================================================================
--- a/arch/m32r/kernel/vmlinux.lds.S
+++ b/arch/m32r/kernel/vmlinux.lds.S
@@ -110,7 +110,7 @@ SECTIONS
   __initramfs_end = .;
 #endif
 
-  . = ALIGN(32);
+  . = ALIGN(4096);
   __per_cpu_start = .;
   .data.percpu  : { *(.data.percpu) }
   __per_cpu_end = .;
===================================================================
--- a/arch/mips/kernel/vmlinux.lds.S
+++ b/arch/mips/kernel/vmlinux.lds.S
@@ -119,7 +119,7 @@ SECTIONS
   .init.ramfs : { *(.init.ramfs) }
   __initramfs_end = .;
 #endif
-  . = ALIGN(32);
+  . = ALIGN(_PAGE_SIZE);
   __per_cpu_start = .;
   .data.percpu  : { *(.data.percpu) }
   __per_cpu_end = .;
===================================================================
--- a/arch/parisc/kernel/vmlinux.lds.S
+++ b/arch/parisc/kernel/vmlinux.lds.S
@@ -181,7 +181,7 @@ SECTIONS
   .init.ramfs : { *(.init.ramfs) }
   __initramfs_end = .;
 #endif
-  . = ALIGN(32);
+  . = ALIGN(ASM_PAGE_SIZE);
   __per_cpu_start = .;
   .data.percpu  : { *(.data.percpu) }
   __per_cpu_end = .;
===================================================================
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -583,14 +583,14 @@ void __init setup_per_cpu_areas(void)
 	char *ptr;
 
 	/* Copy section for each CPU (we discard the original) */
-	size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
+	size = ALIGN(__per_cpu_end - __per_cpu_start, PAGE_SIZE);
 #ifdef CONFIG_MODULES
 	if (size < PERCPU_ENOUGH_ROOM)
 		size = PERCPU_ENOUGH_ROOM;
 #endif
 
 	for_each_possible_cpu(i) {
-		ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size);
+		ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size);
 		if (!ptr)
 			panic("Cannot allocate cpu data for CPU %d\n", i);
 
===================================================================
--- a/arch/powerpc/kernel/vmlinux.lds.S
+++ b/arch/powerpc/kernel/vmlinux.lds.S
@@ -139,11 +139,7 @@ SECTIONS
 		__initramfs_end = .;
 	}
 #endif
-#ifdef CONFIG_PPC32
-	. = ALIGN(32);
-#else
-	. = ALIGN(128);
-#endif
+	. = ALIGN(PAGE_SIZE);
 	.data.percpu : {
 		__per_cpu_start = .;
 		*(.data.percpu)
===================================================================
--- a/arch/ppc/kernel/vmlinux.lds.S
+++ b/arch/ppc/kernel/vmlinux.lds.S
@@ -130,7 +130,7 @@ SECTIONS
   __ftr_fixup : { *(__ftr_fixup) }
   __stop___ftr_fixup = .;
 
-  . = ALIGN(32);
+  . = ALIGN(4096);
   __per_cpu_start = .;
   .data.percpu  : { *(.data.percpu) }
   __per_cpu_end = .;
===================================================================
--- a/arch/s390/kernel/vmlinux.lds.S
+++ b/arch/s390/kernel/vmlinux.lds.S
@@ -99,7 +99,7 @@ SECTIONS
   . = ALIGN(2);
   __initramfs_end = .;
 #endif
-  . = ALIGN(256);
+  . = ALIGN(4096);
   __per_cpu_start = .;
   .data.percpu  : { *(.data.percpu) }
   __per_cpu_end = .;
===================================================================
--- a/arch/sh/kernel/vmlinux.lds.S
+++ b/arch/sh/kernel/vmlinux.lds.S
@@ -54,7 +54,7 @@ SECTIONS
   . = ALIGN(PAGE_SIZE);
   .data.page_aligned : { *(.data.page_aligned) }
 
-  . = ALIGN(L1_CACHE_BYTES);
+  . = ALIGN(PAGE_SIZE);
   __per_cpu_start = .;
   .data.percpu : { *(.data.percpu) }
   __per_cpu_end = .;
===================================================================
--- a/arch/sh64/kernel/vmlinux.lds.S
+++ b/arch/sh64/kernel/vmlinux.lds.S
@@ -85,7 +85,7 @@ SECTIONS
   . = ALIGN(PAGE_SIZE);
   .data.page_aligned : C_PHYS(.data.page_aligned) { *(.data.page_aligned) }
 
-  . = ALIGN(L1_CACHE_BYTES);
+  . = ALIGN(PAGE_SIZE);
   __per_cpu_start = .;
   .data.percpu : C_PHYS(.data.percpu) { *(.data.percpu) }
   __per_cpu_end = . ;
===================================================================
--- a/arch/sparc/kernel/vmlinux.lds.S
+++ b/arch/sparc/kernel/vmlinux.lds.S
@@ -65,7 +65,7 @@ SECTIONS
   __initramfs_end = .;
 #endif
 
-  . = ALIGN(32);
+  . = ALIGN(4096);
   __per_cpu_start = .;
   .data.percpu  : { *(.data.percpu) }
   __per_cpu_end = .;
===================================================================
--- a/arch/sparc64/kernel/smp.c
+++ b/arch/sparc64/kernel/smp.c
@@ -1449,11 +1449,11 @@ void __init setup_per_cpu_areas(void)
 	/* Copy section for each CPU (we discard the original) */
 	goal = PERCPU_ENOUGH_ROOM;
 
-	__per_cpu_shift = 0;
-	for (size = 1UL; size < goal; size <<= 1UL)
+	__per_cpu_shift = PAGE_SHIFT;
+	for (size = PAGE_SIZE; size < goal; size <<= 1UL)
 		__per_cpu_shift++;
 
-	ptr = alloc_bootmem(size * NR_CPUS);
+	ptr = alloc_bootmem_pages(size * NR_CPUS);
 
 	__per_cpu_base = ptr - __per_cpu_start;
 
===================================================================
--- a/arch/x86_64/kernel/setup64.c
+++ b/arch/x86_64/kernel/setup64.c
@@ -103,9 +103,9 @@ void __init setup_per_cpu_areas(void)
 		if (!NODE_DATA(cpu_to_node(i))) {
 			printk("cpu with no node %d, num_online_nodes %d\n",
 			       i, num_online_nodes());
-			ptr = alloc_bootmem(size);
+			ptr = alloc_bootmem_pages(size);
 		} else { 
-			ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size);
+			ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size);
 		}
 		if (!ptr)
 			panic("Cannot allocate cpu data for CPU %d\n", i);
===================================================================
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -194,7 +194,7 @@ SECTIONS
   __initramfs_end = .;
 #endif
 
-    . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+  . = ALIGN(4096);
   __per_cpu_start = .;
   .data.percpu  : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) }
   __per_cpu_end = .;
===================================================================
--- a/arch/xtensa/kernel/vmlinux.lds.S
+++ b/arch/xtensa/kernel/vmlinux.lds.S
@@ -198,7 +198,7 @@ SECTIONS
   __ftr_fixup : { *(__ftr_fixup) }
   __stop___ftr_fixup = .;
 
-  . = ALIGN(32);
+  . = ALIGN(4096);
   __per_cpu_start = .;
   .data.percpu  : { *(.data.percpu) }
   __per_cpu_end = .;
===================================================================
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -11,7 +11,7 @@
 
 /* Enough to cover all DEFINE_PER_CPUs in kernel, including modules. */
 #ifndef PERCPU_ENOUGH_ROOM
-#define PERCPU_ENOUGH_ROOM 32768
+#define PERCPU_ENOUGH_ROOM 65536
 #endif
 
 /*
===================================================================
--- a/init/main.c
+++ b/init/main.c
@@ -369,12 +369,12 @@ static void __init setup_per_cpu_areas(v
 	unsigned long nr_possible_cpus = num_possible_cpus();
 
 	/* Copy section for each CPU (we discard the original) */
-	size = ALIGN(__per_cpu_end - __per_cpu_start, SMP_CACHE_BYTES);
+	size = ALIGN(__per_cpu_end - __per_cpu_start, PAGE_SIZE);
 #ifdef CONFIG_MODULES
 	if (size < PERCPU_ENOUGH_ROOM)
 		size = PERCPU_ENOUGH_ROOM;
 #endif
-	ptr = alloc_bootmem(size * nr_possible_cpus);
+	ptr = alloc_bootmem_pages(size * nr_possible_cpus);
 
 	for_each_possible_cpu(i) {
 		__per_cpu_offset[i] = ptr - __per_cpu_start;
===================================================================
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -346,10 +346,10 @@ static void *percpu_modalloc(unsigned lo
 	unsigned int i;
 	void *ptr;
 
-	if (align > SMP_CACHE_BYTES) {
-		printk(KERN_WARNING "%s: per-cpu alignment %li > %i\n",
-		       name, align, SMP_CACHE_BYTES);
-		align = SMP_CACHE_BYTES;
+	if (align > PAGE_SIZE) {
+		printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
+		       name, align, PAGE_SIZE);
+		align = PAGE_SIZE;
 	}
 
 	ptr = __per_cpu_start;
@@ -430,7 +430,7 @@ static int percpu_modinit(void)
 	pcpu_size = kmalloc(sizeof(pcpu_size[0]) * pcpu_num_allocated,
 			    GFP_KERNEL);
 	/* Static in-kernel percpu data (used). */
-	pcpu_size[0] = -ALIGN(__per_cpu_end-__per_cpu_start, SMP_CACHE_BYTES);
+	pcpu_size[0] = -ALIGN(__per_cpu_end-__per_cpu_start, PAGE_SIZE);
 	/* Free room. */
 	pcpu_size[1] = PERCPU_ENOUGH_ROOM + pcpu_size[0];
 	if (pcpu_size[1] < 0) {






* [PATCH 0/4] i386 GDT cleanups
  2007-03-21  6:10 [PATCH] Allow per-cpu variables to be page-aligned Rusty Russell
@ 2007-03-21  6:30 ` Rusty Russell
  2007-03-21  6:32   ` [PATCH 1/4] i386 GDT cleanups: Use per-cpu variables for GDT, PDA Rusty Russell
  2007-03-21  9:21 ` [PATCH] Allow per-cpu variables to be page-aligned Eric W. Biederman
  1 sibling, 1 reply; 19+ messages in thread
From: Rusty Russell @ 2007-03-21  6:30 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Ingo Molnar, Andi Kleen, lkml - Kernel Mailing List

After lots of good feedback and contributions from the last series, this
set of 4 simply cleans up GDT usage in i386.  Percpu->pda is not
included: it's really a separate problem (but made much simpler by these
patches).

Patches are:
no-gdt-pda-alloc.patch 
        - Simplify by using per-cpu vars for gdt & pda, not allocating.
        This patch has been seen here before, and Jeremy Fitzhardinge
        acked it.

direct-percpu-gdt.patch 
        - Simplify boot by switching straight from boot_gdt_table
        straight to per-cpu versions, rather than going to cpu_gdt_table
        then per-cpu gdt.  This is a new approach after Ingo cautioned
        about removing boot_gdt_table.

cleanup-cpuinits.patch 
        - Simple patch: we can now roll two identical functions together
        
cleanup-gdt-accessors.patch 
        - Remove a level of indirection

Cheers,
Rusty.




* [PATCH 1/4] i386 GDT cleanups: Use per-cpu variables for GDT, PDA
  2007-03-21  6:30 ` [PATCH 0/4] i386 GDT cleanups Rusty Russell
@ 2007-03-21  6:32   ` Rusty Russell
  2007-03-21  6:35     ` [PATCH 2/4] i386 GDT cleanups: Use per-cpu GDT immediately upon boot Rusty Russell
  0 siblings, 1 reply; 19+ messages in thread
From: Rusty Russell @ 2007-03-21  6:32 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Ingo Molnar, Andi Kleen, lkml - Kernel Mailing List

Allocating PDA and GDT at boot is a pain.  Using simple per-cpu
variables adds happiness (although we need the GDT page-aligned for
Xen, which we do in a followup patch).

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

---
 arch/i386/kernel/cpu/common.c        |   96 +++++------------------------------
 arch/i386/kernel/smpboot.c           |   21 -------
 arch/i386/mach-voyager/voyager_smp.c |   10 ---
 include/asm-i386/desc.h              |    1 
 include/asm-i386/pda.h               |    7 +-
 include/asm-i386/processor.h         |    2 
 6 files changed, 21 insertions(+), 116 deletions(-)

diff -r d9b1ba2049f8 arch/i386/kernel/cpu/common.c
--- a/arch/i386/kernel/cpu/common.c	Wed Mar 21 10:14:02 2007 +1100
+++ b/arch/i386/kernel/cpu/common.c	Wed Mar 21 14:21:43 2007 +1100
@@ -25,8 +25,10 @@ DEFINE_PER_CPU(struct Xgt_desc_struct, c
 DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
 EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr);
 
-struct i386_pda *_cpu_pda[NR_CPUS] __read_mostly;
-EXPORT_SYMBOL(_cpu_pda);
+DEFINE_PER_CPU(struct desc_struct, cpu_gdt[GDT_ENTRIES]);
+
+DEFINE_PER_CPU(struct i386_pda, _cpu_pda);
+EXPORT_PER_CPU_SYMBOL(_cpu_pda);
 
 static int cachesize_override __cpuinitdata = -1;
 static int disable_x86_fxsr __cpuinitdata;
@@ -609,52 +611,6 @@ struct pt_regs * __devinit idle_regs(str
 	return regs;
 }
 
-static __cpuinit int alloc_gdt(int cpu)
-{
-	struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
-	struct desc_struct *gdt;
-	struct i386_pda *pda;
-
-	gdt = (struct desc_struct *)cpu_gdt_descr->address;
-	pda = cpu_pda(cpu);
-
-	/*
-	 * This is a horrible hack to allocate the GDT.  The problem
-	 * is that cpu_init() is called really early for the boot CPU
-	 * (and hence needs bootmem) but much later for the secondary
-	 * CPUs, when bootmem will have gone away
-	 */
-	if (NODE_DATA(0)->bdata->node_bootmem_map) {
-		BUG_ON(gdt != NULL || pda != NULL);
-
-		gdt = alloc_bootmem_pages(PAGE_SIZE);
-		pda = alloc_bootmem(sizeof(*pda));
-		/* alloc_bootmem(_pages) panics on failure, so no check */
-
-		memset(gdt, 0, PAGE_SIZE);
-		memset(pda, 0, sizeof(*pda));
-	} else {
-		/* GDT and PDA might already have been allocated if
-		   this is a CPU hotplug re-insertion. */
-		if (gdt == NULL)
-			gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL);
-
-		if (pda == NULL)
-			pda = kmalloc_node(sizeof(*pda), GFP_KERNEL, cpu_to_node(cpu));
-
-		if (unlikely(!gdt || !pda)) {
-			free_pages((unsigned long)gdt, 0);
-			kfree(pda);
-			return 0;
-		}
-	}
-
- 	cpu_gdt_descr->address = (unsigned long)gdt;
-	cpu_pda(cpu) = pda;
-
-	return 1;
-}
-
 /* Initial PDA used by boot CPU */
 struct i386_pda boot_pda = {
 	._pda = &boot_pda,
@@ -670,31 +626,17 @@ static inline void set_kernel_fs(void)
 	asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_PDA) : "memory");
 }
 
-/* Initialize the CPU's GDT and PDA.  The boot CPU does this for
-   itself, but secondaries find this done for them. */
-__cpuinit int init_gdt(int cpu, struct task_struct *idle)
+/* Initialize the CPU's GDT and PDA.  This is either the boot CPU doing itself
+   (still using cpu_gdt_table), or a CPU doing it for a secondary which
+   will soon come up. */
+__cpuinit void init_gdt(int cpu, struct task_struct *idle)
 {
 	struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
-	struct desc_struct *gdt;
-	struct i386_pda *pda;
-
-	/* For non-boot CPUs, the GDT and PDA should already have been
-	   allocated. */
-	if (!alloc_gdt(cpu)) {
-		printk(KERN_CRIT "CPU%d failed to allocate GDT or PDA\n", cpu);
-		return 0;
-	}
-
-	gdt = (struct desc_struct *)cpu_gdt_descr->address;
-	pda = cpu_pda(cpu);
-
-	BUG_ON(gdt == NULL || pda == NULL);
-
-	/*
-	 * Initialize the per-CPU GDT with the boot GDT,
-	 * and set up the GDT descriptor:
-	 */
+	struct desc_struct *gdt = per_cpu(cpu_gdt, cpu);
+	struct i386_pda *pda = &per_cpu(_cpu_pda, cpu);
+
  	memcpy(gdt, cpu_gdt_table, GDT_SIZE);
+ 	cpu_gdt_descr->address = (unsigned long)gdt;
 	cpu_gdt_descr->size = GDT_SIZE - 1;
 
 	pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a,
@@ -706,17 +648,12 @@ __cpuinit int init_gdt(int cpu, struct t
 	pda->_pda = pda;
 	pda->cpu_number = cpu;
 	pda->pcurrent = idle;
-
-	return 1;
 }
 
 void __cpuinit cpu_set_gdt(int cpu)
 {
 	struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
 
-	/* Reinit these anyway, even if they've already been done (on
-	   the boot CPU, this will transition from the boot gdt+pda to
-	   the real ones). */
 	load_gdt(cpu_gdt_descr);
 	set_kernel_fs();
 }
@@ -804,13 +741,8 @@ void __cpuinit cpu_init(void)
 	struct task_struct *curr = current;
 
 	/* Set up the real GDT and PDA, so we can transition from the
-	   boot versions. */
-	if (!init_gdt(cpu, curr)) {
-		/* failed to allocate something; not much we can do... */
-		for (;;)
-			local_irq_enable();
-	}
-
+	   boot_gdt_table & boot_pda. */
+	init_gdt(cpu, curr);
 	cpu_set_gdt(cpu);
 	_cpu_init(cpu, curr);
 }
diff -r d9b1ba2049f8 arch/i386/kernel/smpboot.c
--- a/arch/i386/kernel/smpboot.c	Wed Mar 21 10:14:02 2007 +1100
+++ b/arch/i386/kernel/smpboot.c	Wed Mar 21 14:20:35 2007 +1100
@@ -808,13 +808,7 @@ static int __cpuinit do_boot_cpu(int api
 	if (IS_ERR(idle))
 		panic("failed fork for CPU %d", cpu);
 
-	/* Pre-allocate and initialize the CPU's GDT and PDA so it
-	   doesn't have to do any memory allocation during the
-	   delicate CPU-bringup phase. */
-	if (!init_gdt(cpu, idle)) {
-		printk(KERN_INFO "Couldn't allocate GDT/PDA for CPU %d\n", cpu);
-		return -1;	/* ? */
-	}
+	init_gdt(cpu, idle);
 
 	idle->thread.eip = (unsigned long) start_secondary;
 	/* start_eip had better be page-aligned! */
@@ -940,24 +934,11 @@ static int __cpuinit __smp_prepare_cpu(i
 	DECLARE_COMPLETION_ONSTACK(done);
 	struct warm_boot_cpu_info info;
 	int	apicid, ret;
-	struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
 
 	apicid = x86_cpu_to_apicid[cpu];
 	if (apicid == BAD_APICID) {
 		ret = -ENODEV;
 		goto exit;
-	}
-
-	/*
-	 * the CPU isn't initialized at boot time, allocate gdt table here.
-	 * cpu_init will initialize it
-	 */
-	if (!cpu_gdt_descr->address) {
-		cpu_gdt_descr->address = get_zeroed_page(GFP_KERNEL);
-		if (!cpu_gdt_descr->address)
-			printk(KERN_CRIT "CPU%d failed to allocate GDT\n", cpu);
-			ret = -ENOMEM;
-			goto exit;
 	}
 
 	info.complete = &done;
diff -r d9b1ba2049f8 arch/i386/mach-voyager/voyager_smp.c
--- a/arch/i386/mach-voyager/voyager_smp.c	Wed Mar 21 10:14:02 2007 +1100
+++ b/arch/i386/mach-voyager/voyager_smp.c	Wed Mar 21 14:20:35 2007 +1100
@@ -580,15 +580,7 @@ do_boot_cpu(__u8 cpu)
 	/* init_tasks (in sched.c) is indexed logically */
 	stack_start.esp = (void *) idle->thread.esp;
 
-	/* Pre-allocate and initialize the CPU's GDT and PDA so it
-	   doesn't have to do any memory allocation during the
-	   delicate CPU-bringup phase. */
-	if (!init_gdt(cpu, idle)) {
-		printk(KERN_INFO "Couldn't allocate GDT/PDA for CPU %d\n", cpu);
-		cpucount--;
-		return;
-	}
-
+	init_gdt(cpu, idle);
 	irq_ctx_init(cpu);
 
 	/* Note: Don't modify initial ss override */
diff -r d9b1ba2049f8 include/asm-i386/desc.h
--- a/include/asm-i386/desc.h	Wed Mar 21 10:14:02 2007 +1100
+++ b/include/asm-i386/desc.h	Wed Mar 21 14:20:35 2007 +1100
@@ -22,6 +22,7 @@ struct Xgt_desc_struct {
 
 extern struct Xgt_desc_struct idt_descr;
 DECLARE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
+DECLARE_PER_CPU(struct desc_struct, cpu_gdt[GDT_ENTRIES]);
 extern struct Xgt_desc_struct early_gdt_descr;
 
 static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
diff -r d9b1ba2049f8 include/asm-i386/pda.h
--- a/include/asm-i386/pda.h	Wed Mar 21 10:14:02 2007 +1100
+++ b/include/asm-i386/pda.h	Wed Mar 21 14:22:17 2007 +1100
@@ -8,6 +8,7 @@
 
 #include <linux/stddef.h>
 #include <linux/types.h>
+#include <asm/percpu.h>
 
 struct i386_pda
 {
@@ -18,10 +19,8 @@ struct i386_pda
 	struct pt_regs *irq_regs;
 };
 
-extern struct i386_pda *_cpu_pda[];
-
-#define cpu_pda(i)	(_cpu_pda[i])
-
+DECLARE_PER_CPU(struct i386_pda, _cpu_pda);
+#define cpu_pda(i)	(&per_cpu(_cpu_pda, (i)))
 #define pda_offset(field) offsetof(struct i386_pda, field)
 
 extern void __bad_pda_field(void);
diff -r d9b1ba2049f8 include/asm-i386/processor.h
--- a/include/asm-i386/processor.h	Wed Mar 21 10:14:02 2007 +1100
+++ b/include/asm-i386/processor.h	Wed Mar 21 14:20:35 2007 +1100
@@ -743,7 +743,7 @@ extern void enable_sep_cpu(void);
 extern void enable_sep_cpu(void);
 extern int sysenter_setup(void);
 
-extern int init_gdt(int cpu, struct task_struct *idle);
+extern void init_gdt(int cpu, struct task_struct *idle);
 extern void cpu_set_gdt(int);
 extern void secondary_cpu_init(void);
 




* [PATCH 2/4] i386 GDT cleanups: Use per-cpu GDT immediately upon boot
  2007-03-21  6:32   ` [PATCH 1/4] i386 GDT cleanups: Use per-cpu variables for GDT, PDA Rusty Russell
@ 2007-03-21  6:35     ` Rusty Russell
  2007-03-21  6:36       ` [PATCH 3/4] i386 GDT cleanups: clean up cpu_init() Rusty Russell
  2007-03-21  9:31       ` [PATCH 2/4] i386 GDT cleanups: Use per-cpu GDT immediately upon boot Eric W. Biederman
  0 siblings, 2 replies; 19+ messages in thread
From: Rusty Russell @ 2007-03-21  6:35 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Ingo Molnar, Andi Kleen, lkml - Kernel Mailing List

Now we are no longer dynamically allocating the GDT, we don't need the
"cpu_gdt_table" at all: we can switch straight from "boot_gdt_table"
to the per-cpu GDT.  This means initializing the cpu_gdt array in C.

The boot CPU uses the per-cpu var directly, then in smp_prepare_cpus()
it switches to the per-cpu copy just allocated.  For secondary CPUs,
the early_gdt_descr is set to point directly to their per-cpu copy.

For UP the code is very simple: it keeps using the "per-cpu" GDT as
per SMP, but we never have to move.
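
In outline, condensed from the hunks below (nothing new here):

	/* Boot CPU, in smp_prepare_boot_cpu(): */
	init_gdt(cpu, current);
	switch_to_new_gdt();	/* load_gdt() + reload %fs with __KERNEL_PDA */

	/* Secondary CPUs, in do_boot_cpu(), before kicking the CPU: */
	init_gdt(cpu, idle);
	early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
	start_pda = cpu_pda(cpu);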

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 arch/i386/kernel/cpu/common.c        |   74 ++++++++++++----------------------
 arch/i386/kernel/head.S              |   65 -----------------------------
 arch/i386/kernel/smpboot.c           |   59 ++++++++++++++++++++++-----
 arch/i386/mach-voyager/voyager_smp.c |    6 --
 include/asm-i386/desc.h              |    2
 include/asm-i386/processor.h         |    1
 6 files changed, 77 insertions(+), 130 deletions(-)

diff -r a8a4e2f9da08 arch/i386/kernel/cpu/common.c
--- a/arch/i386/kernel/cpu/common.c	Wed Mar 21 15:20:48 2007 +1100
+++ b/arch/i386/kernel/cpu/common.c	Wed Mar 21 15:31:49 2007 +1100
@@ -25,7 +25,33 @@ DEFINE_PER_CPU(struct Xgt_desc_struct, c
 DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
 EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr);
 
-DEFINE_PER_CPU(struct desc_struct, cpu_gdt[GDT_ENTRIES]);
+DEFINE_PER_CPU(struct desc_struct, cpu_gdt[GDT_ENTRIES]) = {
+	[GDT_ENTRY_KERNEL_CS] = { 0x0000ffff, 0x00cf9a00 },
+	[GDT_ENTRY_KERNEL_DS] = { 0x0000ffff, 0x00cf9200 },
+	[GDT_ENTRY_DEFAULT_USER_CS] = { 0x0000ffff, 0x00cffa00 },
+	[GDT_ENTRY_DEFAULT_USER_DS] = { 0x0000ffff, 0x00cff200 },
+	/*
+	 * The code segments and data segments have fixed 64k limits,
+	 * They code segments and data segments have fixed 64k limits,
+	 * the transfer segment sizes are set at run time.
+	 */
+	[GDT_ENTRY_PNPBIOS_CS32] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
+	[GDT_ENTRY_PNPBIOS_CS16] = { 0x0000ffff, 0x00009a00 },/* 16-bit code */
+	[GDT_ENTRY_PNPBIOS_DS] = { 0x0000ffff, 0x00009200 }, /* 16-bit data */
+	[GDT_ENTRY_PNPBIOS_TS1] = { 0x00000000, 0x00009200 },/* 16-bit data */
+	[GDT_ENTRY_PNPBIOS_TS2] = { 0x00000000, 0x00009200 },/* 16-bit data */
+	/*
+	 * The APM segments have byte granularity and their bases
+	 * are set at run time.  All have 64k limits.
+	 */
+	[GDT_ENTRY_APMBIOS_BASE] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
+	/* 16-bit code */
+	[GDT_ENTRY_APMBIOS_BASE+1] = { 0x0000ffff, 0x00009a00 },
+	[GDT_ENTRY_APMBIOS_BASE+2] = { 0x0000ffff, 0x00409200 }, /* data */
+
+	[GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00c09200 },
+	[GDT_ENTRY_PDA] = { 0x00000000, 0x00c09200 }, /* set in setup_pda */
+};
 
 DEFINE_PER_CPU(struct i386_pda, _cpu_pda);
 EXPORT_PER_CPU_SYMBOL(_cpu_pda);
@@ -618,46 +644,6 @@ struct i386_pda boot_pda = {
 	.pcurrent = &init_task,
 };
 
-static inline void set_kernel_fs(void)
-{
-	/* Set %fs for this CPU's PDA.  Memory clobber is to create a
-	   barrier with respect to any PDA operations, so the compiler
-	   doesn't move any before here. */
-	asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_PDA) : "memory");
-}
-
-/* Initialize the CPU's GDT and PDA.  This is either the boot CPU doing itself
-   (still using cpu_gdt_table), or a CPU doing it for a secondary which
-   will soon come up. */
-__cpuinit void init_gdt(int cpu, struct task_struct *idle)
-{
-	struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
-	struct desc_struct *gdt = per_cpu(cpu_gdt, cpu);
-	struct i386_pda *pda = &per_cpu(_cpu_pda, cpu);
-
- 	memcpy(gdt, cpu_gdt_table, GDT_SIZE);
- 	cpu_gdt_descr->address = (unsigned long)gdt;
-	cpu_gdt_descr->size = GDT_SIZE - 1;
-
-	pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a,
-			(u32 *)&gdt[GDT_ENTRY_PDA].b,
-			(unsigned long)pda, sizeof(*pda) - 1,
-			0x80 | DESCTYPE_S | 0x2, 0); /* present read-write data segment */
-
-	memset(pda, 0, sizeof(*pda));
-	pda->_pda = pda;
-	pda->cpu_number = cpu;
-	pda->pcurrent = idle;
-}
-
-void __cpuinit cpu_set_gdt(int cpu)
-{
-	struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
-
-	load_gdt(cpu_gdt_descr);
-	set_kernel_fs();
-}
-
 /* Common CPU init for both boot and secondary CPUs */
 static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
 {
@@ -740,10 +726,6 @@ void __cpuinit cpu_init(void)
 	int cpu = smp_processor_id();
 	struct task_struct *curr = current;
 
-	/* Set up the real GDT and PDA, so we can transition from the
-	   boot_gdt_table & boot_pda. */
-	init_gdt(cpu, curr);
-	cpu_set_gdt(cpu);
 	_cpu_init(cpu, curr);
 }
 
diff -r a8a4e2f9da08 arch/i386/kernel/head.S
--- a/arch/i386/kernel/head.S	Wed Mar 21 15:20:48 2007 +1100
+++ b/arch/i386/kernel/head.S	Wed Mar 21 15:32:38 2007 +1100
@@ -599,67 +599,10 @@ idt_descr:
 	.word 0				# 32 bit align gdt_desc.address
 ENTRY(early_gdt_descr)
 	.word GDT_ENTRIES*8-1
-	.long cpu_gdt_table
-
-/*
- * The boot_gdt_table must mirror the equivalent in setup.S and is
- * used only for booting.
- */
+	.long per_cpu__cpu_gdt		/* Overwritten for secondary CPUs */
+
 	.align L1_CACHE_BYTES
 ENTRY(boot_gdt_table)
 	.fill GDT_ENTRY_BOOT_CS,8,0
 	.quad 0x00cf9a000000ffff	/* kernel 4GB code at 0x00000000 */
 	.quad 0x00cf92000000ffff	/* kernel 4GB data at 0x00000000 */
-
-/*
- * The Global Descriptor Table contains 32 quadwords, per-CPU.
- */
-	.align L1_CACHE_BYTES
-ENTRY(cpu_gdt_table)
-	.quad 0x0000000000000000	/* NULL descriptor */
-	.quad 0x0000000000000000	/* 0x0b reserved */
-	.quad 0x0000000000000000	/* 0x13 reserved */
-	.quad 0x0000000000000000	/* 0x1b reserved */
-	.quad 0x0000000000000000	/* 0x20 unused */
-	.quad 0x0000000000000000	/* 0x28 unused */
-	.quad 0x0000000000000000	/* 0x33 TLS entry 1 */
-	.quad 0x0000000000000000	/* 0x3b TLS entry 2 */
-	.quad 0x0000000000000000	/* 0x43 TLS entry 3 */
-	.quad 0x0000000000000000	/* 0x4b reserved */
-	.quad 0x0000000000000000	/* 0x53 reserved */
-	.quad 0x0000000000000000	/* 0x5b reserved */
-
-	.quad 0x00cf9a000000ffff	/* 0x60 kernel 4GB code at 0x00000000 */
-	.quad 0x00cf92000000ffff	/* 0x68 kernel 4GB data at 0x00000000 */
-	.quad 0x00cffa000000ffff	/* 0x73 user 4GB code at 0x00000000 */
-	.quad 0x00cff2000000ffff	/* 0x7b user 4GB data at 0x00000000 */
-
-	.quad 0x0000000000000000	/* 0x80 TSS descriptor */
-	.quad 0x0000000000000000	/* 0x88 LDT descriptor */
-
-	/*
-	 * Segments used for calling PnP BIOS have byte granularity.
-	 * The code segments and data segments have fixed 64k limits,
-	 * the transfer segment sizes are set at run time.
-	 */
-	.quad 0x00409a000000ffff	/* 0x90 32-bit code */
-	.quad 0x00009a000000ffff	/* 0x98 16-bit code */
-	.quad 0x000092000000ffff	/* 0xa0 16-bit data */
-	.quad 0x0000920000000000	/* 0xa8 16-bit data */
-	.quad 0x0000920000000000	/* 0xb0 16-bit data */
-
-	/*
-	 * The APM segments have byte granularity and their bases
-	 * are set at run time.  All have 64k limits.
-	 */
-	.quad 0x00409a000000ffff	/* 0xb8 APM CS    code */
-	.quad 0x00009a000000ffff	/* 0xc0 APM CS 16 code (16 bit) */
-	.quad 0x004092000000ffff	/* 0xc8 APM DS    data */
-
-	.quad 0x00c0920000000000	/* 0xd0 - ESPFIX SS */
-	.quad 0x00cf92000000ffff	/* 0xd8 - PDA */
-	.quad 0x0000000000000000	/* 0xe0 - unused */
-	.quad 0x0000000000000000	/* 0xe8 - unused */
-	.quad 0x0000000000000000	/* 0xf0 - unused */
-	.quad 0x0000000000000000	/* 0xf8 - GDT entry 31: double-fault TSS */
-
diff -r a8a4e2f9da08 arch/i386/kernel/smpboot.c
--- a/arch/i386/kernel/smpboot.c	Wed Mar 21 15:20:48 2007 +1100
+++ b/arch/i386/kernel/smpboot.c	Wed Mar 21 15:54:04 2007 +1100
@@ -440,12 +440,6 @@ void __devinit initialize_secondary(void
 void __devinit initialize_secondary(void)
 {
 	/*
-	 * switch to the per CPU GDT we already set up
-	 * in do_boot_cpu()
-	 */
-	cpu_set_gdt(current_thread_info()->cpu);
-
-	/*
 	 * We don't actually need to load the full TSS,
 	 * basically just the stack pointer and the eip.
 	 */
@@ -787,6 +781,32 @@ static inline struct task_struct * alloc
 #define alloc_idle_task(cpu) fork_idle(cpu)
 #endif
 
+/* Initialize the CPU's GDT.  This is either the boot CPU doing itself
+   (still using the master per-cpu area), or a CPU doing it for a
+   secondary which will soon come up. */
+static __cpuinit void init_gdt(int cpu, struct task_struct *idle)
+{
+	struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
+	struct desc_struct *gdt = per_cpu(cpu_gdt, cpu);
+	struct i386_pda *pda = &per_cpu(_cpu_pda, cpu);
+
+ 	cpu_gdt_descr->address = (unsigned long)gdt;
+	cpu_gdt_descr->size = GDT_SIZE - 1;
+
+	pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a,
+			(u32 *)&gdt[GDT_ENTRY_PDA].b,
+			(unsigned long)pda, sizeof(*pda) - 1,
+			0x80 | DESCTYPE_S | 0x2, 0); /* present read-write data segment */
+
+	memset(pda, 0, sizeof(*pda));
+	pda->_pda = pda;
+	pda->cpu_number = cpu;
+	pda->pcurrent = idle;
+}
+
+/* Defined in head.S */
+extern struct Xgt_desc_struct early_gdt_descr;
+
 static int __cpuinit do_boot_cpu(int apicid, int cpu)
 /*
  * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
@@ -809,6 +829,8 @@ static int __cpuinit do_boot_cpu(int api
 		panic("failed fork for CPU %d", cpu);
 
 	init_gdt(cpu, idle);
+	early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
+	start_pda = cpu_pda(cpu);
 
 	idle->thread.eip = (unsigned long) start_secondary;
 	/* start_eip had better be page-aligned! */
@@ -1161,13 +1183,26 @@ void __init smp_prepare_cpus(unsigned in
 	smp_boot_cpus(max_cpus);
 }
 
-void __devinit smp_prepare_boot_cpu(void)
-{
-	cpu_set(smp_processor_id(), cpu_online_map);
-	cpu_set(smp_processor_id(), cpu_callout_map);
-	cpu_set(smp_processor_id(), cpu_present_map);
-	cpu_set(smp_processor_id(), cpu_possible_map);
-	per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+/* Current gdt points %fs at the "master" per-cpu area: after this,
+ * it's on the real one. */
+static inline void switch_to_new_gdt(void)
+{
+	load_gdt(&per_cpu(cpu_gdt_descr, smp_processor_id()));
+	asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_PDA) : "memory");
+}
+
+void __init smp_prepare_boot_cpu(void)
+{
+	unsigned int cpu = smp_processor_id();
+
+	init_gdt(cpu, current);
+	switch_to_new_gdt();
+
+	cpu_set(cpu, cpu_online_map);
+	cpu_set(cpu, cpu_callout_map);
+	cpu_set(cpu, cpu_present_map);
+	cpu_set(cpu, cpu_possible_map);
+	__get_cpu_var(cpu_state) = CPU_ONLINE;
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
diff -r a8a4e2f9da08 arch/i386/lguest/lguest.c
--- a/arch/i386/lguest/lguest.c	Wed Mar 21 15:20:48 2007 +1100
+++ b/arch/i386/lguest/lguest.c	Wed Mar 21 15:48:20 2007 +1100
@@ -449,6 +449,9 @@ static void lguest_power_off(void)
 	hcall(LHCALL_CRASH, __pa("Power down"), 0, 0);
 }
 
+/* From head.S */
+extern void setup_pda(void);
+
 static __attribute_used__ __init void lguest_init(void)
 {
 	paravirt_ops.name = "lguest";
@@ -510,10 +513,7 @@ static __attribute_used__ __init void lg
 	init_pg_tables_end = __pa(pg0);
 
 	/* set up PDA descriptor */
-	pack_descriptor((u32 *)&cpu_gdt_table[GDT_ENTRY_PDA].a,
-			(u32 *)&cpu_gdt_table[GDT_ENTRY_PDA].b,
-			(unsigned)&boot_pda, sizeof(boot_pda)-1,
-			0x80 | DESCTYPE_S | 0x02, 0);
+	setup_pda();
 	load_gdt(&early_gdt_descr);
 	asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_PDA) : "memory");
 
diff -r a8a4e2f9da08 arch/i386/mach-voyager/voyager_smp.c
--- a/arch/i386/mach-voyager/voyager_smp.c	Wed Mar 21 15:20:48 2007 +1100
+++ b/arch/i386/mach-voyager/voyager_smp.c	Wed Mar 21 15:31:49 2007 +1100
@@ -763,12 +763,6 @@ initialize_secondary(void)
 	// AC kernels only
 	set_current(hard_get_current());
 #endif
-
-	/*
-	 * switch to the per CPU GDT we already set up
-	 * in do_boot_cpu()
-	 */
-	cpu_set_gdt(current_thread_info()->cpu);
 
 	/*
 	 * We don't actually need to load the full TSS,
diff -r a8a4e2f9da08 include/asm-i386/desc.h
--- a/include/asm-i386/desc.h	Wed Mar 21 15:20:48 2007 +1100
+++ b/include/asm-i386/desc.h	Wed Mar 21 15:31:49 2007 +1100
@@ -11,8 +11,6 @@
 #include <linux/percpu.h>
 
 #include <asm/mmu.h>
-
-extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
 
 struct Xgt_desc_struct {
 	unsigned short size;
diff -r a8a4e2f9da08 include/asm-i386/processor.h
--- a/include/asm-i386/processor.h	Wed Mar 21 15:20:48 2007 +1100
+++ b/include/asm-i386/processor.h	Wed Mar 21 15:31:49 2007 +1100
@@ -743,7 +743,6 @@ extern void enable_sep_cpu(void);
 extern void enable_sep_cpu(void);
 extern int sysenter_setup(void);
 
-extern void init_gdt(int cpu, struct task_struct *idle);
 extern void cpu_set_gdt(int);
 extern void secondary_cpu_init(void);
 




* [PATCH 3/4] i386 GDT cleanups: clean up cpu_init()
  2007-03-21  6:35     ` [PATCH 2/4] i386 GDT cleanups: Use per-cpu GDT immediately upon boot Rusty Russell
@ 2007-03-21  6:36       ` Rusty Russell
  2007-03-21  6:53         ` [PATCH 4/4] i386 GDT cleanups: cleanup GDT Access Rusty Russell
  2007-03-21  9:31       ` [PATCH 2/4] i386 GDT cleanups: Use per-cpu GDT immediately upon boot Eric W. Biederman
  1 sibling, 1 reply; 19+ messages in thread
From: Rusty Russell @ 2007-03-21  6:36 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Ingo Molnar, Andi Kleen, lkml - Kernel Mailing List

We now have cpu_init() and secondary_cpu_init() doing nothing but
calling _cpu_init() with the same arguments.  Rename _cpu_init() to
cpu_init() and use it as a replacement for secondary_cpu_init().

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

---
 arch/i386/kernel/cpu/common.c |   36 ++++++++++--------------------------
 arch/i386/kernel/smpboot.c    |   10 +++++-----
 include/asm-i386/processor.h  |    3 ++-
 3 files changed, 17 insertions(+), 32 deletions(-)

diff -r 18694148c020 arch/i386/kernel/cpu/common.c
--- a/arch/i386/kernel/cpu/common.c	Wed Mar 21 17:15:19 2007 +1100
+++ b/arch/i386/kernel/cpu/common.c	Wed Mar 21 17:18:37 2007 +1100
@@ -644,9 +644,16 @@ struct i386_pda boot_pda = {
 	.pcurrent = &init_task,
 };
 
-/* Common CPU init for both boot and secondary CPUs */
-static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
-{
+/*
+ * cpu_init() initializes state that is per-CPU. Some data is already
+ * initialized (naturally) in the bootstrap process, such as the GDT
+ * and IDT. We reload them nevertheless, this function acts as a
+ * 'CPU state barrier', nothing should get across.
+ */
+void __cpuinit cpu_init(void)
+{
+	int cpu = smp_processor_id();
+	struct task_struct *curr = current;
 	struct tss_struct * t = &per_cpu(init_tss, cpu);
 	struct thread_struct *thread = &curr->thread;
 
@@ -706,29 +713,6 @@ static void __cpuinit _cpu_init(int cpu,
 	mxcsr_feature_mask_init();
 }
 
-/* Entrypoint to initialize secondary CPU */
-void __cpuinit secondary_cpu_init(void)
-{
-	int cpu = smp_processor_id();
-	struct task_struct *curr = current;
-
-	_cpu_init(cpu, curr);
-}
-
-/*
- * cpu_init() initializes state that is per-CPU. Some data is already
- * initialized (naturally) in the bootstrap process, such as the GDT
- * and IDT. We reload them nevertheless, this function acts as a
- * 'CPU state barrier', nothing should get across.
- */
-void __cpuinit cpu_init(void)
-{
-	int cpu = smp_processor_id();
-	struct task_struct *curr = current;
-
-	_cpu_init(cpu, curr);
-}
-
 #ifdef CONFIG_HOTPLUG_CPU
 void __cpuinit cpu_uninit(void)
 {
diff -r 18694148c020 arch/i386/kernel/smpboot.c
--- a/arch/i386/kernel/smpboot.c	Wed Mar 21 17:15:19 2007 +1100
+++ b/arch/i386/kernel/smpboot.c	Wed Mar 21 17:18:37 2007 +1100
@@ -378,14 +378,14 @@ static void __cpuinit start_secondary(vo
 static void __cpuinit start_secondary(void *unused)
 {
 	/*
-	 * Don't put *anything* before secondary_cpu_init(), SMP
-	 * booting is too fragile that we want to limit the
-	 * things done here to the most necessary things.
+	 * Don't put *anything* before cpu_init(), SMP booting is too
+	 * fragile that we want to limit the things done here to the
+	 * most necessary things.
 	 */
 #ifdef CONFIG_VMI
 	vmi_bringup();
 #endif
-	secondary_cpu_init();
+	cpu_init();
 	preempt_disable();
 	smp_callin();
 	while (!cpu_isset(smp_processor_id(), smp_commenced_mask))
diff -r 18694148c020 include/asm-i386/processor.h
--- a/include/asm-i386/processor.h	Wed Mar 21 17:15:19 2007 +1100
+++ b/include/asm-i386/processor.h	Wed Mar 21 17:18:37 2007 +1100
@@ -744,6 +744,6 @@ extern int sysenter_setup(void);
 extern int sysenter_setup(void);
 
 extern void cpu_set_gdt(int);
-extern void secondary_cpu_init(void);
+extern void cpu_init(void);
 
 #endif /* __ASM_I386_PROCESSOR_H */




* [PATCH 4/4] i386 GDT cleanups: cleanup GDT Access
  2007-03-21  6:36       ` [PATCH 3/4] i386 GDT cleanups: clean up cpu_init() Rusty Russell
@ 2007-03-21  6:53         ` Rusty Russell
  0 siblings, 0 replies; 19+ messages in thread
From: Rusty Russell @ 2007-03-21  6:53 UTC (permalink / raw)
  To: Andrew Morton; +Cc: Ingo Molnar, Andi Kleen, lkml - Kernel Mailing List

Now we have an explicit per-cpu GDT variable, we don't need to keep
the descriptors around to use them to find the GDT: expose cpu_gdt
directly.

We could go further and make load_gdt() pack the descriptor for us, or
even assume it means "load the current cpu's GDT" which is what it
always does.
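
As a rough sketch of that "go further" idea (not implemented in this
series; "load_percpu_gdt" is an invented name), the open-coded pattern
in the efi.c and smpboot.c hunks below could collapse into one helper:

	static inline void load_percpu_gdt(unsigned int cpu)
	{
		struct Xgt_desc_struct gdt_descr;

		/* Build the descriptor from the per-cpu GDT and load it. */
		gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
		gdt_descr.size = GDT_SIZE - 1;
		load_gdt(&gdt_descr);
	}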

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 arch/i386/kernel/cpu/common.c |    4 +---
 arch/i386/kernel/efi.c        |   18 +++++++++---------
 arch/i386/kernel/entry.S      |    3 +--
 arch/i386/kernel/smpboot.c    |   12 ++++++------
 arch/i386/kernel/traps.c      |    4 +---
 include/asm-i386/desc.h       |   15 ++++++---------
 6 files changed, 24 insertions(+), 32 deletions(-)

diff -r 0714eeaace72 arch/i386/kernel/cpu/common.c
--- a/arch/i386/kernel/cpu/common.c	Wed Mar 21 15:55:51 2007 +1100
+++ b/arch/i386/kernel/cpu/common.c	Wed Mar 21 16:02:02 2007 +1100
@@ -22,9 +22,6 @@
 
 #include "cpu.h"
 
-DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
-EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr);
-
 DEFINE_PER_CPU(struct desc_struct, cpu_gdt[GDT_ENTRIES]) = {
 	[GDT_ENTRY_KERNEL_CS] = { 0x0000ffff, 0x00cf9a00 },
 	[GDT_ENTRY_KERNEL_DS] = { 0x0000ffff, 0x00cf9200 },
@@ -52,6 +49,7 @@ DEFINE_PER_CPU(struct desc_struct, cpu_g
 	[GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00c09200 },
 	[GDT_ENTRY_PDA] = { 0x00000000, 0x00c09200 }, /* set in setup_pda */
 };
+EXPORT_PER_CPU_SYMBOL_GPL(cpu_gdt);
 
 DEFINE_PER_CPU(struct i386_pda, _cpu_pda);
 EXPORT_PER_CPU_SYMBOL(_cpu_pda);
diff -r 0714eeaace72 arch/i386/kernel/efi.c
--- a/arch/i386/kernel/efi.c	Wed Mar 21 15:55:51 2007 +1100
+++ b/arch/i386/kernel/efi.c	Wed Mar 21 15:56:22 2007 +1100
@@ -69,12 +69,10 @@ static void efi_call_phys_prelog(void) _
 {
 	unsigned long cr4;
 	unsigned long temp;
-	struct Xgt_desc_struct *cpu_gdt_descr;
+	struct Xgt_desc_struct gdt_descr;
 
 	spin_lock(&efi_rt_lock);
 	local_irq_save(efi_rt_eflags);
-
-	cpu_gdt_descr = &per_cpu(cpu_gdt_descr, 0);
 
 	/*
 	 * If I don't have PSE, I should just duplicate two entries in page
@@ -105,17 +103,19 @@ static void efi_call_phys_prelog(void) _
 	 */
 	local_flush_tlb();
 
-	cpu_gdt_descr->address = __pa(cpu_gdt_descr->address);
-	load_gdt(cpu_gdt_descr);
+	gdt_descr.address = __pa(get_cpu_gdt_table(0));
+	gdt_descr.size = GDT_SIZE - 1;
+	load_gdt(&gdt_descr);
 }
 
 static void efi_call_phys_epilog(void) __releases(efi_rt_lock)
 {
 	unsigned long cr4;
-	struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, 0);
-
-	cpu_gdt_descr->address = (unsigned long)__va(cpu_gdt_descr->address);
-	load_gdt(cpu_gdt_descr);
+	struct Xgt_desc_struct gdt_descr;
+
+	gdt_descr.address = (unsigned long)get_cpu_gdt_table(0);
+	gdt_descr.size = GDT_SIZE - 1;
+	load_gdt(&gdt_descr);
 
 	cr4 = read_cr4();
 
diff -r 0714eeaace72 arch/i386/kernel/entry.S
--- a/arch/i386/kernel/entry.S	Wed Mar 21 15:55:51 2007 +1100
+++ b/arch/i386/kernel/entry.S	Wed Mar 21 16:02:02 2007 +1100
@@ -558,8 +558,7 @@ END(syscall_badsys)
 #define FIXUP_ESPFIX_STACK \
 	/* since we are on a wrong stack, we cant make it a C code :( */ \
 	movl %fs:PDA_cpu, %ebx; \
-	PER_CPU(cpu_gdt_descr, %ebx); \
-	movl GDS_address(%ebx), %ebx; \
+	PER_CPU(cpu_gdt, %ebx); \
 	GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \
 	addl %esp, %eax; \
 	pushl $__KERNEL_DS; \
diff -r 0714eeaace72 arch/i386/kernel/smpboot.c
--- a/arch/i386/kernel/smpboot.c	Wed Mar 21 15:55:51 2007 +1100
+++ b/arch/i386/kernel/smpboot.c	Wed Mar 21 16:02:02 2007 +1100
@@ -786,12 +786,8 @@ static inline struct task_struct * alloc
    secondary which will soon come up. */
 static __cpuinit void init_gdt(int cpu, struct task_struct *idle)
 {
-	struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
-	struct desc_struct *gdt = per_cpu(cpu_gdt, cpu);
+	struct desc_struct *gdt = get_cpu_gdt_table(cpu);
 	struct i386_pda *pda = &per_cpu(_cpu_pda, cpu);
-
- 	cpu_gdt_descr->address = (unsigned long)gdt;
-	cpu_gdt_descr->size = GDT_SIZE - 1;
 
 	pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a,
 			(u32 *)&gdt[GDT_ENTRY_PDA].b,
@@ -1187,7 +1183,11 @@ void __init smp_prepare_cpus(unsigned in
  * it's on the real one. */
 static inline void switch_to_new_gdt(void)
 {
-	load_gdt(&per_cpu(cpu_gdt_descr, smp_processor_id()));
+	struct Xgt_desc_struct gdt_descr;
+
+	gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
+	gdt_descr.size = GDT_SIZE - 1;
+	load_gdt(&gdt_descr);
 	asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_PDA) : "memory");
 }
 
diff -r 0714eeaace72 arch/i386/kernel/traps.c
--- a/arch/i386/kernel/traps.c	Wed Mar 21 15:55:51 2007 +1100
+++ b/arch/i386/kernel/traps.c	Wed Mar 21 16:02:02 2007 +1100
@@ -1037,9 +1037,7 @@ fastcall unsigned long patch_espfix_desc
 fastcall unsigned long patch_espfix_desc(unsigned long uesp,
 					  unsigned long kesp)
 {
-	int cpu = smp_processor_id();
-	struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
-	struct desc_struct *gdt = (struct desc_struct *)cpu_gdt_descr->address;
+	struct desc_struct *gdt = __get_cpu_var(cpu_gdt);
 	unsigned long base = (kesp - uesp) & -THREAD_SIZE;
 	unsigned long new_kesp = kesp - base;
 	unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
diff -r 0714eeaace72 arch/i386/lguest/core.c
--- a/arch/i386/lguest/core.c	Wed Mar 21 15:55:51 2007 +1100
+++ b/arch/i386/lguest/core.c	Wed Mar 21 16:11:14 2007 +1100
@@ -96,7 +96,8 @@ static __init int map_hypervisor(void)
 		struct lguest_ro_state *state = &pages->state;
 
 		/* These fields are static: rest done in copy_in_guest_info */
-		state->host_gdt_desc = per_cpu(cpu_gdt_descr, i);
+		state->host_gdt_desc.size = GDT_SIZE-1;
+		state->host_gdt_desc.address = (long)get_cpu_gdt_table(i);
 		store_idt(&state->host_idt_desc);
 		state->guest_idt_desc.size = sizeof(state->guest_idt)-1;
 		state->guest_idt_desc.address = (long)&state->guest_idt;
diff -r 0714eeaace72 arch/i386/lguest/lguest.c
--- a/arch/i386/lguest/lguest.c	Wed Mar 21 15:55:51 2007 +1100
+++ b/arch/i386/lguest/lguest.c	Wed Mar 21 16:03:12 2007 +1100
@@ -451,6 +451,7 @@ static void lguest_power_off(void)
 
 /* From head.S */
 extern void setup_pda(void);
+extern struct Xgt_desc_struct early_gdt_descr;
 
 static __attribute_used__ __init void lguest_init(void)
 {
diff -r 0714eeaace72 include/asm-i386/desc.h
--- a/include/asm-i386/desc.h	Wed Mar 21 15:55:51 2007 +1100
+++ b/include/asm-i386/desc.h	Wed Mar 21 16:02:02 2007 +1100
@@ -18,16 +18,13 @@ struct Xgt_desc_struct {
 	unsigned short pad;
 } __attribute__ ((packed));
 
+DECLARE_PER_CPU(struct desc_struct, cpu_gdt[GDT_ENTRIES]);
+static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
+{
+	return per_cpu(cpu_gdt, cpu);
+}
+
 extern struct Xgt_desc_struct idt_descr;
-DECLARE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
-DECLARE_PER_CPU(struct desc_struct, cpu_gdt[GDT_ENTRIES]);
-extern struct Xgt_desc_struct early_gdt_descr;
-
-static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
-{
-	return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address;
-}
-
 extern struct desc_struct idt_table[];
 extern void set_intr_gate(unsigned int irq, void * addr);
 




* Re: [PATCH] Allow per-cpu variables to be page-aligned
  2007-03-21  6:10 [PATCH] Allow per-cpu variables to be page-aligned Rusty Russell
  2007-03-21  6:30 ` [PATCH 0/4] i386 GDT cleanups Rusty Russell
@ 2007-03-21  9:21 ` Eric W. Biederman
  2007-03-21 11:44   ` Rusty Russell
  1 sibling, 1 reply; 19+ messages in thread
From: Eric W. Biederman @ 2007-03-21  9:21 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Andrew Morton, Ingo Molnar, Andi Kleen, lkml - Kernel Mailing List

Rusty Russell <rusty@rustcorp.com.au> writes:

> [This was part of the GDT cleanups and per-cpu-> pda changes, which I
> have revised, but this stands on its own.  The only change is catching
> the x86-64 per-cpu allocator too].
> ==
> Let's allow page-alignment in general for per-cpu data (wanted by Xen,
> and Ingo suggested KVM as well).
>
> Because larger alignments can use more room, we increase the max
> per-cpu memory to 64k rather than 32k: it's getting a little tight.
>
> Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
> Acked-by: Ingo Molnar <mingo@elte.hu>

> ===================================================================
> --- a/kernel/module.c
> +++ b/kernel/module.c
> @@ -346,10 +346,10 @@ static void *percpu_modalloc(unsigned lo
>  	unsigned int i;
>  	void *ptr;
>  
> -	if (align > SMP_CACHE_BYTES) {
> -		printk(KERN_WARNING "%s: per-cpu alignment %li > %i\n",
> -		       name, align, SMP_CACHE_BYTES);
> -		align = SMP_CACHE_BYTES;
> +	if (align > PAGE_SIZE) {
> +		printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
> +		       name, align, PAGE_SIZE);
> +		align = PAGE_SIZE;
>  	}
>  
>  	ptr = __per_cpu_start;
> @@ -430,7 +430,7 @@ static int percpu_modinit(void)
>  	pcpu_size = kmalloc(sizeof(pcpu_size[0]) * pcpu_num_allocated,
>  			    GFP_KERNEL);
>  	/* Static in-kernel percpu data (used). */
> -	pcpu_size[0] = -ALIGN(__per_cpu_end-__per_cpu_start, SMP_CACHE_BYTES);
> +	pcpu_size[0] = -ALIGN(__per_cpu_end-__per_cpu_start, PAGE_SIZE);
>  	/* Free room. */
>  	pcpu_size[1] = PERCPU_ENOUGH_ROOM + pcpu_size[0];
>  	if (pcpu_size[1] < 0) {

Do we really want to allow modules to be able to allocate page-sized
per-cpu memory?  If my memory serves on how this code works, we will wind
up allocating 1 page of per-cpu memory for every module that allocates a
per-cpu variable.  128 bytes sucks; 4k is an order of magnitude worse.

On x86_64 we are only reserving 8K for modules...

Eric


* Re: [PATCH 2/4] i386 GDT cleanups: Use per-cpu GDT immediately upon boot
  2007-03-21  6:35     ` [PATCH 2/4] i386 GDT cleanups: Use per-cpu GDT immediately upon boot Rusty Russell
  2007-03-21  6:36       ` [PATCH 3/4] i386 GDT cleanups: clean up cpu_init() Rusty Russell
@ 2007-03-21  9:31       ` Eric W. Biederman
  2007-03-21 11:55         ` Rusty Russell
  1 sibling, 1 reply; 19+ messages in thread
From: Eric W. Biederman @ 2007-03-21  9:31 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Andrew Morton, Ingo Molnar, Andi Kleen, lkml - Kernel Mailing List

Rusty Russell <rusty@rustcorp.com.au> writes:

> Now we are no longer dynamically allocating the GDT, we don't need the
> "cpu_gdt_table" at all: we can switch straight from "boot_gdt_table"
> to the per-cpu GDT.  This means initializing the cpu_gdt array in C.
>
> The boot CPU uses the per-cpu var directly, then in smp_prepare_cpus()
> it switches to the per-cpu copy just allocated.  For secondary CPUs,
> the early_gdt_descr is set to point directly to their per-cpu copy.
>
> For UP the code is very simple: it keeps using the "per-cpu" GDT as
> per SMP, but we never have to move.

> diff -r a8a4e2f9da08 arch/i386/kernel/head.S
> --- a/arch/i386/kernel/head.S	Wed Mar 21 15:20:48 2007 +1100
> +++ b/arch/i386/kernel/head.S	Wed Mar 21 15:32:38 2007 +1100
> @@ -599,67 +599,10 @@ idt_descr:
>  	.word 0				# 32 bit align gdt_desc.address
>  ENTRY(early_gdt_descr)
>  	.word GDT_ENTRIES*8-1
> -	.long cpu_gdt_table
> -
> -/*
> - * The boot_gdt_table must mirror the equivalent in setup.S and is
> - * used only for booting.
> - */

It looks like you are killing a useful comment here for no good reason.

> +	.long per_cpu__cpu_gdt		/* Overwritten for secondary CPUs */
> +
>  	.align L1_CACHE_BYTES
>  ENTRY(boot_gdt_table)
>  	.fill GDT_ENTRY_BOOT_CS,8,0
>  	.quad 0x00cf9a000000ffff	/* kernel 4GB code at 0x00000000 */
>  	.quad 0x00cf92000000ffff	/* kernel 4GB data at 0x00000000 */


Eric


* Re: [PATCH] Allow per-cpu variables to be page-aligned
  2007-03-21  9:21 ` [PATCH] Allow per-cpu variables to be page-aligned Eric W. Biederman
@ 2007-03-21 11:44   ` Rusty Russell
  2007-03-21 16:49     ` Eric W. Biederman
  0 siblings, 1 reply; 19+ messages in thread
From: Rusty Russell @ 2007-03-21 11:44 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Andrew Morton, Ingo Molnar, Andi Kleen, lkml - Kernel Mailing List

On Wed, 2007-03-21 at 03:21 -0600, Eric W. Biederman wrote:
> Do we really want to allow modules to be able to allocate page-sized
> per-cpu memory?

Hi Eric!

	They always could, of course, they just wouldn't get correct alignment.
I think the principle of least surprise says that if we support this, it
will also work in modules...

>   If my memory serves on how this code works, we will wind
> up allocating 1 page of per-cpu memory for every module that allocates a
> per-cpu variable.  128 bytes sucks; 4k is an order of magnitude worse.

Not quite.  We allocate a total amount of per-cpu memory at boot, then
anything left over gets used for per-cpu vars in modules.
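
In rough outline (condensed from the init/main.c and kernel/module.c
hunks in the patch, nothing new here):

	/* Boot: one block of PERCPU_ENOUGH_ROOM per possible CPU. */
	size = ALIGN(__per_cpu_end - __per_cpu_start, PAGE_SIZE);
	if (size < PERCPU_ENOUGH_ROOM)
		size = PERCPU_ENOUGH_ROOM;
	ptr = alloc_bootmem_pages(size * num_possible_cpus());

	/* Modules: percpu_modalloc() carves allocations out of the
	 * leftover room between the static section and
	 * PERCPU_ENOUGH_ROOM in each CPU's copy. */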

Looking at the module per-cpu code again, the rounding up of the memory
used by the kernel seems unnecessary though.  I'll try ripping that
out...

> On x86_64 we are only reserving 8K for modules...

Really?  I can't see that.

It did look like the x86-64 setup_per_cpu_areas should be moved into
common code though (it's numa-aware).  Maybe that breaks some platforms.

It means the x86 cpu_pda initialization would have to be done in
smp_prepare_boot_cpu tho...

Cheers!
Rusty.




* Re: [PATCH 2/4] i386 GDT cleanups: Use per-cpu GDT immediately upon boot
  2007-03-21  9:31       ` [PATCH 2/4] i386 GDT cleanups: Use per-cpu GDT immediately upon boot Eric W. Biederman
@ 2007-03-21 11:55         ` Rusty Russell
  2007-03-21 16:51           ` Eric W. Biederman
  0 siblings, 1 reply; 19+ messages in thread
From: Rusty Russell @ 2007-03-21 11:55 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Andrew Morton, Ingo Molnar, Andi Kleen, lkml - Kernel Mailing List

On Wed, 2007-03-21 at 03:31 -0600, Eric W. Biederman wrote:
> Rusty Russell <rusty@rustcorp.com.au> writes:
> > -/*
> > - * The boot_gdt_table must mirror the equivalent in setup.S and is
> > - * used only for booting.
> > - */
> 
> It looks like you are killing a useful comment here for no good reason.

Hi Eric,

	I think one has to look harder, then.  There is no "equivalent in
setup.S": there is no setup.S, and it's certainly not clear what GDT
this "must mirror": it doesn't mirror any GDT at the moment.

	This leaves us with "is only used for booting".  The name
"boot_gdt_table" and the code itself are pretty clear.

	A comment might well deserve to live here, but not this one 8)

Hope that clarifies,
Rusty.



* Re: [PATCH] Allow per-cpu variables to be page-aligned
  2007-03-21 11:44   ` Rusty Russell
@ 2007-03-21 16:49     ` Eric W. Biederman
  2007-03-22  0:09       ` Rusty Russell
  0 siblings, 1 reply; 19+ messages in thread
From: Eric W. Biederman @ 2007-03-21 16:49 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Andrew Morton, Ingo Molnar, Andi Kleen, lkml - Kernel Mailing List

Rusty Russell <rusty@rustcorp.com.au> writes:

> On Wed, 2007-03-21 at 03:21 -0600, Eric W. Biederman wrote:
>> Do we really want to allow modules to be able to allocate page-sized
>> per-cpu memory?
>
> Hi Eric!
>
> 	They always could, of course, they just wouldn't get correct alignment.
> I think the principle of least surprise says that if we support this, it
> will also work in modules...

The module load would fail.

>>   If my memory serves on how this code works, we will wind
>> up allocating 1 page of per-cpu memory for every module that allocates a
>> per-cpu variable.  128 bytes sucks; 4k is an order of magnitude worse.
>
> Not quite.  We allocate a total amount of per-cpu memory at boot, then
> anything left over gets used for per-cpu vars in modules.
>
> Looking at the module per-cpu code again, the rounding up of the memory
> used by the kernel seems unnecessary though.  I'll try ripping that
> out...

I want to say that when dealing with per-cpu stuff, aligning to a cache
line makes sense, as it prevents multiple variables from sharing
the same cache line.  However, we rarely access per-cpu variables from
other cpus (which is the point), so the extra alignment doesn't seem
to have a justification in this context.

Although I'm not quite certain what this will do to the per cpu
memory allocator...

>> On x86_64 we are only reserving 8K for modules...
>
> Really?  I can't see that.

Look at how we define PERCPU_MODULE_RESERVE and PERCPU_ENOUGH_ROOM.

> It did look like the x86-64 setup_per_cpu_areas should be moved into
> common code though (it's numa-aware).  Maybe that breaks some platforms.

After increasing NR_IRQS on x86_64 to (NR_CPUS*32), the per-cpu irq
stats got much bigger, especially as NR_CPUS went up.  The only
reasonable way I could see to fix this at the time was to just make
PERCPU_ENOUGH_ROOM do the right thing and change size dynamically
with the size of the per-cpu section.  I added PERCPU_MODULE_RESERVE
to allocate the amount that we did not have compile information on.
8K was roughly what we had left over for modules before I made the
change, so I just preserved that.

We do have to be careful though, as some architectures have fixed-size
areas they are putting the per cpu stats into.
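
(For reference, the arrangement is something along these lines -- a
sketch only, the exact values and header locations may differ:)

	/* Generic fallback (include/linux/percpu.h, sketch): a fixed
	 * budget that has to cover the kernel's per cpu section plus
	 * whatever modules later ask for. */
	#ifndef PERCPU_ENOUGH_ROOM
	#define PERCPU_ENOUGH_ROOM	32768
	#endif

	/* x86_64 (sketch): size the area from the real section so it
	 * tracks NR_CPUS/NR_IRQS growth, and reserve a fixed slice --
	 * currently 8K -- for the part we have no compile-time
	 * information about, i.e. modules. */
	#ifdef CONFIG_MODULES
	# define PERCPU_MODULE_RESERVE	8192
	#else
	# define PERCPU_MODULE_RESERVE	0
	#endif
	#define PERCPU_ENOUGH_ROOM \
		(__per_cpu_end - __per_cpu_start + PERCPU_MODULE_RESERVE)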

> It means the x86 cpu_pda initialization would have to be done in
> smp_prepare_boot_cpu tho...

Well that is earlier than trap_init so it shouldn't be a problem...

Eric

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/4] i386 GDT cleanups: Use per-cpu GDT immediately upon boot
  2007-03-21 11:55         ` Rusty Russell
@ 2007-03-21 16:51           ` Eric W. Biederman
  2007-03-21 23:58             ` Rusty Russell
  0 siblings, 1 reply; 19+ messages in thread
From: Eric W. Biederman @ 2007-03-21 16:51 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Andrew Morton, Ingo Molnar, Andi Kleen, lkml - Kernel Mailing List

Rusty Russell <rusty@rustcorp.com.au> writes:

> On Wed, 2007-03-21 at 03:31 -0600, Eric W. Biederman wrote:
>> Rusty Russell <rusty@rustcorp.com.au> writes:
>> > -/*
>> > - * The boot_gdt_table must mirror the equivalent in setup.S and is
>> > - * used only for booting.
>> > - */
>> 
>> It looks like you are killing a useful comment here for no good reason.
>
> Hi Eric,
>
> 	I think one has to look harder, then.  There is no "equivalent in
> setup.S": there is no setup.S, and it's certainly not clear what GDT
> this "must mirror": it doesn't mirror any GDT at the moment.

see the gdt in:
arch/i386/boot/setup.S

> 	This leaves us with "is only used for booting".  The name
> "boot_gdt_table" and the code itself are pretty clear.
>
> 	A comment might well deserve to live here, but not this one 8)
>
> Hope that clarifies,

Some of your confusion at least.

If anything the comment should read these values are fixed by the boot
protocol and we can't change them.

Eric

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/4] i386 GDT cleanups: Use per-cpu GDT immediately upon boot
  2007-03-21 16:51           ` Eric W. Biederman
@ 2007-03-21 23:58             ` Rusty Russell
  2007-03-22  8:10               ` Sébastien Dugué
  0 siblings, 1 reply; 19+ messages in thread
From: Rusty Russell @ 2007-03-21 23:58 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Andrew Morton, Ingo Molnar, Andi Kleen, lkml - Kernel Mailing List

On Wed, 2007-03-21 at 10:51 -0600, Eric W. Biederman wrote:
> Rusty Russell <rusty@rustcorp.com.au> writes:
> 
> > On Wed, 2007-03-21 at 03:31 -0600, Eric W. Biederman wrote:
> >> Rusty Russell <rusty@rustcorp.com.au> writes:
> >> > -/*
> >> > - * The boot_gdt_table must mirror the equivalent in setup.S and is
> >> > - * used only for booting.
> >> > - */
> >> 
> >> It looks like you are killing a useful comment here for no good reason.
> >
> > Hi Eric,
> >
> > 	I think one has to look harder, then.  There is no "equivalent in
> > setup.S": there is no setup.S, and it's certainly not clear what GDT
> > this "must mirror": it doesn't mirror any GDT at the moment.
> 
> see the gdt in:
> arch/i386/boot/setup.S

Erk, what a dumb mistake.  Apologies for my snarky comment above 8(

> If anything the comment should read these values are fixed by the boot
> protocol and we can't change them.

Since lguest doesn't use setup.S, it's outside my experience.  I'll just
leave the comment, and try to pretend this never happened 8)

Thanks muchly,
Rusty.
==
Now that we are no longer dynamically allocating the GDT, we don't need
the "cpu_gdt_table" at all: we can switch straight from "boot_gdt_table"
to the per-cpu GDT.  This means initializing the cpu_gdt array in C.

The boot CPU uses the per-cpu var directly, then in smp_prepare_cpus()
it switches to the per-cpu copy just allocated.  For secondary CPUs,
the early_gdt_descr is set to point directly to their per-cpu copy.

For UP the code is very simple: it keeps using the "per-cpu" GDT just
as SMP does, but we never have to switch to another copy.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 arch/i386/kernel/cpu/common.c        |   74 ++++++++++++----------------------
 arch/i386/kernel/head.S              |   65 -----------------------------
 arch/i386/kernel/smpboot.c           |   59 ++++++++++++++++++++++-----
 arch/i386/mach-voyager/voyager_smp.c |    6 --
 include/asm-i386/desc.h              |    2
 include/asm-i386/processor.h         |    1
 6 files changed, 77 insertions(+), 130 deletions(-)

diff -r 9db59163584b arch/i386/kernel/cpu/common.c
--- a/arch/i386/kernel/cpu/common.c	Thu Mar 22 10:54:53 2007 +1100
+++ b/arch/i386/kernel/cpu/common.c	Thu Mar 22 10:56:49 2007 +1100
@@ -25,7 +25,33 @@ DEFINE_PER_CPU(struct Xgt_desc_struct, c
 DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr);
 EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr);
 
-DEFINE_PER_CPU(struct desc_struct, cpu_gdt[GDT_ENTRIES]);
+DEFINE_PER_CPU(struct desc_struct, cpu_gdt[GDT_ENTRIES]) = {
+	[GDT_ENTRY_KERNEL_CS] = { 0x0000ffff, 0x00cf9a00 },
+	[GDT_ENTRY_KERNEL_DS] = { 0x0000ffff, 0x00cf9200 },
+	[GDT_ENTRY_DEFAULT_USER_CS] = { 0x0000ffff, 0x00cffa00 },
+	[GDT_ENTRY_DEFAULT_USER_DS] = { 0x0000ffff, 0x00cff200 },
+	/*
+	 * Segments used for calling PnP BIOS have byte granularity.
+	 * The code segments and data segments have fixed 64k limits,
+	 * the transfer segment sizes are set at run time.
+	 */
+	[GDT_ENTRY_PNPBIOS_CS32] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
+	[GDT_ENTRY_PNPBIOS_CS16] = { 0x0000ffff, 0x00009a00 },/* 16-bit code */
+	[GDT_ENTRY_PNPBIOS_DS] = { 0x0000ffff, 0x00009200 }, /* 16-bit data */
+	[GDT_ENTRY_PNPBIOS_TS1] = { 0x00000000, 0x00009200 },/* 16-bit data */
+	[GDT_ENTRY_PNPBIOS_TS2] = { 0x00000000, 0x00009200 },/* 16-bit data */
+	/*
+	 * The APM segments have byte granularity and their bases
+	 * are set at run time.  All have 64k limits.
+	 */
+	[GDT_ENTRY_APMBIOS_BASE] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */
+	/* 16-bit code */
+	[GDT_ENTRY_APMBIOS_BASE+1] = { 0x0000ffff, 0x00009a00 },
+	[GDT_ENTRY_APMBIOS_BASE+2] = { 0x0000ffff, 0x00409200 }, /* data */
+
+	[GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00c09200 },
+	[GDT_ENTRY_PDA] = { 0x00000000, 0x00c09200 }, /* set in setup_pda */
+};
 
 DEFINE_PER_CPU(struct i386_pda, _cpu_pda);
 EXPORT_PER_CPU_SYMBOL(_cpu_pda);
@@ -618,46 +644,6 @@ struct i386_pda boot_pda = {
 	.pcurrent = &init_task,
 };
 
-static inline void set_kernel_fs(void)
-{
-	/* Set %fs for this CPU's PDA.  Memory clobber is to create a
-	   barrier with respect to any PDA operations, so the compiler
-	   doesn't move any before here. */
-	asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_PDA) : "memory");
-}
-
-/* Initialize the CPU's GDT and PDA.  This is either the boot CPU doing itself
-   (still using cpu_gdt_table), or a CPU doing it for a secondary which
-   will soon come up. */
-__cpuinit void init_gdt(int cpu, struct task_struct *idle)
-{
-	struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
-	struct desc_struct *gdt = per_cpu(cpu_gdt, cpu);
-	struct i386_pda *pda = &per_cpu(_cpu_pda, cpu);
-
- 	memcpy(gdt, cpu_gdt_table, GDT_SIZE);
- 	cpu_gdt_descr->address = (unsigned long)gdt;
-	cpu_gdt_descr->size = GDT_SIZE - 1;
-
-	pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a,
-			(u32 *)&gdt[GDT_ENTRY_PDA].b,
-			(unsigned long)pda, sizeof(*pda) - 1,
-			0x80 | DESCTYPE_S | 0x2, 0); /* present read-write data segment */
-
-	memset(pda, 0, sizeof(*pda));
-	pda->_pda = pda;
-	pda->cpu_number = cpu;
-	pda->pcurrent = idle;
-}
-
-void __cpuinit cpu_set_gdt(int cpu)
-{
-	struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
-
-	load_gdt(cpu_gdt_descr);
-	set_kernel_fs();
-}
-
 /* Common CPU init for both boot and secondary CPUs */
 static void __cpuinit _cpu_init(int cpu, struct task_struct *curr)
 {
@@ -740,10 +726,6 @@ void __cpuinit cpu_init(void)
 	int cpu = smp_processor_id();
 	struct task_struct *curr = current;
 
-	/* Set up the real GDT and PDA, so we can transition from the
-	   boot_gdt_table & boot_pda. */
-	init_gdt(cpu, curr);
-	cpu_set_gdt(cpu);
 	_cpu_init(cpu, curr);
 }
 
diff -r 9db59163584b arch/i386/kernel/head.S
--- a/arch/i386/kernel/head.S	Thu Mar 22 10:54:53 2007 +1100
+++ b/arch/i386/kernel/head.S	Thu Mar 22 10:56:49 2007 +1100
@@ -599,7 +599,7 @@ idt_descr:
 	.word 0				# 32 bit align gdt_desc.address
 ENTRY(early_gdt_descr)
 	.word GDT_ENTRIES*8-1
-	.long cpu_gdt_table
+	.long per_cpu__cpu_gdt		/* Overwritten for secondary CPUs */
 
 /*
  * The boot_gdt_table must mirror the equivalent in setup.S and is
@@ -610,56 +610,3 @@ ENTRY(boot_gdt_table)
 	.fill GDT_ENTRY_BOOT_CS,8,0
 	.quad 0x00cf9a000000ffff	/* kernel 4GB code at 0x00000000 */
 	.quad 0x00cf92000000ffff	/* kernel 4GB data at 0x00000000 */
-
-/*
- * The Global Descriptor Table contains 32 quadwords, per-CPU.
- */
-	.align L1_CACHE_BYTES
-ENTRY(cpu_gdt_table)
-	.quad 0x0000000000000000	/* NULL descriptor */
-	.quad 0x0000000000000000	/* 0x0b reserved */
-	.quad 0x0000000000000000	/* 0x13 reserved */
-	.quad 0x0000000000000000	/* 0x1b reserved */
-	.quad 0x0000000000000000	/* 0x20 unused */
-	.quad 0x0000000000000000	/* 0x28 unused */
-	.quad 0x0000000000000000	/* 0x33 TLS entry 1 */
-	.quad 0x0000000000000000	/* 0x3b TLS entry 2 */
-	.quad 0x0000000000000000	/* 0x43 TLS entry 3 */
-	.quad 0x0000000000000000	/* 0x4b reserved */
-	.quad 0x0000000000000000	/* 0x53 reserved */
-	.quad 0x0000000000000000	/* 0x5b reserved */
-
-	.quad 0x00cf9a000000ffff	/* 0x60 kernel 4GB code at 0x00000000 */
-	.quad 0x00cf92000000ffff	/* 0x68 kernel 4GB data at 0x00000000 */
-	.quad 0x00cffa000000ffff	/* 0x73 user 4GB code at 0x00000000 */
-	.quad 0x00cff2000000ffff	/* 0x7b user 4GB data at 0x00000000 */
-
-	.quad 0x0000000000000000	/* 0x80 TSS descriptor */
-	.quad 0x0000000000000000	/* 0x88 LDT descriptor */
-
-	/*
-	 * Segments used for calling PnP BIOS have byte granularity.
-	 * The code segments and data segments have fixed 64k limits,
-	 * the transfer segment sizes are set at run time.
-	 */
-	.quad 0x00409a000000ffff	/* 0x90 32-bit code */
-	.quad 0x00009a000000ffff	/* 0x98 16-bit code */
-	.quad 0x000092000000ffff	/* 0xa0 16-bit data */
-	.quad 0x0000920000000000	/* 0xa8 16-bit data */
-	.quad 0x0000920000000000	/* 0xb0 16-bit data */
-
-	/*
-	 * The APM segments have byte granularity and their bases
-	 * are set at run time.  All have 64k limits.
-	 */
-	.quad 0x00409a000000ffff	/* 0xb8 APM CS    code */
-	.quad 0x00009a000000ffff	/* 0xc0 APM CS 16 code (16 bit) */
-	.quad 0x004092000000ffff	/* 0xc8 APM DS    data */
-
-	.quad 0x00c0920000000000	/* 0xd0 - ESPFIX SS */
-	.quad 0x00cf92000000ffff	/* 0xd8 - PDA */
-	.quad 0x0000000000000000	/* 0xe0 - unused */
-	.quad 0x0000000000000000	/* 0xe8 - unused */
-	.quad 0x0000000000000000	/* 0xf0 - unused */
-	.quad 0x0000000000000000	/* 0xf8 - GDT entry 31: double-fault TSS */
-
diff -r 9db59163584b arch/i386/kernel/smpboot.c
--- a/arch/i386/kernel/smpboot.c	Thu Mar 22 10:54:53 2007 +1100
+++ b/arch/i386/kernel/smpboot.c	Thu Mar 22 10:56:49 2007 +1100
@@ -440,12 +440,6 @@ void __devinit initialize_secondary(void
 void __devinit initialize_secondary(void)
 {
 	/*
-	 * switch to the per CPU GDT we already set up
-	 * in do_boot_cpu()
-	 */
-	cpu_set_gdt(current_thread_info()->cpu);
-
-	/*
 	 * We don't actually need to load the full TSS,
 	 * basically just the stack pointer and the eip.
 	 */
@@ -787,6 +781,32 @@ static inline struct task_struct * alloc
 #define alloc_idle_task(cpu) fork_idle(cpu)
 #endif
 
+/* Initialize the CPU's GDT.  This is either the boot CPU doing itself
+   (still using the master per-cpu area), or a CPU doing it for a
+   secondary which will soon come up. */
+static __cpuinit void init_gdt(int cpu, struct task_struct *idle)
+{
+	struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu);
+	struct desc_struct *gdt = per_cpu(cpu_gdt, cpu);
+	struct i386_pda *pda = &per_cpu(_cpu_pda, cpu);
+
+ 	cpu_gdt_descr->address = (unsigned long)gdt;
+	cpu_gdt_descr->size = GDT_SIZE - 1;
+
+	pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a,
+			(u32 *)&gdt[GDT_ENTRY_PDA].b,
+			(unsigned long)pda, sizeof(*pda) - 1,
+			0x80 | DESCTYPE_S | 0x2, 0); /* present read-write data segment */
+
+	memset(pda, 0, sizeof(*pda));
+	pda->_pda = pda;
+	pda->cpu_number = cpu;
+	pda->pcurrent = idle;
+}
+
+/* Defined in head.S */
+extern struct Xgt_desc_struct early_gdt_descr;
+
 static int __cpuinit do_boot_cpu(int apicid, int cpu)
 /*
  * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
@@ -809,6 +829,8 @@ static int __cpuinit do_boot_cpu(int api
 		panic("failed fork for CPU %d", cpu);
 
 	init_gdt(cpu, idle);
+	early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
+	start_pda = cpu_pda(cpu);
 
 	idle->thread.eip = (unsigned long) start_secondary;
 	/* start_eip had better be page-aligned! */
@@ -1161,13 +1183,26 @@ void __init smp_prepare_cpus(unsigned in
 	smp_boot_cpus(max_cpus);
 }
 
-void __devinit smp_prepare_boot_cpu(void)
-{
-	cpu_set(smp_processor_id(), cpu_online_map);
-	cpu_set(smp_processor_id(), cpu_callout_map);
-	cpu_set(smp_processor_id(), cpu_present_map);
-	cpu_set(smp_processor_id(), cpu_possible_map);
-	per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+/* Current gdt points %fs at the "master" per-cpu area: after this,
+ * it's on the real one. */
+static inline void switch_to_new_gdt(void)
+{
+	load_gdt(&per_cpu(cpu_gdt_descr, smp_processor_id()));
+	asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_PDA) : "memory");
+}
+
+void __init smp_prepare_boot_cpu(void)
+{
+	unsigned int cpu = smp_processor_id();
+
+	init_gdt(cpu, current);
+	switch_to_new_gdt();
+
+	cpu_set(cpu, cpu_online_map);
+	cpu_set(cpu, cpu_callout_map);
+	cpu_set(cpu, cpu_present_map);
+	cpu_set(cpu, cpu_possible_map);
+	__get_cpu_var(cpu_state) = CPU_ONLINE;
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
diff -r 9db59163584b arch/i386/lguest/lguest.c
--- a/arch/i386/lguest/lguest.c	Thu Mar 22 10:54:53 2007 +1100
+++ b/arch/i386/lguest/lguest.c	Thu Mar 22 10:56:49 2007 +1100
@@ -449,6 +449,9 @@ static void lguest_power_off(void)
 	hcall(LHCALL_CRASH, __pa("Power down"), 0, 0);
 }
 
+/* From head.S */
+extern void setup_pda(void);
+
 static __attribute_used__ __init void lguest_init(void)
 {
 	paravirt_ops.name = "lguest";
@@ -510,10 +513,7 @@ static __attribute_used__ __init void lg
 	init_pg_tables_end = __pa(pg0);
 
 	/* set up PDA descriptor */
-	pack_descriptor((u32 *)&cpu_gdt_table[GDT_ENTRY_PDA].a,
-			(u32 *)&cpu_gdt_table[GDT_ENTRY_PDA].b,
-			(unsigned)&boot_pda, sizeof(boot_pda)-1,
-			0x80 | DESCTYPE_S | 0x02, 0);
+	setup_pda();
 	load_gdt(&early_gdt_descr);
 	asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_PDA) : "memory");
 
diff -r 9db59163584b arch/i386/mach-voyager/voyager_smp.c
--- a/arch/i386/mach-voyager/voyager_smp.c	Thu Mar 22 10:54:53 2007 +1100
+++ b/arch/i386/mach-voyager/voyager_smp.c	Thu Mar 22 10:54:53 2007 +1100
@@ -763,12 +763,6 @@ initialize_secondary(void)
 	// AC kernels only
 	set_current(hard_get_current());
 #endif
-
-	/*
-	 * switch to the per CPU GDT we already set up
-	 * in do_boot_cpu()
-	 */
-	cpu_set_gdt(current_thread_info()->cpu);
 
 	/*
 	 * We don't actually need to load the full TSS,
diff -r 9db59163584b include/asm-i386/desc.h
--- a/include/asm-i386/desc.h	Thu Mar 22 10:54:53 2007 +1100
+++ b/include/asm-i386/desc.h	Thu Mar 22 10:56:49 2007 +1100
@@ -11,8 +11,6 @@
 #include <linux/percpu.h>
 
 #include <asm/mmu.h>
-
-extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
 
 struct Xgt_desc_struct {
 	unsigned short size;
diff -r 9db59163584b include/asm-i386/processor.h
--- a/include/asm-i386/processor.h	Thu Mar 22 10:54:53 2007 +1100
+++ b/include/asm-i386/processor.h	Thu Mar 22 10:56:49 2007 +1100
@@ -743,7 +743,6 @@ extern void enable_sep_cpu(void);
 extern void enable_sep_cpu(void);
 extern int sysenter_setup(void);
 
-extern void init_gdt(int cpu, struct task_struct *idle);
 extern void cpu_set_gdt(int);
 extern void secondary_cpu_init(void);
 



^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] Allow per-cpu variables to be page-aligned
  2007-03-21 16:49     ` Eric W. Biederman
@ 2007-03-22  0:09       ` Rusty Russell
  0 siblings, 0 replies; 19+ messages in thread
From: Rusty Russell @ 2007-03-22  0:09 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Andrew Morton, Ingo Molnar, Andi Kleen, lkml - Kernel Mailing List

On Wed, 2007-03-21 at 10:49 -0600, Eric W. Biederman wrote:
> Rusty Russell <rusty@rustcorp.com.au> writes:
> 
> > On Wed, 2007-03-21 at 03:21 -0600, Eric W. Biederman wrote:
> >> Do we really want to allow modules to be able to allocate page sized
> >> per cpu memory.
> >
> > Hi Eric!
> >
> > 	They always could, of course, they just wouldn't get correct alignment.
> > I think the principle of least surprise says that if we support this, it
> > will also work in modules...
> 
> The module load would fail.

Hi again Eric,

	Unfortunately not.  It probably should, though: people ignore printks.
I was probably thinking that large alignment constraints were only for
performance when I wrote this code, but a page-aligned requirement for
hypervisors changes that.
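
(To illustrate: a module might legitimately ask for something like the
hypothetical declaration below, and the alignment check in
percpu_modalloc() -- sketched here, not the literal kernel/module.c
code -- just warns and clamps rather than failing.  Returning an error
there instead would turn an unsatisfiable alignment into a clean module
load failure.)

	/* Hypothetical module declaration wanting page alignment: */
	static DEFINE_PER_CPU(struct my_hv_shared_page, hv_page)
		__attribute__((__aligned__(PAGE_SIZE)));

	/* Sketch of the alignment check in percpu_modalloc(): */
	if (align > PAGE_SIZE) {
		printk(KERN_WARNING "%s: per-cpu alignment %lu > %lu\n",
		       name, align, (unsigned long)PAGE_SIZE);
		align = PAGE_SIZE;	/* today: warn and carry on */
		/* to fail the load instead:  return NULL;  */
	}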

> > Looking at the module per-cpu code again, the rounding up of the memory
> > used by the kernel seems unnecessary though.  I'll try ripping that
> > out...
>
> I want to say that when dealing with cpu stuff aligning to a cache
> line makes sense as it prevents multiple variables from sharing
> the same cache line.  However we rarely access per cpu variables from
> other cpus (the point) so the extra alignment doesn't seem to have
> a justification in this context.

Um, yes, always good to remember.  I wrote the per-cpu infrastructure,
and I haven't forgotten 8)

> Although I'm not quite certain what this will do to the per cpu
> memory allocator...

It should Just Work.  My only hesitation is that I obviously thought
differently when I wrote this code, so am I smarter now, or then?
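
(The reason I expect it to cope: each allocation out of the reserved
area pays its own alignment penalty, along the lines of the sketch
below -- a hypothetical helper, not the literal kernel/module.c code --
so the kernel's own section doesn't need rounding up on its behalf.)

	/* Sketch: can 'size' bytes at 'align' fit into a free block of
	 * 'blocksize' bytes starting at 'ptr'? */
	static void *percpu_fit(void *ptr, unsigned long blocksize,
				unsigned long size, unsigned long align)
	{
		unsigned long extra =
			ALIGN((unsigned long)ptr, align) - (unsigned long)ptr;

		if (blocksize < size + extra)
			return NULL;		/* try the next block */
		return ptr + extra;		/* aligned start in this block */
	}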

> After increasing NR_IRQS on x86_64 to (NR_CPUS*32), the per cpu irq
> stats got much bigger, especially as NR_CPUS went up.  The only
> reasonable way I could see to fix this at the time was to just make
> PERCPU_ENOUGH_ROOM do the right thing and change size dynamically
> with the size of the per cpu section.  I added PERCPU_MODULE_RESERVE
> to allocate the amount that we had no compile-time information on.
> 8K was roughly what we had left over for modules before I made the
> change, so I just preserved that.

This makes a lot of sense.  A fixed constant seemed sensible at the
time, but now we know that the majority of per-cpu vars are in code
which can never be a module.  Reasons are obvious, and seem unlikely to
change.

> > It means the x86 cpu_pda initialization would have to be done in
> > smp_prepare_boot_cpu tho...
> 
> Well that is earlier than trap_init so it shouldn't be a problem...

But it doesn't get called on UP.  Don't know if that matters, but it
wasn't immediately obvious.
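
(For reference, on UP the hook compiles away to nothing -- roughly:

	/* include/linux/smp.h, !CONFIG_SMP case (sketch): */
	#define smp_prepare_boot_cpu()		do { } while (0)

so anything we hang off smp_prepare_boot_cpu() needs another home on
UP.)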

Thanks,
Rusty.


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/4] i386 GDT cleanups: Use per-cpu GDT immediately upon boot
  2007-03-21 23:58             ` Rusty Russell
@ 2007-03-22  8:10               ` Sébastien Dugué
  2007-03-22  9:24                 ` Rusty Russell
  0 siblings, 1 reply; 19+ messages in thread
From: Sébastien Dugué @ 2007-03-22  8:10 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Eric W. Biederman, Andrew Morton, Ingo Molnar, Andi Kleen,
	lkml - Kernel Mailing List


  Hi Rusty,

On Thu, 22 Mar 2007 10:58:30 +1100 Rusty Russell <rusty@rustcorp.com.au> wrote:

> On Wed, 2007-03-21 at 10:51 -0600, Eric W. Biederman wrote:
> > Rusty Russell <rusty@rustcorp.com.au> writes:
> > 
> > > On Wed, 2007-03-21 at 03:31 -0600, Eric W. Biederman wrote:
> > >> Rusty Russell <rusty@rustcorp.com.au> writes:
> > >> > -/*
> > >> > - * The boot_gdt_table must mirror the equivalent in setup.S and is
> > >> > - * used only for booting.
> > >> > - */
> > >> 
> > >> It looks like you are killing a useful comment here for no good reason.
> > >
> > > Hi Eric,
> > >
> > > 	I think one has to look harder, then.  There is no "equivalent in
> > > setup.S": there is no setup.S, and it's certainly not clear what GDT
> > > this "must mirror": it doesn't mirror any GDT at the moment.
> > 
> > see the gdt in:
> > arch/i386/boot/setup.S
> 
> Erk, what a dumb mistake.  Apologies for my snarky comment above 8(
> 
> > If anything the comment should read these values are fixed by the boot
> > protocol and we can't change them.
> 
> Since lguest doesn't use setup.S, it's outside my experience.  I'll just
> leave the comment, and try to pretend this never happened 8)
> 
> Thanks muchly,
> Rusty.
> ==
> Now that we are no longer dynamically allocating the GDT, we don't need
> the "cpu_gdt_table" at all: we can switch straight from "boot_gdt_table"
> to the per-cpu GDT.  This means initializing the cpu_gdt array in C.


  Why not take the opportunity to rename boot_gdt_table to boot_gdt,
to avoid the duplicate T(able)?

  Sébastien.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 2/4] i386 GDT cleanups: Use per-cpu GDT immediately upon boot
  2007-03-22  8:10               ` Sébastien Dugué
@ 2007-03-22  9:24                 ` Rusty Russell
  2007-03-22 15:59                   ` [PATCH] i386 GDT cleanups: Rename boot_gdt_table to boot_gdt Sébastien Dugué
  0 siblings, 1 reply; 19+ messages in thread
From: Rusty Russell @ 2007-03-22  9:24 UTC (permalink / raw)
  To: Sébastien Dugué
  Cc: Eric W. Biederman, Andrew Morton, Ingo Molnar, Andi Kleen,
	lkml - Kernel Mailing List

On Thu, 2007-03-22 at 09:10 +0100, Sébastien Dugué wrote:
>   Why not take the opportunity to rename boot_gdt_table to boot_gdt,
> to avoid the duplicate T(able)?

That's not a bad idea, but IMHO deserves its own patch.

I look forward to it!
Rusty.



^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH] i386 GDT cleanups: Rename boot_gdt_table to boot_gdt
  2007-03-22  9:24                 ` Rusty Russell
@ 2007-03-22 15:59                   ` Sébastien Dugué
  2007-03-23  7:19                     ` Rusty Russell
  2007-03-23 14:15                     ` Andi Kleen
  0 siblings, 2 replies; 19+ messages in thread
From: Sébastien Dugué @ 2007-03-22 15:59 UTC (permalink / raw)
  To: Rusty Russell
  Cc: Eric W. Biederman, Andrew Morton, Ingo Molnar, Andi Kleen,
	lkml - Kernel Mailing List


  Rusty,

On Thu, 22 Mar 2007 20:24:10 +1100 Rusty Russell <rusty@rustcorp.com.au> wrote:

> On Thu, 2007-03-22 at 09:10 +0100, Sébastien Dugué wrote:
> >   Why not take the opportunity to rename boot_gdt_table to boot_gdt,
> > to avoid the duplicate T(able)?
> 
> That's not a bad idea, but IMHO deserves its own patch.
> 
> I look forward to it!
> Rusty.
> 
> 

  Here it is, boot tested on an SMP x86 box.

  Sébastien.



 Rename boot_gdt_table to boot_gdt to avoid the duplicate
T(able).

Signed-off-by: Sébastien Dugué <sebastien.dugue@bull.net>

---
 arch/i386/kernel/head.S       |    9 ++++-----
 arch/i386/kernel/trampoline.S |   12 ++++++------
 2 files changed, 10 insertions(+), 11 deletions(-)

Index: linux-2.6/arch/i386/kernel/head.S
===================================================================
--- linux-2.6.orig/arch/i386/kernel/head.S	2007-03-22 15:09:02.000000000 +0100
+++ linux-2.6/arch/i386/kernel/head.S	2007-03-22 16:14:38.000000000 +0100
@@ -147,8 +147,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
 /*
  * Non-boot CPU entry point; entered from trampoline.S
  * We can't lgdt here, because lgdt itself uses a data segment, but
- * we know the trampoline has already loaded the boot_gdt_table GDT
- * for us.
+ * we know the trampoline has already loaded the boot_gdt for us.
  *
  * If cpu hotplug is not supported then this code can go in init section
  * which will be freed later
@@ -588,7 +587,7 @@ fault_msg:
 	.word 0				# 32 bit align gdt_desc.address
 boot_gdt_descr:
 	.word __BOOT_DS+7
-	.long boot_gdt_table - __PAGE_OFFSET
+	.long boot_gdt - __PAGE_OFFSET
 
 	.word 0				# 32-bit align idt_desc.address
 idt_descr:
@@ -602,11 +601,11 @@ ENTRY(early_gdt_descr)
 	.long cpu_gdt_table
 
 /*
- * The boot_gdt_table must mirror the equivalent in setup.S and is
+ * The boot_gdt must mirror the equivalent in setup.S and is
  * used only for booting.
  */
 	.align L1_CACHE_BYTES
-ENTRY(boot_gdt_table)
+ENTRY(boot_gdt)
 	.fill GDT_ENTRY_BOOT_CS,8,0
 	.quad 0x00cf9a000000ffff	/* kernel 4GB code at 0x00000000 */
 	.quad 0x00cf92000000ffff	/* kernel 4GB data at 0x00000000 */
Index: linux-2.6/arch/i386/kernel/trampoline.S
===================================================================
--- linux-2.6.orig/arch/i386/kernel/trampoline.S	2007-03-22 15:09:02.000000000 +0100
+++ linux-2.6/arch/i386/kernel/trampoline.S	2007-03-22 16:13:23.000000000 +0100
@@ -29,7 +29,7 @@
  *
  *	TYPE              VALUE
  *	R_386_32          startup_32_smp
- *	R_386_32          boot_gdt_table
+ *	R_386_32          boot_gdt
  */
 
 #include <linux/linkage.h>
@@ -62,8 +62,8 @@ r_base = .
 	 * to 32 bit.
 	 */
 
-	lidtl	boot_idt - r_base	# load idt with 0, 0
-	lgdtl	boot_gdt - r_base	# load gdt with whatever is appropriate
+	lidtl	boot_idt_descr - r_base	# load idt with 0, 0
+	lgdtl	boot_gdt_descr - r_base	# load gdt with whatever is appropriate
 
 	xor	%ax, %ax
 	inc	%ax		# protected mode (PE) bit
@@ -73,11 +73,11 @@ r_base = .
 
 	# These need to be in the same 64K segment as the above;
 	# hence we don't use the boot_gdt_descr defined in head.S
-boot_gdt:
+boot_gdt_descr:
 	.word	__BOOT_DS + 7			# gdt limit
-	.long	boot_gdt_table-__PAGE_OFFSET	# gdt base
+	.long	boot_gdt - __PAGE_OFFSET	# gdt base
 
-boot_idt:
+boot_idt_descr:
 	.word	0				# idt limit = 0
 	.long	0				# idt base = 0L
 

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] i386 GDT cleanups: Rename boot_gdt_table to boot_gdt
  2007-03-22 15:59                   ` [PATCH] i386 GDT cleanups: Rename boot_gdt_table to boot_gdt Sébastien Dugué
@ 2007-03-23  7:19                     ` Rusty Russell
  2007-03-23 14:15                     ` Andi Kleen
  1 sibling, 0 replies; 19+ messages in thread
From: Rusty Russell @ 2007-03-23  7:19 UTC (permalink / raw)
  To: Sébastien Dugué
  Cc: Eric W. Biederman, Andrew Morton, Ingo Molnar, Andi Kleen,
	lkml - Kernel Mailing List

On Thu, 2007-03-22 at 16:59 +0100, Sébastien Dugué wrote:
>  Rename boot_gdt_table to boot_gdt to avoid the duplicate
> T(able).
> 
> Signed-off-by: Sébastien Dugué <sebastien.dugue@bull.net>
> 
> ---
>  arch/i386/kernel/head.S       |    9 ++++-----
>  arch/i386/kernel/trampoline.S |   12 ++++++------
>  2 files changed, 10 insertions(+), 11 deletions(-)

Acked-by: Rusty Russell <rusty@rustcorp.com.au>

In future, I'd recommend adding a witty comment to any such trivial
patch: it's really the only way to get it featured on LWN's Kernel Quote
of the Week.

Damn you Jon for turning us all into show ponies! (*Hi mum!*)
Rusty.


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH] i386 GDT cleanups: Rename boot_gdt_table to boot_gdt
  2007-03-22 15:59                   ` [PATCH] i386 GDT cleanups: Rename boot_gdt_table to boot_gdt Sébastien Dugué
  2007-03-23  7:19                     ` Rusty Russell
@ 2007-03-23 14:15                     ` Andi Kleen
  1 sibling, 0 replies; 19+ messages in thread
From: Andi Kleen @ 2007-03-23 14:15 UTC (permalink / raw)
  To: Sébastien Dugué
  Cc: Rusty Russell, Eric W. Biederman, Andrew Morton, Ingo Molnar,
	Andi Kleen, lkml - Kernel Mailing List

>  Rename boot_gdt_table to boot_gdt to avoid the duplicate
> T(able).

We already have so much code churn in this area now, please
no pointless renames for at least two releases.

Thanks.

-Andi

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2007-03-23 14:15 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-03-21  6:10 [PATCH] Allow per-cpu variables to be page-aligned Rusty Russell
2007-03-21  6:30 ` [PATCH 0/4] i386 GDT cleanups Rusty Russell
2007-03-21  6:32   ` [PATCH 1/4] i386 GDT cleanups: Use per-cpu variables for GDT, PDA Rusty Russell
2007-03-21  6:35     ` [PATCH 2/4] i386 GDT cleanups: Use per-cpu GDT immediately upon boot Rusty Russell
2007-03-21  6:36       ` [PATCH 3/4] i386 GDT cleanups: clean up cpu_init() Rusty Russell
2007-03-21  6:53         ` [PATCH 4/4] i386 GDT cleanups: cleanup GDT Access Rusty Russell
2007-03-21  9:31       ` [PATCH 2/4] i386 GDT cleanups: Use per-cpu GDT immediately upon boot Eric W. Biederman
2007-03-21 11:55         ` Rusty Russell
2007-03-21 16:51           ` Eric W. Biederman
2007-03-21 23:58             ` Rusty Russell
2007-03-22  8:10               ` Sébastien Dugué
2007-03-22  9:24                 ` Rusty Russell
2007-03-22 15:59                   ` [PATCH] i386 GDT cleanups: Rename boot_gdt_table to boot_gdt Sébastien Dugué
2007-03-23  7:19                     ` Rusty Russell
2007-03-23 14:15                     ` Andi Kleen
2007-03-21  9:21 ` [PATCH] Allow per-cpu variables to be page-aligned Eric W. Biederman
2007-03-21 11:44   ` Rusty Russell
2007-03-21 16:49     ` Eric W. Biederman
2007-03-22  0:09       ` Rusty Russell
