linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/4] x86_64: Optimize percpu accesses
@ 2008-07-25 21:11 Mike Travis
  2008-07-25 21:11 ` [PATCH 1/4] x86_64: Cleanup early setup_percpu references Mike Travis
                   ` (6 more replies)
  0 siblings, 7 replies; 12+ messages in thread
From: Mike Travis @ 2008-07-25 21:11 UTC (permalink / raw)
  To: Ingo Molnar, Andrew Morton
  Cc: Eric W. Biederman, Hugh Dickins, Jack Steiner,
	Jeremy Fitzhardinge, H. Peter Anvin, linux-kernel


This patchset provides the following:

  * x86_64: Cleanup setup_percpu by fixing some minor potential
    problems as well as add some debugging aids.

  * x86_64: Rebase per cpu variables to zero

    Rebase per cpu variables to zero in preparation for the following
    patch to fold the pda into the per cpu area.

  * x86_64: Fold pda into per cpu area

    Declare the pda as a per cpu variable. This will allow the per cpu
    variables to be accessible on the x86_64 using %gs as the base of
    the percpu areas for each cpu:

	%gs:per_cpu_xxxx

  * x86_64: Reference zero-based percpu variables offset from gs

    Actually implement the above operation for __get_cpu_var() and
    __put_cpu_var().  Since this is now a single instruction, we
    can remove the non-preemptible versions of x86_read_percpu()
    and x86_write_percpu().

Note that the following changes are NOT in this patchset as the plan now
seems to be that the common (to x86) variables that are in the pda should
be made individual per cpu variables, leaving only the stack canary in place.

  * x86_64: Replace cpu_pda ops with percpu ops
  * x86_64: Replace xxx_pda() operations with x86_xxx_percpu().
  * x86_64: Remove xxx_pda() operations
  * x86_64: Remove cpu_pda() macro

Based on linux-2.6.tip/master.

Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: Mike Travis <travis@sgi.com>
---

-- 

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 1/4] x86_64: Cleanup early setup_percpu references
  2008-07-25 21:11 [PATCH 0/4] x86_64: Optimize percpu accesses Mike Travis
@ 2008-07-25 21:11 ` Mike Travis
  2008-07-25 21:11 ` [PATCH 2/4] x86_64: Base percpu variables at zero Mike Travis
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 12+ messages in thread
From: Mike Travis @ 2008-07-25 21:11 UTC (permalink / raw)
  To: Ingo Molnar, Andrew Morton
  Cc: Eric W. Biederman, Hugh Dickins, Jack Steiner,
	Jeremy Fitzhardinge, H. Peter Anvin, linux-kernel

[-- Attachment #1: cleanup_percpu --]
[-- Type: text/plain, Size: 5280 bytes --]

  * Ruggedize some calls in setup_percpu.c to prevent mishaps
    in early calls, particularly for non-critical functions.

  * Cleanup DEBUG_PER_CPU_MAPS usages and some comments.

Based on linux-2.6.tip/master with following patches applied:

	cpumask: Make cpumask_of_cpu_map generic
	cpumask: Put cpumask_of_cpu_map in the initdata section
	cpumask: Change cpumask_of_cpu_ptr to use new cpumask_of_cpu

Signed-off-by: Mike Travis <travis@sgi.com>
---
 arch/x86/kernel/setup_percpu.c |   66 +++++++++++++++++++++++++++++------------
 1 file changed, 47 insertions(+), 19 deletions(-)

--- linux-2.6.tip.orig/arch/x86/kernel/setup_percpu.c
+++ linux-2.6.tip/arch/x86/kernel/setup_percpu.c
@@ -15,6 +15,12 @@
 #include <asm/apicdef.h>
 #include <asm/highmem.h>
 
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+# define DBG(x...) printk(KERN_DEBUG x)
+#else
+# define DBG(x...)
+#endif
+
 #ifdef CONFIG_X86_LOCAL_APIC
 unsigned int num_processors;
 unsigned disabled_cpus __cpuinitdata;
@@ -27,31 +33,39 @@ EXPORT_SYMBOL(boot_cpu_physical_apicid);
 physid_mask_t phys_cpu_present_map;
 #endif
 
-/* map cpu index to physical APIC ID */
+/*
+ * Map cpu index to physical APIC ID
+ */
 DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);
 DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID);
 EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
 EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
 
 #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
-#define	X86_64_NUMA	1
+#define	X86_64_NUMA	1	/* (used later) */
 
-/* map cpu index to node index */
+/*
+ * Map cpu index to node index
+ */
 DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
 EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
 
-/* which logical CPUs are on which nodes */
+/*
+ * Which logical CPUs are on which nodes
+ */
 cpumask_t *node_to_cpumask_map;
 EXPORT_SYMBOL(node_to_cpumask_map);
 
-/* setup node_to_cpumask_map */
+/*
+ * Setup node_to_cpumask_map
+ */
 static void __init setup_node_to_cpumask_map(void);
 
 #else
 static inline void setup_node_to_cpumask_map(void) { }
 #endif
 
-#if defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) && defined(CONFIG_X86_SMP)
+#ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA
 /*
  * Copy data used in early init routines from the initial arrays to the
  * per cpu data areas.  These arrays then become expendable and the
@@ -90,11 +104,16 @@ static void __init setup_per_cpu_maps(vo
 static void __init setup_cpumask_of_cpu(void)
 {
 	int i;
+	cpumask_t *new_map;
 
 	/* alloc_bootmem zeroes memory */
-	cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids);
+	new_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids);
+	DBG("cpumask_of_cpu_map at %p\n", new_map);
+
 	for (i = 0; i < nr_cpu_ids; i++)
-		cpu_set(i, cpumask_of_cpu_map[i]);
+		cpu_set(i, new_map[i]);
+
+	cpumask_of_cpu_map = (const cpumask_t *)new_map;
 }
 #else
 static inline void setup_cpumask_of_cpu(void) { }
@@ -189,9 +208,10 @@ void __init setup_per_cpu_areas(void)
 		per_cpu_offset(cpu) = ptr - __per_cpu_start;
 		memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
 
+		DBG("PERCPU: cpu %4d %p\n", cpu, ptr);
 	}
 
-	printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
+	printk(KERN_INFO "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids: %d\n",
 		NR_CPUS, nr_cpu_ids, nr_node_ids);
 
 	/* Setup percpu data maps */
@@ -213,6 +233,7 @@ void __init setup_per_cpu_areas(void)
  * Requires node_possible_map to be valid.
  *
  * Note: node_to_cpumask() is not valid until after this is done.
+ * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
  */
 static void __init setup_node_to_cpumask_map(void)
 {
@@ -228,6 +249,7 @@ static void __init setup_node_to_cpumask
 
 	/* allocate the map */
 	map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
+	DBG("node_to_cpumask_map at %p for %d nodes\n", map, nr_node_ids);
 
 	pr_debug(KERN_DEBUG "Node to cpumask map at %p for %d nodes\n",
 		 map, nr_node_ids);
@@ -240,17 +262,23 @@ void __cpuinit numa_set_node(int cpu, in
 {
 	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
 
-	if (cpu_pda(cpu) && node != NUMA_NO_NODE)
-		cpu_pda(cpu)->nodenumber = node;
-
-	if (cpu_to_node_map)
+	/* early setting, no percpu area yet */
+	if (cpu_to_node_map) {
 		cpu_to_node_map[cpu] = node;
+		return;
+	}
 
-	else if (per_cpu_offset(cpu))
-		per_cpu(x86_cpu_to_node_map, cpu) = node;
+#ifdef CONFIG_DEBUG_PER_CPU_MAPS
+	if (cpu >= nr_cpu_ids || !per_cpu_offset(cpu)) {
+		printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
+		dump_stack();
+		return;
+	}
+#endif
+	per_cpu(x86_cpu_to_node_map, cpu) = node;
 
-	else
-		pr_debug("Setting node for non-present cpu %d\n", cpu);
+	if (node != NUMA_NO_NODE)
+		cpu_pda(cpu)->nodenumber = node;
 }
 
 void __cpuinit numa_clear_node(int cpu)
@@ -267,7 +295,7 @@ void __cpuinit numa_add_cpu(int cpu)
 
 void __cpuinit numa_remove_cpu(int cpu)
 {
-	cpu_clear(cpu, node_to_cpumask_map[cpu_to_node(cpu)]);
+	cpu_clear(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
 }
 
 #else /* CONFIG_DEBUG_PER_CPU_MAPS */
@@ -277,7 +305,7 @@ void __cpuinit numa_remove_cpu(int cpu)
  */
 static void __cpuinit numa_set_cpumask(int cpu, int enable)
 {
-	int node = cpu_to_node(cpu);
+	int node = early_cpu_to_node(cpu);
 	cpumask_t *mask;
 	char buf[64];
 

-- 

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 2/4] x86_64: Base percpu variables at zero
  2008-07-25 21:11 [PATCH 0/4] x86_64: Optimize percpu accesses Mike Travis
  2008-07-25 21:11 ` [PATCH 1/4] x86_64: Cleanup early setup_percpu references Mike Travis
@ 2008-07-25 21:11 ` Mike Travis
  2008-07-25 21:11 ` [PATCH 3/4] x86_64: Fold pda into per cpu area Mike Travis
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 12+ messages in thread
From: Mike Travis @ 2008-07-25 21:11 UTC (permalink / raw)
  To: Ingo Molnar, Andrew Morton
  Cc: Eric W. Biederman, Hugh Dickins, Jack Steiner,
	Jeremy Fitzhardinge, H. Peter Anvin, linux-kernel,
	Christoph Lameter

[-- Attachment #1: zero_based_only --]
[-- Type: text/plain, Size: 6744 bytes --]

WARNING: There is still a FIXME in this patch (see arch/x86/kernel/acpi/sleep.c)
         [Advice on how to fix it most welcome... ;-)]

  * Make the x86_64 per cpu area start at zero.

  * Relocate the per_cpu(gdt_page) in head_64.S for the boot cpu (0).
    For secondary cpus, do_boot_cpu() sets up the correct gdt_page pointer.

  * Initialize per_cpu_offset to point to static pda in the per_cpu area
    (@ __per_cpu_load).

  * After allocation of the per cpu area for the boot cpu (0), reload the
    gdt page pointer.

Based on linux-2.6.tip/master

Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: Mike Travis <travis@sgi.com>
---
 arch/x86/Kconfig                 |    3 ++
 arch/x86/kernel/acpi/sleep.c     |    9 ++++++++
 arch/x86/kernel/head_64.S        |   26 ++++++++++++++++++++++--
 arch/x86/kernel/setup_percpu.c   |   42 ++++++++++++++++++++++++++++++++-------
 arch/x86/kernel/vmlinux_64.lds.S |    1 
 5 files changed, 72 insertions(+), 9 deletions(-)

--- linux-2.6.tip.orig/arch/x86/Kconfig
+++ linux-2.6.tip/arch/x86/Kconfig
@@ -129,6 +129,9 @@ config HAVE_SETUP_PER_CPU_AREA
 config HAVE_CPUMASK_OF_CPU_MAP
 	def_bool X86_64_SMP
 
+config HAVE_ZERO_BASED_PER_CPU
+	def_bool X86_64_SMP
+
 config ARCH_HIBERNATION_POSSIBLE
 	def_bool y
 	depends on !SMP || !X86_VOYAGER
--- linux-2.6.tip.orig/arch/x86/kernel/acpi/sleep.c
+++ linux-2.6.tip/arch/x86/kernel/acpi/sleep.c
@@ -99,6 +99,15 @@ int acpi_save_state_mem(void)
 #ifdef CONFIG_SMP
 	stack_start.sp = temp_stack + 4096;
 #endif
+	/*
+	 * FIXME: with zero-based percpu variables, the pda and gdt_page
+	 * addresses must be offset by the base of this cpu's percpu area.
+	 * Where/how should we do this?
+	 *
+	 * for secondary cpu startup in smpboot.c:do_boot_cpu() this is done:
+	 *	early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
+	 *	initial_pda = (unsigned long)get_cpu_pda(cpu);
+	 */
 	initial_code = (unsigned long)wakeup_long64;
 	saved_magic = 0x123456789abcdef0;
 #endif /* CONFIG_64BIT */
--- linux-2.6.tip.orig/arch/x86/kernel/head_64.S
+++ linux-2.6.tip/arch/x86/kernel/head_64.S
@@ -12,6 +12,7 @@
 #include <linux/linkage.h>
 #include <linux/threads.h>
 #include <linux/init.h>
+#include <asm/asm-offsets.h>
 #include <asm/desc.h>
 #include <asm/segment.h>
 #include <asm/pgtable.h>
@@ -210,7 +211,27 @@ ENTRY(secondary_startup_64)
 	 * addresses where we're currently running on. We have to do that here
 	 * because in 32bit we couldn't load a 64bit linear address.
 	 */
-	lgdt	early_gdt_descr(%rip)
+
+#ifdef CONFIG_SMP
+	 /*
+	 * For zero-based percpu variables, the base (__per_cpu_load) must
+	 * be added to the offset of per_cpu__gdt_page.  This is only needed
+	 * for the boot cpu but we can't do this prior to secondary_startup_64.
+	 * So we use a NULL gdt adrs to indicate that we are starting up the
+	 * boot cpu and not the secondary cpus.  do_boot_cpu() will fixup
+	 * the gdt adrs for those cpus.
+	 */
+#define PER_CPU_GDT_PAGE	0
+	movq	early_gdt_descr_base(%rip), %rax
+	testq	%rax, %rax
+	jnz	1f
+	movq	$__per_cpu_load, %rax
+	addq	$per_cpu__gdt_page, %rax
+	movq	%rax, early_gdt_descr_base(%rip)
+#else
+#define PER_CPU_GDT_PAGE	per_cpu__gdt_page
+#endif
+1:	lgdt	early_gdt_descr(%rip)
 
 	/* set up data segments. actually 0 would do too */
 	movl $__KERNEL_DS,%eax
@@ -401,7 +422,8 @@ NEXT_PAGE(level2_spare_pgt)
 	.globl early_gdt_descr
 early_gdt_descr:
 	.word	GDT_ENTRIES*8-1
-	.quad   per_cpu__gdt_page
+early_gdt_descr_base:
+	.quad	PER_CPU_GDT_PAGE	# Overwritten for secondary CPUs
 
 ENTRY(phys_base)
 	/* This must match the first entry in level2_kernel_pgt */
--- linux-2.6.tip.orig/arch/x86/kernel/setup_percpu.c
+++ linux-2.6.tip/arch/x86/kernel/setup_percpu.c
@@ -14,6 +14,7 @@
 #include <asm/mpspec.h>
 #include <asm/apicdef.h>
 #include <asm/highmem.h>
+#include <asm/desc.h>
 
 #ifdef CONFIG_DEBUG_PER_CPU_MAPS
 # define DBG(x...) printk(KERN_DEBUG x)
@@ -119,16 +120,21 @@ static void __init setup_cpumask_of_cpu(
 static inline void setup_cpumask_of_cpu(void) { }
 #endif
 
-#ifdef CONFIG_X86_32
 /*
- * Great future not-so-futuristic plan: make i386 and x86_64 do it
- * the same way
+ * Pointers to per cpu areas for each cpu
  */
+#ifdef CONFIG_HAVE_ZERO_BASED_PER_CPU
+
+/* Initialize percpu offset for boot cpu (0) */
+unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
+	[0] = (unsigned long)__per_cpu_load
+};
+#else
 unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
+#endif
 EXPORT_SYMBOL(__per_cpu_offset);
-static inline void setup_cpu_pda_map(void) { }
 
-#elif !defined(CONFIG_SMP)
+#if !defined(CONFIG_SMP) || !defined(CONFIG_X86_64)
 static inline void setup_cpu_pda_map(void) { }
 
 #else /* CONFIG_SMP && CONFIG_X86_64 */
@@ -160,8 +166,10 @@ static void __init setup_cpu_pda_map(voi
 		if (cpu == 0) {
 			/* leave boot cpu pda in place */
 			new_cpu_pda[0] = cpu_pda(0);
+			DBG("cpu %4d pda %p\n", cpu, cpu_pda(0));
 			continue;
 		}
+		DBG("cpu %4d pda %p\n", cpu, pda);
 		new_cpu_pda[cpu] = (struct x8664_pda *)pda;
 		new_cpu_pda[cpu]->in_bootmem = 1;
 		pda += size;
@@ -191,6 +199,8 @@ void __init setup_per_cpu_areas(void)
 	printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
 			  size);
 
+	DBG("PERCPU: __per_cpu_start %p\n", __per_cpu_start);
+
 	for_each_possible_cpu(cpu) {
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 		ptr = alloc_bootmem_pages(size);
@@ -205,10 +215,28 @@ void __init setup_per_cpu_areas(void)
 		else
 			ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
 #endif
+		DBG("PERCPU: cpu %4d %p pda %p %p\n",
+			     cpu, ptr, _cpu_pda[cpu], cpu_pda(cpu));
+
+		/* Initialize each cpu's per_cpu area and save pointer */
+		memcpy(ptr, __per_cpu_load, __per_cpu_size);
 		per_cpu_offset(cpu) = ptr - __per_cpu_start;
-		memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
 
-		DBG("PERCPU: cpu %4d %p\n", cpu, ptr);
+#ifdef CONFIG_X86_64
+		/* save for __my_cpu_offset() */
+		cpu_pda(cpu)->data_offset = (unsigned long)ptr;
+
+		/*
+		 * The boot cpu gdt page must be reloaded as we moved it
+		 * from the static per cpu area to the newly allocated area.
+		 */
+		if (cpu == 0) {
+			struct desc_ptr	gdt_descr = early_gdt_descr;
+
+			gdt_descr.address = (unsigned long)get_cpu_gdt_table(0);
+			native_load_gdt(&gdt_descr);
+		}
+#endif
 	}
 
 	printk(KERN_INFO "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids: %d\n",
--- linux-2.6.tip.orig/arch/x86/kernel/vmlinux_64.lds.S
+++ linux-2.6.tip/arch/x86/kernel/vmlinux_64.lds.S
@@ -16,6 +16,7 @@ jiffies_64 = jiffies;
 _proxy_pda = 1;
 PHDRS {
 	text PT_LOAD FLAGS(5);	/* R_E */
+	percpu PT_LOAD FLAGS(7);	/* RWE */
 	data PT_LOAD FLAGS(7);	/* RWE */
 	user PT_LOAD FLAGS(7);	/* RWE */
 	data.init PT_LOAD FLAGS(7);	/* RWE */

-- 

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 3/4] x86_64: Fold pda into per cpu area
  2008-07-25 21:11 [PATCH 0/4] x86_64: Optimize percpu accesses Mike Travis
  2008-07-25 21:11 ` [PATCH 1/4] x86_64: Cleanup early setup_percpu references Mike Travis
  2008-07-25 21:11 ` [PATCH 2/4] x86_64: Base percpu variables at zero Mike Travis
@ 2008-07-25 21:11 ` Mike Travis
  2008-07-25 21:11 ` [PATCH 4/4] x86_64: Reference zero-based percpu variables offset from gs Mike Travis
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 12+ messages in thread
From: Mike Travis @ 2008-07-25 21:11 UTC (permalink / raw)
  To: Ingo Molnar, Andrew Morton
  Cc: Eric W. Biederman, Hugh Dickins, Jack Steiner,
	Jeremy Fitzhardinge, H. Peter Anvin, linux-kernel,
	Christoph Lameter

[-- Attachment #1: fold_pda_into_percpu --]
[-- Type: text/plain, Size: 13594 bytes --]

WARNING: there are two FIXME's in arch/x86/xen/enlighten.c
	 and arch/x86/xen/smp.c that I'm not sure how to handle...?

  * Declare the pda as a per cpu variable.

  * Relocate the initial pda in head_64.S for the boot cpu (0).
    For secondary cpus, do_boot_cpu() sets up the correct initial pda.

Based on linux-2.6.tip/master

Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: Mike Travis <travis@sgi.com>
---
 arch/x86/kernel/cpu/common_64.c |    4 -
 arch/x86/kernel/head64.c        |   29 +-----------
 arch/x86/kernel/head_64.S       |   19 ++++++--
 arch/x86/kernel/setup_percpu.c  |   93 +++++++++++-----------------------------
 arch/x86/kernel/smpboot.c       |   53 ----------------------
 arch/x86/xen/enlighten.c        |   10 ++++
 arch/x86/xen/smp.c              |   11 +---
 include/asm-x86/desc.h          |    5 ++
 include/asm-x86/pda.h           |    3 -
 include/asm-x86/percpu.h        |   13 -----
 include/asm-x86/setup.h         |    1 
 include/asm-x86/smp.h           |    2 
 include/asm-x86/trampoline.h    |    1 
 13 files changed, 72 insertions(+), 172 deletions(-)

--- linux-2.6.tip.orig/arch/x86/kernel/cpu/common_64.c
+++ linux-2.6.tip/arch/x86/kernel/cpu/common_64.c
@@ -418,8 +418,8 @@ __setup("clearcpuid=", setup_disablecpui
 
 cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
 
-struct x8664_pda **_cpu_pda __read_mostly;
-EXPORT_SYMBOL(_cpu_pda);
+DEFINE_PER_CPU_FIRST(struct x8664_pda, pda);
+EXPORT_PER_CPU_SYMBOL(pda);
 
 struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
 
--- linux-2.6.tip.orig/arch/x86/kernel/head64.c
+++ linux-2.6.tip/arch/x86/kernel/head64.c
@@ -25,27 +25,6 @@
 #include <asm/e820.h>
 #include <asm/bios_ebda.h>
 
-/* boot cpu pda */
-static struct x8664_pda _boot_cpu_pda __read_mostly;
-
-#ifdef CONFIG_SMP
-/*
- * We install an empty cpu_pda pointer table to indicate to early users
- * (numa_set_node) that the cpu_pda pointer table for cpus other than
- * the boot cpu is not yet setup.
- */
-static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
-#else
-static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
-#endif
-
-void __init x86_64_init_pda(void)
-{
-	_cpu_pda = __cpu_pda;
-	cpu_pda(0) = &_boot_cpu_pda;
-	pda_init(0);
-}
-
 static void __init zap_identity_mappings(void)
 {
 	pgd_t *pgd = pgd_offset_k(0UL);
@@ -98,6 +77,10 @@ void __init x86_64_start_kernel(char * r
 	/* Cleanup the over mapped high alias */
 	cleanup_highmap();
 
+	/* Initialize boot cpu_pda data */
+	/* (See head_64.S for earlier pda/gdt initialization) */
+	pda_init(0);
+
 	for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
 #ifdef CONFIG_EARLY_PRINTK
 		set_intr_gate(i, &early_idt_handlers[i]);
@@ -109,10 +92,6 @@ void __init x86_64_start_kernel(char * r
 
 	early_printk("Kernel alive\n");
 
-	x86_64_init_pda();
-
-	early_printk("Kernel really alive\n");
-
 	x86_64_start_reservations(real_mode_data);
 }
 
--- linux-2.6.tip.orig/arch/x86/kernel/head_64.S
+++ linux-2.6.tip/arch/x86/kernel/head_64.S
@@ -248,14 +248,21 @@ ENTRY(secondary_startup_64)
 	movl %eax,%gs
 
 	/* 
-	 * Setup up a dummy PDA. this is just for some early bootup code
-	 * that does in_interrupt() 
+	 * Set up the real PDA.
+	 *
+	 * For SMP, the boot cpu (0) uses the static pda which is the first
+	 * element in the percpu area (@__per_cpu_load).  This pda is moved
+	 * to the real percpu area once that is allocated.  Secondary cpus
+	 * will use the initial_pda value setup in do_boot_cpu().
 	 */ 
 	movl	$MSR_GS_BASE,%ecx
-	movq	$empty_zero_page,%rax
+	movq	initial_pda(%rip), %rax
 	movq    %rax,%rdx
 	shrq	$32,%rdx
 	wrmsr	
+#ifdef CONFIG_SMP
+	movq	%rax, %gs:pda_data_offset
+#endif
 
 	/* esi is pointer to real mode structure with interesting info.
 	   pass it to C */
@@ -278,6 +285,12 @@ ENTRY(secondary_startup_64)
 	.align	8
 	ENTRY(initial_code)
 	.quad	x86_64_start_kernel
+	ENTRY(initial_pda)
+#ifdef CONFIG_SMP
+	.quad	__per_cpu_load		# Overwritten for secondary CPUs
+#else
+	.quad	per_cpu__pda
+#endif
 	__FINITDATA
 
 	ENTRY(stack_start)
--- linux-2.6.tip.orig/arch/x86/kernel/setup_percpu.c
+++ linux-2.6.tip/arch/x86/kernel/setup_percpu.c
@@ -134,56 +134,8 @@ unsigned long __per_cpu_offset[NR_CPUS] 
 #endif
 EXPORT_SYMBOL(__per_cpu_offset);
 
-#if !defined(CONFIG_SMP) || !defined(CONFIG_X86_64)
-static inline void setup_cpu_pda_map(void) { }
-
-#else /* CONFIG_SMP && CONFIG_X86_64 */
-
-/*
- * Allocate cpu_pda pointer table and array via alloc_bootmem.
- */
-static void __init setup_cpu_pda_map(void)
-{
-	char *pda;
-	struct x8664_pda **new_cpu_pda;
-	unsigned long size;
-	int cpu;
-
-	size = roundup(sizeof(struct x8664_pda), cache_line_size());
-
-	/* allocate cpu_pda array and pointer table */
-	{
-		unsigned long tsize = nr_cpu_ids * sizeof(void *);
-		unsigned long asize = size * (nr_cpu_ids - 1);
-
-		tsize = roundup(tsize, cache_line_size());
-		new_cpu_pda = alloc_bootmem(tsize + asize);
-		pda = (char *)new_cpu_pda + tsize;
-	}
-
-	/* initialize pointer table to static pda's */
-	for_each_possible_cpu(cpu) {
-		if (cpu == 0) {
-			/* leave boot cpu pda in place */
-			new_cpu_pda[0] = cpu_pda(0);
-			DBG("cpu %4d pda %p\n", cpu, cpu_pda(0));
-			continue;
-		}
-		DBG("cpu %4d pda %p\n", cpu, pda);
-		new_cpu_pda[cpu] = (struct x8664_pda *)pda;
-		new_cpu_pda[cpu]->in_bootmem = 1;
-		pda += size;
-	}
-
-	/* point to new pointer table */
-	_cpu_pda = new_cpu_pda;
-}
-#endif
-
 /*
- * Great future plan:
- * Declare PDA itself and support (irqstack,tss,pgd) as per cpu data.
- * Always point %gs to its beginning
+ * Allocate and initialize the per cpu areas which include the PDAs.
  */
 void __init setup_per_cpu_areas(void)
 {
@@ -191,16 +143,11 @@ void __init setup_per_cpu_areas(void)
 	char *ptr;
 	int cpu;
 
-	/* Setup cpu_pda map */
-	setup_cpu_pda_map();
-
 	/* Copy section for each CPU (we discard the original) */
 	size = PERCPU_ENOUGH_ROOM;
 	printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
 			  size);
 
-	DBG("PERCPU: __per_cpu_start %p\n", __per_cpu_start);
-
 	for_each_possible_cpu(cpu) {
 #ifndef CONFIG_NEED_MULTIPLE_NODES
 		ptr = alloc_bootmem_pages(size);
@@ -215,26 +162,38 @@ void __init setup_per_cpu_areas(void)
 		else
 			ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
 #endif
-		DBG("PERCPU: cpu %4d %p pda %p %p\n",
-			     cpu, ptr, _cpu_pda[cpu], cpu_pda(cpu));
-
 		/* Initialize each cpu's per_cpu area and save pointer */
 		memcpy(ptr, __per_cpu_load, __per_cpu_size);
 		per_cpu_offset(cpu) = ptr - __per_cpu_start;
 
-#ifdef CONFIG_X86_64
-		/* save for __my_cpu_offset() */
-		cpu_pda(cpu)->data_offset = (unsigned long)ptr;
+		DBG("PERCPU: cpu %4d %p\n", cpu, ptr);
 
+#ifdef CONFIG_X86_64
 		/*
-		 * The boot cpu gdt page must be reloaded as we moved it
-		 * from the static per cpu area to the newly allocated area.
+		 * Note the boot cpu (0) has been using the static per_cpu load
+		 * area for its pda.  We need to zero out the pdas for the
+		 * other cpus that are coming online.
+		 *
+		 * Additionally, for the boot cpu the gdt page must be reloaded
+		 * as we moved it from the static per cpu area to the newly
+		 * allocated area.
 		 */
-		if (cpu == 0) {
-			struct desc_ptr	gdt_descr = early_gdt_descr;
-
-			gdt_descr.address = (unsigned long)get_cpu_gdt_table(0);
-			native_load_gdt(&gdt_descr);
+		{
+			/* We rely on the fact that pda is the first element */
+			struct x8664_pda *pda = (struct x8664_pda *)ptr;
+
+			if (cpu) {
+				memset(pda, 0, sizeof(*pda));
+				pda->data_offset = (unsigned long)ptr;
+			} else {
+				struct desc_ptr	gdt_descr = early_gdt_descr;
+
+				pda->data_offset = (unsigned long)ptr;
+				gdt_descr.address =
+					(unsigned long)get_cpu_gdt_table(0);
+				native_load_gdt(&gdt_descr);
+				pda_init(0);
+			}
 		}
 #endif
 	}
--- linux-2.6.tip.orig/arch/x86/kernel/smpboot.c
+++ linux-2.6.tip/arch/x86/kernel/smpboot.c
@@ -744,45 +744,6 @@ static void __cpuinit do_fork_idle(struc
 	complete(&c_idle->done);
 }
 
-#ifdef CONFIG_X86_64
-/*
- * Allocate node local memory for the AP pda.
- *
- * Must be called after the _cpu_pda pointer table is initialized.
- */
-int __cpuinit get_local_pda(int cpu)
-{
-	struct x8664_pda *oldpda, *newpda;
-	unsigned long size = sizeof(struct x8664_pda);
-	int node = cpu_to_node(cpu);
-
-	if (cpu_pda(cpu) && !cpu_pda(cpu)->in_bootmem)
-		return 0;
-
-	oldpda = cpu_pda(cpu);
-	newpda = kmalloc_node(size, GFP_ATOMIC, node);
-	if (!newpda) {
-		printk(KERN_ERR "Could not allocate node local PDA "
-			"for CPU %d on node %d\n", cpu, node);
-
-		if (oldpda)
-			return 0;	/* have a usable pda */
-		else
-			return -1;
-	}
-
-	if (oldpda) {
-		memcpy(newpda, oldpda, size);
-		if (!after_bootmem)
-			free_bootmem((unsigned long)oldpda, size);
-	}
-
-	newpda->in_bootmem = 0;
-	cpu_pda(cpu) = newpda;
-	return 0;
-}
-#endif /* CONFIG_X86_64 */
-
 static int __cpuinit do_boot_cpu(int apicid, int cpu)
 /*
  * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
@@ -800,16 +761,6 @@ static int __cpuinit do_boot_cpu(int api
 	};
 	INIT_WORK(&c_idle.work, do_fork_idle);
 
-#ifdef CONFIG_X86_64
-	/* Allocate node local memory for AP pdas */
-	if (cpu > 0) {
-		boot_error = get_local_pda(cpu);
-		if (boot_error)
-			goto restore_state;
-			/* if can't get pda memory, can't start cpu */
-	}
-#endif
-
 	alternatives_smp_switch(1);
 
 	c_idle.idle = get_idle_for_cpu(cpu);
@@ -847,6 +798,7 @@ do_rest:
 #else
 	cpu_pda(cpu)->pcurrent = c_idle.idle;
 	clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
+	initial_pda = (unsigned long)get_cpu_pda(cpu);
 #endif
 	early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
 	initial_code = (unsigned long)start_secondary;
@@ -921,9 +873,6 @@ do_rest:
 				inquire_remote_apic(apicid);
 		}
 	}
-#ifdef CONFIG_X86_64
-restore_state:
-#endif
 	if (boot_error) {
 		/* Try to put things back the way they were before ... */
 		numa_remove_cpu(cpu); /* was set by numa_add_cpu */
--- linux-2.6.tip.orig/arch/x86/xen/enlighten.c
+++ linux-2.6.tip/arch/x86/xen/enlighten.c
@@ -1748,8 +1748,18 @@ asmlinkage void __init xen_start_kernel(
 #ifdef CONFIG_X86_64
 	/* Disable until direct per-cpu data access. */
 	have_vcpu_info_placement = 0;
+#if 0
+	/*
+	 * FIXME: is the above still true?
+	 * Also, x86_64_init_pda() has been removed...
+	 *   should anything replace it?
+	 *   (The offset for cpu_pda(0) is statically initialized
+	 *   to __per_cpu_load, while the remaining pda's come online
+	 *   in setup_per_cpu_areas().)
+	 */
 	x86_64_init_pda();
 #endif
+#endif
 
 	xen_smp_init();
 
--- linux-2.6.tip.orig/arch/x86/xen/smp.c
+++ linux-2.6.tip/arch/x86/xen/smp.c
@@ -285,13 +285,10 @@ static int __cpuinit xen_cpu_up(unsigned
 #endif
 
 #ifdef CONFIG_X86_64
-	/* Allocate node local memory for AP pdas */
-	WARN_ON(cpu == 0);
-	if (cpu > 0) {
-		rc = get_local_pda(cpu);
-		if (rc)
-			return rc;
-	}
+	/*
+	 * FIXME: I don't believe that calling get_local_pda() is
+	 * required any more...?
+	 */
 #endif
 
 #ifdef CONFIG_X86_32
--- linux-2.6.tip.orig/include/asm-x86/desc.h
+++ linux-2.6.tip/include/asm-x86/desc.h
@@ -41,6 +41,11 @@ static inline struct desc_struct *get_cp
 
 #ifdef CONFIG_X86_64
 
+static inline struct x8664_pda *get_cpu_pda(unsigned int cpu)
+{
+	return &per_cpu(pda, cpu);
+}
+
 static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
 			     unsigned dpl, unsigned ist, unsigned seg)
 {
--- linux-2.6.tip.orig/include/asm-x86/pda.h
+++ linux-2.6.tip/include/asm-x86/pda.h
@@ -37,10 +37,9 @@ struct x8664_pda {
 	unsigned irq_spurious_count;
 } ____cacheline_aligned_in_smp;
 
-extern struct x8664_pda **_cpu_pda;
 extern void pda_init(int);
 
-#define cpu_pda(i) (_cpu_pda[i])
+#define cpu_pda(cpu) (&per_cpu(pda, cpu))
 
 /*
  * There is no fast way to get the base address of the PDA, all the accesses
--- linux-2.6.tip.orig/include/asm-x86/percpu.h
+++ linux-2.6.tip/include/asm-x86/percpu.h
@@ -3,20 +3,11 @@
 
 #ifdef CONFIG_X86_64
 #include <linux/compiler.h>
-
-/* Same as asm-generic/percpu.h, except that we store the per cpu offset
-   in the PDA. Longer term the PDA and every per cpu variable
-   should be just put into a single section and referenced directly
-   from %gs */
-
-#ifdef CONFIG_SMP
 #include <asm/pda.h>
 
-#define __per_cpu_offset(cpu) (cpu_pda(cpu)->data_offset)
+/* Same as asm-generic/percpu.h */
+#ifdef CONFIG_SMP
 #define __my_cpu_offset read_pda(data_offset)
-
-#define per_cpu_offset(x) (__per_cpu_offset(x))
-
 #endif
 #include <asm-generic/percpu.h>
 
--- linux-2.6.tip.orig/include/asm-x86/setup.h
+++ linux-2.6.tip/include/asm-x86/setup.h
@@ -92,7 +92,6 @@ extern unsigned long init_pg_tables_star
 extern unsigned long init_pg_tables_end;
 
 #else
-void __init x86_64_init_pda(void);
 void __init x86_64_start_kernel(char *real_mode);
 void __init x86_64_start_reservations(char *real_mode_data);
 
--- linux-2.6.tip.orig/include/asm-x86/smp.h
+++ linux-2.6.tip/include/asm-x86/smp.h
@@ -25,8 +25,6 @@ extern cpumask_t cpu_callin_map;
 extern void (*mtrr_hook)(void);
 extern void zap_low_mappings(void);
 
-extern int __cpuinit get_local_pda(int cpu);
-
 extern int smp_num_siblings;
 extern unsigned int num_processors;
 extern cpumask_t cpu_initialized;
--- linux-2.6.tip.orig/include/asm-x86/trampoline.h
+++ linux-2.6.tip/include/asm-x86/trampoline.h
@@ -12,6 +12,7 @@ extern unsigned char *trampoline_base;
 
 extern unsigned long init_rsp;
 extern unsigned long initial_code;
+extern unsigned long initial_pda;
 
 #define TRAMPOLINE_BASE 0x6000
 extern unsigned long setup_trampoline(void);

-- 

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH 4/4] x86_64: Reference zero-based percpu variables offset from gs
  2008-07-25 21:11 [PATCH 0/4] x86_64: Optimize percpu accesses Mike Travis
                   ` (2 preceding siblings ...)
  2008-07-25 21:11 ` [PATCH 3/4] x86_64: Fold pda into per cpu area Mike Travis
@ 2008-07-25 21:11 ` Mike Travis
  2008-07-25 23:26 ` [PATCH 0/4] x86_64: Optimize percpu accesses Jeremy Fitzhardinge
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 12+ messages in thread
From: Mike Travis @ 2008-07-25 21:11 UTC (permalink / raw)
  To: Ingo Molnar, Andrew Morton
  Cc: Eric W. Biederman, Hugh Dickins, Jack Steiner,
	Jeremy Fitzhardinge, H. Peter Anvin, linux-kernel,
	Christoph Lameter

[-- Attachment #1: zero_based_use_gs --]
[-- Type: text/plain, Size: 3411 bytes --]

  * Now that %gs is pointing to the pda, it will then also point to the
    per cpu variables and the __get_cpu_var() and __put_cpu_var() macros
    can use:

        %gs:[&per_cpu_xxxx - __per_cpu_start]

    ... and since __per_cpu_start == 0 then:

        %gs:&per_cpu_var(xxx)
	
    becomes the optimized effective address.

    Since this is now a single instruction, we can remove the x86_64
    non-preemptible versions of x86_read_percpu() and x86_write_percpu().

  * Other cleanups in include/asm-x86/percpu.h

Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: Mike Travis <travis@sgi.com>
---
 include/asm-x86/percpu.h |   62 +++++++++--------------------------------------
 1 file changed, 13 insertions(+), 49 deletions(-)

--- linux-2.6.tip.orig/include/asm-x86/percpu.h
+++ linux-2.6.tip/include/asm-x86/percpu.h
@@ -5,41 +5,19 @@
 #include <linux/compiler.h>
 #include <asm/pda.h>
 
-/* Same as asm-generic/percpu.h */
+/* Same as asm-generic/percpu.h, except we use %gs as a segment offset. */
 #ifdef CONFIG_SMP
 #define __my_cpu_offset read_pda(data_offset)
+#define __percpu_seg "%%gs:"
+#else
+#define __percpu_seg ""
 #endif
+
 #include <asm-generic/percpu.h>
 
 DECLARE_PER_CPU(struct x8664_pda, pda);
 
-/*
- * These are supposed to be implemented as a single instruction which
- * operates on the per-cpu data base segment.  x86-64 doesn't have
- * that yet, so this is a fairly inefficient workaround for the
- * meantime.  The single instruction is atomic with respect to
- * preemption and interrupts, so we need to explicitly disable
- * interrupts here to achieve the same effect.  However, because it
- * can be used from within interrupt-disable/enable, we can't actually
- * disable interrupts; disabling preemption is enough.
- */
-#define x86_read_percpu(var)						\
-	({								\
-		typeof(per_cpu_var(var)) __tmp;				\
-		preempt_disable();					\
-		__tmp = __get_cpu_var(var);				\
-		preempt_enable();					\
-		__tmp;							\
-	})
-
-#define x86_write_percpu(var, val)					\
-	do {								\
-		preempt_disable();					\
-		__get_cpu_var(var) = (val);				\
-		preempt_enable();					\
-	} while(0)
-
-#else /* CONFIG_X86_64 */
+#else /* !CONFIG_X86_64 */
 
 #ifdef __ASSEMBLY__
 
@@ -68,36 +46,23 @@ DECLARE_PER_CPU(struct x8664_pda, pda);
 
 #else /* ...!ASSEMBLY */
 
-/*
- * PER_CPU finds an address of a per-cpu variable.
- *
- * Args:
- *    var - variable name
- *    cpu - 32bit register containing the current CPU number
- *
- * The resulting address is stored in the "cpu" argument.
- *
- * Example:
- *    PER_CPU(cpu_gdt_descr, %ebx)
- */
 #ifdef CONFIG_SMP
-
 #define __my_cpu_offset x86_read_percpu(this_cpu_off)
-
-/* fs segment starts at (positive) offset == __per_cpu_offset[cpu] */
 #define __percpu_seg "%%fs:"
-
-#else  /* !SMP */
-
+#else
 #define __percpu_seg ""
-
-#endif	/* SMP */
+#endif
 
 #include <asm-generic/percpu.h>
 
 /* We can use this directly for local CPU (faster). */
 DECLARE_PER_CPU(unsigned long, this_cpu_off);
 
+#endif /* __ASSEMBLY__ */
+#endif /* !CONFIG_X86_64 */
+
+#ifndef __ASSEMBLY__
+
 /* For arch-specific code, we can use direct single-insn ops (they
  * don't give an lvalue though). */
 extern void __bad_percpu_size(void);
@@ -232,7 +197,6 @@ do {							\
 				percpu_cmpxchg_op(per_cpu_var(var), old, new)
 
 #endif /* !__ASSEMBLY__ */
-#endif /* !CONFIG_X86_64 */
 
 #ifdef CONFIG_SMP
 

-- 

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 0/4] x86_64: Optimize percpu accesses
  2008-07-25 21:11 [PATCH 0/4] x86_64: Optimize percpu accesses Mike Travis
                   ` (3 preceding siblings ...)
  2008-07-25 21:11 ` [PATCH 4/4] x86_64: Reference zero-based percpu variables offset from gs Mike Travis
@ 2008-07-25 23:26 ` Jeremy Fitzhardinge
  2008-07-26  0:27   ` Mike Travis
  2008-07-26 12:38 ` Ingo Molnar
  2008-07-28 15:52 ` [crash] " Ingo Molnar
  6 siblings, 1 reply; 12+ messages in thread
From: Jeremy Fitzhardinge @ 2008-07-25 23:26 UTC (permalink / raw)
  To: Mike Travis
  Cc: Ingo Molnar, Andrew Morton, Eric W. Biederman, Hugh Dickins,
	Jack Steiner, H. Peter Anvin, linux-kernel

Mike Travis wrote:
> This patchset provides the following:
>
>   * x86_64: Cleanup setup_percpu by fixing some minor potential
>     problems as well as add some debugging aids.
>
>   * x86_64: Rebase per cpu variables to zero
>
>     Rebase per cpu variables to zero in preparation for the following
>     patch to fold the pda into the per cpu area.
>
>   * x86_64: Fold pda into per cpu area
>
>     Declare the pda as a per cpu variable. This will allow the per cpu
>     variables to be accessible on the x86_64 using %gs as the base of
>     the percpu areas for each cpu:
>
> 	%gs:per_cpu_xxxx
>
>   * x86_64: Reference zero-based percpu variables offset from gs
>
>     Actually implement the above operation for __get_cpu_var() and
>     __put_cpu_var().  Since this is now a single instruction, we
>     can remove the non-preemptible versions of x86_read_percpu()
>     and x86_write_percpu().
>   

No, I think you've misunderstood these calls.

get_cpu_var(x) evaluates to an lvalue of this cpu's 'x'.  It disables 
preemption, in the same manner as get_cpu().

put_cpu_var(x) does nothing more than re-enable preemption, to pair with 
get_cpu_var().

__get_cpu_var(x) is the same as get_cpu_var, but it assumes that 
preemption is already disabled.  There is no __put_cpu_var().

The important point is that an expression like "__get_cpu_var(x) = foo" 
does not evaluate to a single instruction, and is not preempt or 
interrupt -atomic.  That's the reason x86_X_percpu() exist, since 
they're a single instruction in an asm.  However, with %gs: based 
addressing they can be easily unified.

    J

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 0/4] x86_64: Optimize percpu accesses
  2008-07-25 23:26 ` [PATCH 0/4] x86_64: Optimize percpu accesses Jeremy Fitzhardinge
@ 2008-07-26  0:27   ` Mike Travis
  2008-07-26  0:30     ` Jeremy Fitzhardinge
  0 siblings, 1 reply; 12+ messages in thread
From: Mike Travis @ 2008-07-26  0:27 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Ingo Molnar, Andrew Morton, Eric W. Biederman, Hugh Dickins,
	Jack Steiner, H. Peter Anvin, linux-kernel

Jeremy Fitzhardinge wrote:
> Mike Travis wrote:
>> This patchset provides the following:
>>
>>   * x86_64: Cleanup setup_percpu by fixing some minor potential
>>     problems as well as add some debugging aids.
>>
>>   * x86_64: Rebase per cpu variables to zero
>>
>>     Rebase per cpu variables to zero in preparation for the following
>>     patch to fold the pda into the per cpu area.
>>
>>   * x86_64: Fold pda into per cpu area
>>
>>     Declare the pda as a per cpu variable. This will allow the per cpu
>>     variables to be accessible on the x86_64 using %gs as the base of
>>     the percpu areas for each cpu:
>>
>>     %gs:per_cpu_xxxx
>>
>>   * x86_64: Reference zero-based percpu variables offset from gs
>>
>>     Actually implement the above operation for __get_cpu_var() and
>>     __put_cpu_var().  Since this is now a single instruction, we
>>     can remove the non-preemptible versions of x86_read_percpu()
>>     and x86_write_percpu().
>>   
> 
> No, I think you've misunderstood these calls.
> 
> get_cpu_var(x) evaluates to an lvalue of this cpu's 'x'.  It disables
> preemption, in the same manner as get_cpu().
> 
> put_cpu_var(x) does nothing more than re-enable preemption, to pair with
> get_cpu_var().
> 
> __get_cpu_var(x) is the same as get_cpu_var, but it assumes that
> preemption is already disabled.  There is no __put_cpu_var().
> 
> The important point is that an expression like "__get_cpu_var(x) = foo"
> does not evaluate to a single instruction, and is not preempt or
> interrupt -atomic.  That's the reason x86_X_percpu() exist, since
> they're a single instruction in an asm.  However, with %gs: based
> addressing they can be easily unified.
> 
>    J

Yes, you're right, I wrote that quickly without really reading it back.
My point is that now that x86_read_percpu() and x86_write_percpu() do
evaluate to a single instruction (by definition atomic), then it doesn't
need to be surrounded by the preempt_disable()/preempt_enable() calls.

It appears as if I'm implying that's the case for get/put_cpu_var().

Thanks,
Mike



^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 0/4] x86_64: Optimize percpu accesses
  2008-07-26  0:27   ` Mike Travis
@ 2008-07-26  0:30     ` Jeremy Fitzhardinge
  0 siblings, 0 replies; 12+ messages in thread
From: Jeremy Fitzhardinge @ 2008-07-26  0:30 UTC (permalink / raw)
  To: Mike Travis
  Cc: Ingo Molnar, Andrew Morton, Eric W. Biederman, Hugh Dickins,
	Jack Steiner, H. Peter Anvin, linux-kernel

Mike Travis wrote:
> Yes, you're right, I wrote that quickly without really reading it back.
> My point is that now that x86_read_percpu() and x86_write_percpu() do
> evaluate to a single instruction (by definition atomic), then it doesn't
> need to be surrounded by the preempt_disable()/preempt_enable() calls.
>   

Yep, correct.

> It appears as if I'm implying that's the case for get/put_cpu_var().
>   

Right.

    J

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 0/4] x86_64: Optimize percpu accesses
  2008-07-25 21:11 [PATCH 0/4] x86_64: Optimize percpu accesses Mike Travis
                   ` (4 preceding siblings ...)
  2008-07-25 23:26 ` [PATCH 0/4] x86_64: Optimize percpu accesses Jeremy Fitzhardinge
@ 2008-07-26 12:38 ` Ingo Molnar
  2008-07-28 18:33   ` Mike Travis
  2008-07-28 15:52 ` [crash] " Ingo Molnar
  6 siblings, 1 reply; 12+ messages in thread
From: Ingo Molnar @ 2008-07-26 12:38 UTC (permalink / raw)
  To: Mike Travis
  Cc: Andrew Morton, Eric W. Biederman, Hugh Dickins, Jack Steiner,
	Jeremy Fitzhardinge, H. Peter Anvin, linux-kernel


* Mike Travis <travis@sgi.com> wrote:

> This patchset provides the following:
> 
>   * x86_64: Cleanup setup_percpu by fixing some minor potential
>     problems as well as add some debugging aids.
> 
>   * x86_64: Rebase per cpu variables to zero
> 
>     Rebase per cpu variables to zero in preparation for the following
>     patch to fold the pda into the per cpu area.
> 
>   * x86_64: Fold pda into per cpu area
> 
>     Declare the pda as a per cpu variable. This will allow the per cpu
>     variables to be accessible on the x86_64 using %gs as the base of
>     the percpu areas for each cpu:
> 
> 	%gs:per_cpu_xxxx
> 
>   * x86_64: Reference zero-based percpu variables offset from gs
> 
>     Actually implement the above operation for __get_cpu_var() and
>     __put_cpu_var().  Since this is now a single instruction, we
>     can remove the non-preemptible versions of x86_read_percpu()
>     and x86_write_percpu().
> 
> Note that the following changes are NOT in this patchset as the plan now
> seems to be that the common (to x86) variables that are in the pda should
> be made individual per cpu variables, leaving only the stack canary in place.
> 
>   * x86_64: Replace cpu_pda ops with percpu ops
>   * x86_64: Replace xxx_pda() operations with x86_xxx_percpu().
>   * x86_64: Remove xxx_pda() operations
>   * x86_64: Remove cpu_pda() macro
> 
> Based on linux-2.6.tip/master.

i've added these patches to tip/x86/percpu-zerobased, but not yet merged 
into tip/master. I've made it -git based - does this patchset have any 
functional dependencies on other patches?

	Ingo

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [crash] Re: [PATCH 0/4] x86_64: Optimize percpu accesses
  2008-07-25 21:11 [PATCH 0/4] x86_64: Optimize percpu accesses Mike Travis
                   ` (5 preceding siblings ...)
  2008-07-26 12:38 ` Ingo Molnar
@ 2008-07-28 15:52 ` Ingo Molnar
  2008-07-28 19:39   ` Mike Travis
  6 siblings, 1 reply; 12+ messages in thread
From: Ingo Molnar @ 2008-07-28 15:52 UTC (permalink / raw)
  To: Mike Travis
  Cc: Andrew Morton, Eric W. Biederman, Hugh Dickins, Jack Steiner,
	Jeremy Fitzhardinge, H. Peter Anvin, linux-kernel,
	Thomas Gleixner


ok, i have integrated tip/x86/percpu-zerobased into tip/master briefly, 
but it blew up almost immediately in testing, on two boxes.

one bad config is:

  http://redhat.com/~mingo/misc/config-Mon_Jul_28_17_35_00_CEST_2008.bad

failure pattern: it booted up fine to userspace and seemed functional, but 
then produced a spontaneous reboot while building a kernel, without any 
log entries.

other bad config is:

  http://redhat.com/~mingo/misc/config-Mon_Jul_28_17_30_39_CEST_2008.bad

failure pattern: early crash at:

  PANIC: early exception 0e rip 10:fffffff817dfc1a error 0 cr2 28

which corresponds to:

ffffffff817dfc0f <machine_specific_memory_setup>:
ffffffff817dfc0f:       48 8b 05 aa cf 04 00    mov    315306(%rip),%rax
# ffffffff8182cbc0 <x86_quirks>
ffffffff817dfc16:       55                      push   %rbp
ffffffff817dfc17:       48 89 e5                mov    %rsp,%rbp
ffffffff817dfc1a:       48 8b 40 28             mov    0x28(%rax),%rax  [*]
ffffffff817dfc1e:       48 85 c0                test   %rax,%rax

i.e. RAX was zero.

i've pushed out the tip/tmp.x86/percpu-zerobased.bad branch which shows 
the exact kernel that failed. It was generated by:

 git-checkout tip/master
 git-merge tip/x86/percpu-zerobased

	Ingo

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH 0/4] x86_64: Optimize percpu accesses
  2008-07-26 12:38 ` Ingo Molnar
@ 2008-07-28 18:33   ` Mike Travis
  0 siblings, 0 replies; 12+ messages in thread
From: Mike Travis @ 2008-07-28 18:33 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andrew Morton, Eric W. Biederman, Hugh Dickins, Jack Steiner,
	Jeremy Fitzhardinge, H. Peter Anvin, linux-kernel

Ingo Molnar wrote:
> * Mike Travis <travis@sgi.com> wrote:
> 
>> This patchset provides the following:
>>
>>   * x86_64: Cleanup setup_percpu by fixing some minor potential
>>     problems as well as add some debugging aids.
>>
>>   * x86_64: Rebase per cpu variables to zero
>>
>>     Rebase per cpu variables to zero in preparation for the following
>>     patch to fold the pda into the per cpu area.
>>
>>   * x86_64: Fold pda into per cpu area
>>
>>     Declare the pda as a per cpu variable. This will allow the per cpu
>>     variables to be accessible on the x86_64 using %gs as the base of
>>     the percpu areas for each cpu:
>>
>> 	%gs:per_cpu_xxxx
>>
>>   * x86_64: Reference zero-based percpu variables offset from gs
>>
>>     Actually implement the above operation for __get_cpu_var() and
>>     __put_cpu_var().  Since this is now a single instruction, we
>>     can remove the non-preemptible versions of x86_read_percpu()
>>     and x86_write_percpu().
>>
>> Note that the following changes are NOT in this patchset as the plan now
>> seems to be that the common (to x86) variables that are in the pda should
>> be made individual per cpu variables, leaving only the stack canary in place.
>>
>>   * x86_64: Replace cpu_pda ops with percpu ops
>>   * x86_64: Replace xxx_pda() operations with x86_xxx_percpu().
>>   * x86_64: Remove xxx_pda() operations
>>   * x86_64: Remove cpu_pda() macro
>>
>> Based on linux-2.6.tip/master.
> 
> i've added these patches to tip/x86/percpu-zerobased, but not yet merged 
> into tip/master. I've made it -git based - does this patchset have any 
> functional dependencies on other patches?
> 
> 	Ingo

I think the other patches have been in place for a while.  This was actually
patch 3 of about 20 that finalized with the CPU_ALLOC changes.  In my tree
the 2 prior to this one are:

	b3a0cb456d848e10b2f7b371ba05e44f1384520a
	Subject: Zero based percpu: Infrastructure to rebase the per cpu area to zero
	
	d3794979a8a80c222ce9d016a6dfc4bed36965d0
	Subject: x86: Extend percpu ops to 64 bit

Thanks,
Mike

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [crash] Re: [PATCH 0/4] x86_64: Optimize percpu accesses
  2008-07-28 15:52 ` [crash] " Ingo Molnar
@ 2008-07-28 19:39   ` Mike Travis
  0 siblings, 0 replies; 12+ messages in thread
From: Mike Travis @ 2008-07-28 19:39 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andrew Morton, Eric W. Biederman, Hugh Dickins, Jack Steiner,
	Jeremy Fitzhardinge, H. Peter Anvin, linux-kernel,
	Thomas Gleixner

Ingo Molnar wrote:
> ok, i have integrated tip/x86/percpu-zerobased into tip/master briefly, 
> but it blew up almost immediately in testing, on two boxes.
> 
> one bad config is:
> 
>   http://redhat.com/~mingo/misc/config-Mon_Jul_28_17_35_00_CEST_2008.bad
> 
> failure pattern: it booted up fine to userspace and seemed functional, but 
> then produced a spontaneous reboot while building a kernel, without any 
> log entries.
> 
> other bad config is:
> 
>   http://redhat.com/~mingo/misc/config-Mon_Jul_28_17_30_39_CEST_2008.bad
> 
> failure pattern: early crash at:
> 
>   PANIC: early exception 0e rip 10:fffffff817dfc1a error 0 cr2 28
> 
> which corresponds to:
> 
> ffffffff817dfc0f <machine_specific_memory_setup>:
> ffffffff817dfc0f:       48 8b 05 aa cf 04 00    mov    315306(%rip),%rax
> # ffffffff8182cbc0 <x86_quirks>
> ffffffff817dfc16:       55                      push   %rbp
> ffffffff817dfc17:       48 89 e5                mov    %rsp,%rbp
> ffffffff817dfc1a:       48 8b 40 28             mov    0x28(%rax),%rax  [*]
> ffffffff817dfc1e:       48 85 c0                test   %rax,%rax
> 
> i.e. RAX was zero.
> 
> i've pushed out the tip/tmp.x86/percpu-zerobased.bad branch which shows 
> the exact kernel that failed. It was generated by:
> 
>  git-checkout tip/master
>  git-merge tip/x86/percpu-zerobased
> 
> 	Ingo

Ok, thanks, I'll take a look.  There were some questions that I had
(and I should have RFC'd the patch since there are still questions.)

Mike

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2008-07-28 19:39 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-07-25 21:11 [PATCH 0/4] x86_64: Optimize percpu accesses Mike Travis
2008-07-25 21:11 ` [PATCH 1/4] x86_64: Cleanup early setup_percpu references Mike Travis
2008-07-25 21:11 ` [PATCH 2/4] x86_64: Base percpu variables at zero Mike Travis
2008-07-25 21:11 ` [PATCH 3/4] x86_64: Fold pda into per cpu area Mike Travis
2008-07-25 21:11 ` [PATCH 4/4] x86_64: Reference zero-based percpu variables offset from gs Mike Travis
2008-07-25 23:26 ` [PATCH 0/4] x86_64: Optimize percpu accesses Jeremy Fitzhardinge
2008-07-26  0:27   ` Mike Travis
2008-07-26  0:30     ` Jeremy Fitzhardinge
2008-07-26 12:38 ` Ingo Molnar
2008-07-28 18:33   ` Mike Travis
2008-07-28 15:52 ` [crash] " Ingo Molnar
2008-07-28 19:39   ` Mike Travis

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).