All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/3] x86-64: Convert the PDA to percpu.
@ 2008-12-23 17:15 Brian Gerst
  2008-12-23 17:15 ` [PATCH 2/3] x86-64: Unify x86_*_percpu() functions Brian Gerst
  2008-12-27 10:41 ` [PATCH 1/3] x86-64: Convert the PDA to percpu Ingo Molnar
  0 siblings, 2 replies; 10+ messages in thread
From: Brian Gerst @ 2008-12-23 17:15 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: linux-kernel, Brian Gerst

This patch makes the PDA a normal per-cpu variable, allowing the
removal of the special allocator code.  %gs still points to the
base of the PDA.

Tested on a dual-core AMD64 system.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
---
 arch/x86/include/asm/pda.h     |    3 --
 arch/x86/include/asm/percpu.h  |    3 --
 arch/x86/include/asm/setup.h   |    1 -
 arch/x86/kernel/cpu/common.c   |    6 ++--
 arch/x86/kernel/dumpstack_64.c |    8 ++--
 arch/x86/kernel/head64.c       |   23 +------------
 arch/x86/kernel/irq.c          |    2 +-
 arch/x86/kernel/nmi.c          |    2 +-
 arch/x86/kernel/setup_percpu.c |   70 ++++++++--------------------------------
 arch/x86/kernel/smpboot.c      |   58 +--------------------------------
 arch/x86/xen/enlighten.c       |    2 +-
 arch/x86/xen/smp.c             |   12 +------
 12 files changed, 27 insertions(+), 163 deletions(-)

diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
index 2fbfff8..b008127 100644
--- a/arch/x86/include/asm/pda.h
+++ b/arch/x86/include/asm/pda.h
@@ -39,11 +39,8 @@ struct x8664_pda {
 	unsigned irq_spurious_count;
 } ____cacheline_aligned_in_smp;
 
-extern struct x8664_pda **_cpu_pda;
 extern void pda_init(int);
 
-#define cpu_pda(i) (_cpu_pda[i])
-
 /*
  * There is no fast way to get the base address of the PDA, all the accesses
  * have to mention %fs/%gs.  So it needs to be done this Torvaldian way.
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index ece7205..6f866fd 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -12,11 +12,8 @@
 #ifdef CONFIG_SMP
 #include <asm/pda.h>
 
-#define __per_cpu_offset(cpu) (cpu_pda(cpu)->data_offset)
 #define __my_cpu_offset read_pda(data_offset)
 
-#define per_cpu_offset(x) (__per_cpu_offset(x))
-
 #endif
 #include <asm-generic/percpu.h>
 
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index f12d372..b751aaf 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -93,7 +93,6 @@ extern unsigned long init_pg_tables_start;
 extern unsigned long init_pg_tables_end;
 
 #else
-void __init x86_64_init_pda(void);
 void __init x86_64_start_kernel(char *real_mode);
 void __init x86_64_start_reservations(char *real_mode_data);
 
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index b9c9ea0..7d741ef 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -857,8 +857,8 @@ __setup("clearcpuid=", setup_disablecpuid);
 cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
 
 #ifdef CONFIG_X86_64
-struct x8664_pda **_cpu_pda __read_mostly;
-EXPORT_SYMBOL(_cpu_pda);
+DEFINE_PER_CPU_PAGE_ALIGNED(struct x8664_pda, pda);
+EXPORT_PER_CPU_SYMBOL(pda);
 
 struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
 
@@ -866,7 +866,7 @@ char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
 
 void __cpuinit pda_init(int cpu)
 {
-	struct x8664_pda *pda = cpu_pda(cpu);
+	struct x8664_pda *pda = &per_cpu(pda, cpu);
 
 	/* Setup up data that may be needed in __get_free_pages early */
 	loadsegment(fs, 0);
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 96a5db7..1098b21 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -163,7 +163,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
 		const struct stacktrace_ops *ops, void *data)
 {
 	const unsigned cpu = get_cpu();
-	unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
+	unsigned long *irqstack_end = (unsigned long *)per_cpu(pda, cpu).irqstackptr;
 	unsigned used = 0;
 	struct thread_info *tinfo;
 
@@ -306,9 +306,9 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
 	int i;
 	const int cpu = smp_processor_id();
 	unsigned long *irqstack_end =
-		(unsigned long *) (cpu_pda(cpu)->irqstackptr);
+		(unsigned long *) (per_cpu(pda, cpu).irqstackptr);
 	unsigned long *irqstack =
-		(unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
+		(unsigned long *) (per_cpu(pda, cpu).irqstackptr - IRQSTACKSIZE);
 
 	/*
 	 * debugging aid: "show_stack(NULL, NULL);" prints the
@@ -374,7 +374,7 @@ void show_registers(struct pt_regs *regs)
 	int i;
 	unsigned long sp;
 	const int cpu = smp_processor_id();
-	struct task_struct *cur = cpu_pda(cpu)->pcurrent;
+	struct task_struct *cur = per_cpu(pda, cpu).pcurrent;
 
 	sp = regs->sp;
 	printk("CPU %d ", cpu);
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index d16084f..274e2a9 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -25,27 +25,6 @@
 #include <asm/e820.h>
 #include <asm/bios_ebda.h>
 
-/* boot cpu pda */
-static struct x8664_pda _boot_cpu_pda __read_mostly;
-
-#ifdef CONFIG_SMP
-/*
- * We install an empty cpu_pda pointer table to indicate to early users
- * (numa_set_node) that the cpu_pda pointer table for cpus other than
- * the boot cpu is not yet setup.
- */
-static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
-#else
-static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
-#endif
-
-void __init x86_64_init_pda(void)
-{
-	_cpu_pda = __cpu_pda;
-	cpu_pda(0) = &_boot_cpu_pda;
-	pda_init(0);
-}
-
 static void __init zap_identity_mappings(void)
 {
 	pgd_t *pgd = pgd_offset_k(0UL);
@@ -111,7 +90,7 @@ void __init x86_64_start_kernel(char * real_mode_data)
 	if (console_loglevel == 10)
 		early_printk("Kernel alive\n");
 
-	x86_64_init_pda();
+	pda_init(0);
 
 	x86_64_start_reservations(real_mode_data);
 }
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index d1d4dc5..066e680 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -38,7 +38,7 @@ void ack_bad_irq(unsigned int irq)
 #ifdef CONFIG_X86_32
 # define irq_stats(x)		(&per_cpu(irq_stat, x))
 #else
-# define irq_stats(x)		cpu_pda(x)
+# define irq_stats(x)		(&per_cpu(pda, x))
 #endif
 /*
  * /proc/interrupts printing:
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 2c97f07..4a5bb40 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -64,7 +64,7 @@ static int endflag __initdata;
 static inline unsigned int get_nmi_count(int cpu)
 {
 #ifdef CONFIG_X86_64
-	return cpu_pda(cpu)->__nmi_count;
+	return per_cpu(pda, cpu).__nmi_count;
 #else
 	return nmi_count(cpu);
 #endif
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index ae0c0d3..fb0ccdc 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -80,58 +80,8 @@ static void __init setup_per_cpu_maps(void)
 #endif
 }
 
-#ifdef CONFIG_X86_32
-/*
- * Great future not-so-futuristic plan: make i386 and x86_64 do it
- * the same way
- */
 unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
 EXPORT_SYMBOL(__per_cpu_offset);
-static inline void setup_cpu_pda_map(void) { }
-
-#elif !defined(CONFIG_SMP)
-static inline void setup_cpu_pda_map(void) { }
-
-#else /* CONFIG_SMP && CONFIG_X86_64 */
-
-/*
- * Allocate cpu_pda pointer table and array via alloc_bootmem.
- */
-static void __init setup_cpu_pda_map(void)
-{
-	char *pda;
-	struct x8664_pda **new_cpu_pda;
-	unsigned long size;
-	int cpu;
-
-	size = roundup(sizeof(struct x8664_pda), cache_line_size());
-
-	/* allocate cpu_pda array and pointer table */
-	{
-		unsigned long tsize = nr_cpu_ids * sizeof(void *);
-		unsigned long asize = size * (nr_cpu_ids - 1);
-
-		tsize = roundup(tsize, cache_line_size());
-		new_cpu_pda = alloc_bootmem(tsize + asize);
-		pda = (char *)new_cpu_pda + tsize;
-	}
-
-	/* initialize pointer table to static pda's */
-	for_each_possible_cpu(cpu) {
-		if (cpu == 0) {
-			/* leave boot cpu pda in place */
-			new_cpu_pda[0] = cpu_pda(0);
-			continue;
-		}
-		new_cpu_pda[cpu] = (struct x8664_pda *)pda;
-		new_cpu_pda[cpu]->in_bootmem = 1;
-		pda += size;
-	}
-
-	/* point to new pointer table */
-	_cpu_pda = new_cpu_pda;
-}
-#endif
 
 /*
  * Great future plan:
@@ -145,9 +95,6 @@ void __init setup_per_cpu_areas(void)
 	int cpu;
 	unsigned long align = 1;
 
-	/* Setup cpu_pda map */
-	setup_cpu_pda_map();
-
 	/* Copy section for each CPU (we discard the original) */
 	old_size = PERCPU_ENOUGH_ROOM;
 	align = max_t(unsigned long, PAGE_SIZE, align);
@@ -179,10 +126,21 @@ void __init setup_per_cpu_areas(void)
 					 cpu, node, __pa(ptr));
 		}
 #endif
-		per_cpu_offset(cpu) = ptr - __per_cpu_start;
+		__per_cpu_offset[cpu] = ptr - __per_cpu_start;
 		memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
+#ifdef CONFIG_X86_64
+		if (cpu)
+			memset(&per_cpu(pda, cpu), 0, sizeof(struct x8664_pda));
+		per_cpu(pda, cpu).data_offset = __per_cpu_offset[cpu];
+#endif
 	}
 
+#ifdef CONFIG_X86_64
+	mb();
+	wrmsrl(MSR_GS_BASE, &per_cpu(pda, 0));
+	mb();
+#endif
+
 	printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
 		NR_CPUS, nr_cpu_ids, nr_node_ids);
 
@@ -229,8 +187,8 @@ void __cpuinit numa_set_node(int cpu, int node)
 {
 	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
 
-	if (cpu_pda(cpu) && node != NUMA_NO_NODE)
-		cpu_pda(cpu)->nodenumber = node;
+	if (node != NUMA_NO_NODE)
+		per_cpu(pda, cpu).nodenumber = node;
 
 	if (cpu_to_node_map)
 		cpu_to_node_map[cpu] = node;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index f71f96f..d25b989 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -753,52 +753,6 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
 	complete(&c_idle->done);
 }
 
-#ifdef CONFIG_X86_64
-
-/* __ref because it's safe to call free_bootmem when after_bootmem == 0. */
-static void __ref free_bootmem_pda(struct x8664_pda *oldpda)
-{
-	if (!after_bootmem)
-		free_bootmem((unsigned long)oldpda, sizeof(*oldpda));
-}
-
-/*
- * Allocate node local memory for the AP pda.
- *
- * Must be called after the _cpu_pda pointer table is initialized.
- */
-int __cpuinit get_local_pda(int cpu)
-{
-	struct x8664_pda *oldpda, *newpda;
-	unsigned long size = sizeof(struct x8664_pda);
-	int node = cpu_to_node(cpu);
-
-	if (cpu_pda(cpu) && !cpu_pda(cpu)->in_bootmem)
-		return 0;
-
-	oldpda = cpu_pda(cpu);
-	newpda = kmalloc_node(size, GFP_ATOMIC, node);
-	if (!newpda) {
-		printk(KERN_ERR "Could not allocate node local PDA "
-			"for CPU %d on node %d\n", cpu, node);
-
-		if (oldpda)
-			return 0;	/* have a usable pda */
-		else
-			return -1;
-	}
-
-	if (oldpda) {
-		memcpy(newpda, oldpda, size);
-		free_bootmem_pda(oldpda);
-	}
-
-	newpda->in_bootmem = 0;
-	cpu_pda(cpu) = newpda;
-	return 0;
-}
-#endif /* CONFIG_X86_64 */
-
 static int __cpuinit do_boot_cpu(int apicid, int cpu)
 /*
  * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
@@ -816,16 +770,6 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
 	};
 	INIT_WORK(&c_idle.work, do_fork_idle);
 
-#ifdef CONFIG_X86_64
-	/* Allocate node local memory for AP pdas */
-	if (cpu > 0) {
-		boot_error = get_local_pda(cpu);
-		if (boot_error)
-			goto restore_state;
-			/* if can't get pda memory, can't start cpu */
-	}
-#endif
-
 	alternatives_smp_switch(1);
 
 	c_idle.idle = get_idle_for_cpu(cpu);
@@ -861,7 +805,7 @@ do_rest:
 	/* Stack for startup_32 can be just as for start_secondary onwards */
 	irq_ctx_init(cpu);
 #else
-	cpu_pda(cpu)->pcurrent = c_idle.idle;
+	per_cpu(pda, cpu).pcurrent = c_idle.idle;
 	clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
 #endif
 	early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 5e4686d..0160bb6 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1649,7 +1649,7 @@ asmlinkage void __init xen_start_kernel(void)
 #ifdef CONFIG_X86_64
 	/* Disable until direct per-cpu data access. */
 	have_vcpu_info_placement = 0;
-	x86_64_init_pda();
+	pda_init(0);
 #endif
 
 	xen_smp_init();
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index acd9b67..17823cb 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -280,22 +280,12 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
 	struct task_struct *idle = idle_task(cpu);
 	int rc;
 
-#ifdef CONFIG_X86_64
-	/* Allocate node local memory for AP pdas */
-	WARN_ON(cpu == 0);
-	if (cpu > 0) {
-		rc = get_local_pda(cpu);
-		if (rc)
-			return rc;
-	}
-#endif
-
 #ifdef CONFIG_X86_32
 	init_gdt(cpu);
 	per_cpu(current_task, cpu) = idle;
 	irq_ctx_init(cpu);
 #else
-	cpu_pda(cpu)->pcurrent = idle;
+	per_cpu(pda, cpu).pcurrent = idle;
 	clear_tsk_thread_flag(idle, TIF_FORK);
 #endif
 	xen_setup_timer(cpu);
-- 
1.6.1.rc1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 2/3] x86-64: Unify x86_*_percpu() functions.
  2008-12-23 17:15 [PATCH 1/3] x86-64: Convert the PDA to percpu Brian Gerst
@ 2008-12-23 17:15 ` Brian Gerst
  2008-12-23 17:15   ` [PATCH 3/3] x86-64: Move cpu number from PDA to per-cpu and consolidate with 32-bit Brian Gerst
  2008-12-27 11:03   ` [PATCH 2/3] x86-64: Unify x86_*_percpu() functions Jeremy Fitzhardinge
  2008-12-27 10:41 ` [PATCH 1/3] x86-64: Convert the PDA to percpu Ingo Molnar
  1 sibling, 2 replies; 10+ messages in thread
From: Brian Gerst @ 2008-12-23 17:15 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: linux-kernel, Brian Gerst

Merge the 32-bit and 64-bit versions of these functions.  Unlike 32-bit,
the segment base is the current cpu's PDA instead of the offset from the
original per-cpu area.  This is because GCC hardcodes the stackprotector
canary at %gs:40.  Since the assembler is incapable of relocating against
multiple symbols, the code ends up looking like:

	movq $per_cpu__var, reg
	subq $per_cpu__pda, reg
	movq %gs:(reg), reg

This is still atomic since the offset is a constant (just calculated at
runtime) and not dependent on the cpu number.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
---
 arch/x86/include/asm/percpu.h |   92 +++++++++++++++++-----------------------
 1 files changed, 39 insertions(+), 53 deletions(-)

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 6f866fd..f704243 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -1,54 +1,9 @@
 #ifndef _ASM_X86_PERCPU_H
 #define _ASM_X86_PERCPU_H
 
-#ifdef CONFIG_X86_64
-#include <linux/compiler.h>
-
-/* Same as asm-generic/percpu.h, except that we store the per cpu offset
-   in the PDA. Longer term the PDA and every per cpu variable
-   should be just put into a single section and referenced directly
-   from %gs */
-
-#ifdef CONFIG_SMP
-#include <asm/pda.h>
-
-#define __my_cpu_offset read_pda(data_offset)
-
-#endif
-#include <asm-generic/percpu.h>
-
-DECLARE_PER_CPU(struct x8664_pda, pda);
-
-/*
- * These are supposed to be implemented as a single instruction which
- * operates on the per-cpu data base segment.  x86-64 doesn't have
- * that yet, so this is a fairly inefficient workaround for the
- * meantime.  The single instruction is atomic with respect to
- * preemption and interrupts, so we need to explicitly disable
- * interrupts here to achieve the same effect.  However, because it
- * can be used from within interrupt-disable/enable, we can't actually
- * disable interrupts; disabling preemption is enough.
- */
-#define x86_read_percpu(var)						\
-	({								\
-		typeof(per_cpu_var(var)) __tmp;				\
-		preempt_disable();					\
-		__tmp = __get_cpu_var(var);				\
-		preempt_enable();					\
-		__tmp;							\
-	})
-
-#define x86_write_percpu(var, val)					\
-	do {								\
-		preempt_disable();					\
-		__get_cpu_var(var) = (val);				\
-		preempt_enable();					\
-	} while(0)
-
-#else /* CONFIG_X86_64 */
-
 #ifdef __ASSEMBLY__
 
+#ifdef CONFIG_X86_32
 /*
  * PER_CPU finds an address of a per-cpu variable.
  *
@@ -72,6 +27,8 @@ DECLARE_PER_CPU(struct x8664_pda, pda);
 #define PER_CPU_VAR(var)	per_cpu__##var
 #endif	/* SMP */
 
+#endif	/* X86_32 */
+
 #else /* ...!ASSEMBLY */
 
 /*
@@ -88,19 +45,37 @@ DECLARE_PER_CPU(struct x8664_pda, pda);
  */
 #ifdef CONFIG_SMP
 
+#ifdef CONFIG_X86_32
+
 #define __my_cpu_offset x86_read_percpu(this_cpu_off)
 
 /* fs segment starts at (positive) offset == __per_cpu_offset[cpu] */
 #define __percpu_seg "%%fs:"
+#define __percpu_seg_off(x) (x)
+
+#else
+
+#define __my_cpu_offset read_pda(data_offset)
+
+#define __percpu_seg "%%gs:"
+#define __percpu_seg_off(x) RELOC_HIDE((x), -(unsigned long)&per_cpu__pda)
+
+#endif
 
 #else  /* !SMP */
 
 #define __percpu_seg ""
+#define __percpu_seg_off(x) (x)
 
 #endif	/* SMP */
 
 #include <asm-generic/percpu.h>
 
+#ifdef CONFIG_X86_64
+#include <asm/pda.h>
+DECLARE_PER_CPU(struct x8664_pda, pda);
+#endif
+
 /* We can use this directly for local CPU (faster). */
 DECLARE_PER_CPU(unsigned long, this_cpu_off);
 
@@ -111,6 +86,7 @@ extern void __bad_percpu_size(void);
 #define percpu_to_op(op, var, val)			\
 do {							\
 	typedef typeof(var) T__;			\
+	typeof(var) *var__ = __percpu_seg_off(&var);	\
 	if (0) {					\
 		T__ tmp__;				\
 		tmp__ = (val);				\
@@ -118,17 +94,22 @@ do {							\
 	switch (sizeof(var)) {				\
 	case 1:						\
 		asm(op "b %1,"__percpu_seg"%0"		\
-		    : "+m" (var)			\
+		    : "+m" (*var__)			\
 		    : "ri" ((T__)val));			\
 		break;					\
 	case 2:						\
 		asm(op "w %1,"__percpu_seg"%0"		\
-		    : "+m" (var)			\
+		    : "+m" (*var__)			\
 		    : "ri" ((T__)val));			\
 		break;					\
 	case 4:						\
 		asm(op "l %1,"__percpu_seg"%0"		\
-		    : "+m" (var)			\
+		    : "+m" (*var__)			\
+		    : "ri" ((T__)val));			\
+		break;					\
+	case 8:						\
+		asm(op "q %1,"__percpu_seg"%0"		\
+		    : "+m" (*var__)			\
 		    : "ri" ((T__)val));			\
 		break;					\
 	default: __bad_percpu_size();			\
@@ -138,21 +119,27 @@ do {							\
 #define percpu_from_op(op, var)				\
 ({							\
 	typeof(var) ret__;				\
+	typeof(var) *var__ = __percpu_seg_off(&var);	\
 	switch (sizeof(var)) {				\
 	case 1:						\
 		asm(op "b "__percpu_seg"%1,%0"		\
 		    : "=r" (ret__)			\
-		    : "m" (var));			\
+		    : "m" (*var__));			\
 		break;					\
 	case 2:						\
 		asm(op "w "__percpu_seg"%1,%0"		\
 		    : "=r" (ret__)			\
-		    : "m" (var));			\
+		    : "m" (*var__));			\
 		break;					\
 	case 4:						\
 		asm(op "l "__percpu_seg"%1,%0"		\
 		    : "=r" (ret__)			\
-		    : "m" (var));			\
+		    : "m" (*var__));			\
+		break;					\
+	case 8:						\
+		asm(op "q "__percpu_seg"%1,%0"		\
+		    : "=r" (ret__)			\
+		    : "m" (*var__));			\
 		break;					\
 	default: __bad_percpu_size();			\
 	}						\
@@ -165,7 +152,6 @@ do {							\
 #define x86_sub_percpu(var, val) percpu_to_op("sub", per_cpu__##var, val)
 #define x86_or_percpu(var, val) percpu_to_op("or", per_cpu__##var, val)
 #endif /* !__ASSEMBLY__ */
-#endif /* !CONFIG_X86_64 */
 
 #ifdef CONFIG_SMP
 
-- 
1.6.1.rc1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 3/3] x86-64: Move cpu number from PDA to per-cpu and consolidate with 32-bit.
  2008-12-23 17:15 ` [PATCH 2/3] x86-64: Unify x86_*_percpu() functions Brian Gerst
@ 2008-12-23 17:15   ` Brian Gerst
  2008-12-27 11:03   ` [PATCH 2/3] x86-64: Unify x86_*_percpu() functions Jeremy Fitzhardinge
  1 sibling, 0 replies; 10+ messages in thread
From: Brian Gerst @ 2008-12-23 17:15 UTC (permalink / raw)
  To: Ingo Molnar; +Cc: linux-kernel, Brian Gerst

A simple example of what can now be done to consolidate with the 32-bit
code.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
---
 arch/x86/include/asm/pda.h       |    2 +-
 arch/x86/include/asm/smp.h       |    4 +---
 arch/x86/kernel/asm-offsets_64.c |    1 -
 arch/x86/kernel/cpu/common.c     |    1 -
 arch/x86/kernel/process_32.c     |    3 ---
 arch/x86/kernel/setup_percpu.c   |    4 ++++
 arch/x86/kernel/smpcommon.c      |    1 -
 7 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
index b008127..3ace2b9 100644
--- a/arch/x86/include/asm/pda.h
+++ b/arch/x86/include/asm/pda.h
@@ -15,7 +15,7 @@ struct x8664_pda {
 	unsigned long kernelstack;	/* 16 top of kernel stack for current */
 	unsigned long oldrsp;		/* 24 user rsp for system call */
 	int irqcount;			/* 32 Irq nesting counter. Starts -1 */
-	unsigned int cpunumber;		/* 36 Logical CPU number */
+	unsigned int unused1;		/* 36 was cpunumber */
 #ifdef CONFIG_CC_STACKPROTECTOR
 	unsigned long stack_canary;	/* 40 stack canary value */
 					/* gcc-ABI: this canary MUST be at
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index d12811c..f2e4b9c 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -34,9 +34,7 @@ extern cpumask_t cpu_initialized;
 DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
 DECLARE_PER_CPU(cpumask_t, cpu_core_map);
 DECLARE_PER_CPU(u16, cpu_llc_id);
-#ifdef CONFIG_X86_32
 DECLARE_PER_CPU(int, cpu_number);
-#endif
 
 DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid);
 DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid);
@@ -169,7 +167,7 @@ extern unsigned disabled_cpus __cpuinitdata;
 extern int safe_smp_processor_id(void);
 
 #elif defined(CONFIG_X86_64_SMP)
-#define raw_smp_processor_id()	read_pda(cpunumber)
+#define raw_smp_processor_id() (x86_read_percpu(cpu_number))
 
 #define stack_smp_processor_id()					\
 ({								\
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 7fcf63d..b14bb4e 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -51,7 +51,6 @@ int main(void)
 	ENTRY(oldrsp); 
 	ENTRY(pcurrent); 
 	ENTRY(irqcount);
-	ENTRY(cpunumber);
 	ENTRY(irqstackptr);
 	ENTRY(data_offset);
 	BLANK();
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 7d741ef..82ad755 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -876,7 +876,6 @@ void __cpuinit pda_init(int cpu)
 	wrmsrl(MSR_GS_BASE, pda);
 	mb();
 
-	pda->cpunumber = cpu;
 	pda->irqcount = -1;
 	pda->kernelstack = (unsigned long)stack_thread_info() -
 				 PDA_STACKOFFSET + THREAD_SIZE;
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 0a1302f..4bb0a7d 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -65,9 +65,6 @@ asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
 DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
 EXPORT_PER_CPU_SYMBOL(current_task);
 
-DEFINE_PER_CPU(int, cpu_number);
-EXPORT_PER_CPU_SYMBOL(cpu_number);
-
 /*
  * Return saved PC of a blocked thread.
  */
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index fb0ccdc..f013781 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -15,6 +15,9 @@
 #include <asm/apicdef.h>
 #include <asm/highmem.h>
 
+DEFINE_PER_CPU(int, cpu_number);
+EXPORT_PER_CPU_SYMBOL(cpu_number);
+
 #ifdef CONFIG_X86_LOCAL_APIC
 unsigned int num_processors;
 unsigned disabled_cpus __cpuinitdata;
@@ -133,6 +136,7 @@ void __init setup_per_cpu_areas(void)
 			memset(&per_cpu(pda, cpu), 0, sizeof(struct x8664_pda));
 		per_cpu(pda, cpu).data_offset = __per_cpu_offset[cpu];
 #endif
+		per_cpu(cpu_number, cpu) = cpu;
 	}
 
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/smpcommon.c b/arch/x86/kernel/smpcommon.c
index 397e309..db7c5a6 100644
--- a/arch/x86/kernel/smpcommon.c
+++ b/arch/x86/kernel/smpcommon.c
@@ -25,6 +25,5 @@ __cpuinit void init_gdt(int cpu)
 			GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
 
 	per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
-	per_cpu(cpu_number, cpu) = cpu;
 }
 #endif
-- 
1.6.1.rc1


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/3] x86-64: Convert the PDA to percpu.
  2008-12-23 17:15 [PATCH 1/3] x86-64: Convert the PDA to percpu Brian Gerst
  2008-12-23 17:15 ` [PATCH 2/3] x86-64: Unify x86_*_percpu() functions Brian Gerst
@ 2008-12-27 10:41 ` Ingo Molnar
  2008-12-27 15:30   ` Brian Gerst
  1 sibling, 1 reply; 10+ messages in thread
From: Ingo Molnar @ 2008-12-27 10:41 UTC (permalink / raw)
  To: Brian Gerst, Christoph Lameter, Thomas Gleixner, H. Peter Anvin,
	Jeremy Fitzhardinge, Alexander van Heukelum
  Cc: linux-kernel


(Cc:-ed a few more people who might be interested in this)

* Brian Gerst <brgerst@gmail.com> wrote:

> This patch makes the PDA a normal per-cpu variable, allowing the
> removal of the special allocator code.  %gs still points to the
> base of the PDA.
> 
> Tested on a dual-core AMD64 system.
> 
> Signed-off-by: Brian Gerst <brgerst@gmail.com>
> ---
>  arch/x86/include/asm/pda.h     |    3 --
>  arch/x86/include/asm/percpu.h  |    3 --
>  arch/x86/include/asm/setup.h   |    1 -
>  arch/x86/kernel/cpu/common.c   |    6 ++--
>  arch/x86/kernel/dumpstack_64.c |    8 ++--
>  arch/x86/kernel/head64.c       |   23 +------------
>  arch/x86/kernel/irq.c          |    2 +-
>  arch/x86/kernel/nmi.c          |    2 +-
>  arch/x86/kernel/setup_percpu.c |   70 ++++++++--------------------------------
>  arch/x86/kernel/smpboot.c      |   58 +--------------------------------
>  arch/x86/xen/enlighten.c       |    2 +-
>  arch/x86/xen/smp.c             |   12 +------
>  12 files changed, 27 insertions(+), 163 deletions(-)

the simplification factor is significant. I'm wondering, have you measured 
the code size impact of this on say the defconfig x86 kernel? That will 
generally tell us how much worse optimizations the compiler does under 
this scheme.

	Ingo

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/3] x86-64: Unify x86_*_percpu() functions.
  2008-12-23 17:15 ` [PATCH 2/3] x86-64: Unify x86_*_percpu() functions Brian Gerst
  2008-12-23 17:15   ` [PATCH 3/3] x86-64: Move cpu number from PDA to per-cpu and consolidate with 32-bit Brian Gerst
@ 2008-12-27 11:03   ` Jeremy Fitzhardinge
  2008-12-27 11:09     ` Ingo Molnar
  1 sibling, 1 reply; 10+ messages in thread
From: Jeremy Fitzhardinge @ 2008-12-27 11:03 UTC (permalink / raw)
  To: Brian Gerst; +Cc: Ingo Molnar, linux-kernel

Brian Gerst wrote:
> Merge the 32-bit and 64-bit versions of these functions.  Unlike 32-bit,
> the segment base is the current cpu's PDA instead of the offset from the
> original per-cpu area.  This is because GCC hardcodes the stackprotector
> canary at %gs:40.  Since the assembler is incapable of relocating against
> multiple symbols, the code ends up looking like:
>
> 	movq $per_cpu__var, reg
> 	subq $per_cpu__pda, reg
> 	movq %gs:(reg), reg
>
> This is still atomic since the offset is a constant (just calculated at
> runtime) and not dependent on the cpu number.
>   

Yeah, it's a real pity we can't convince the linker to do this simple 
computation as a single %gs:ADDR addressing mode.  On the other hand, if 
the compiler can reuse the computation of %reg 2-3 times, then the 
generated code could well end up being denser.

    J

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/3] x86-64: Unify x86_*_percpu() functions.
  2008-12-27 11:03   ` [PATCH 2/3] x86-64: Unify x86_*_percpu() functions Jeremy Fitzhardinge
@ 2008-12-27 11:09     ` Ingo Molnar
  2008-12-27 11:20       ` Jeremy Fitzhardinge
  0 siblings, 1 reply; 10+ messages in thread
From: Ingo Molnar @ 2008-12-27 11:09 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Brian Gerst, linux-kernel, Christoph Lameter, Mike Travis,
	Thomas Gleixner, H. Peter Anvin, Alexander van Heukelum


* Jeremy Fitzhardinge <jeremy@goop.org> wrote:

> Brian Gerst wrote:
>> Merge the 32-bit and 64-bit versions of these functions.  Unlike 32-bit,
>> the segment base is the current cpu's PDA instead of the offset from the
>> original per-cpu area.  This is because GCC hardcodes the stackprotector
>> canary at %gs:40.  Since the assembler is incapable of relocating against
>> multiple symbols, the code ends up looking like:
>>
>> 	movq $per_cpu__var, reg
>> 	subq $per_cpu__pda, reg
>> 	movq %gs:(reg), reg
>>
>> This is still atomic since the offset is a constant (just calculated at
>> runtime) and not dependent on the cpu number.
>>   
>
> Yeah, it's a real pity we can't convince the linker to do this simple 
> computation as a single %gs:ADDR addressing mode.  On the other hand, if 
> the compiler can reuse the computation of %reg 2-3 times, then the 
> generated code could well end up being denser.

There's a nice project for linker hackers?

I'd like to see some kernel image size measurements done on x86 defconfig 
to see how much real impact this has on code density. Unless the impact is 
horribly unacceptable, removing ~200 lines of weird x86-specific APIs is a 
definitive plus.

	Ingo

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 2/3] x86-64: Unify x86_*_percpu() functions.
  2008-12-27 11:09     ` Ingo Molnar
@ 2008-12-27 11:20       ` Jeremy Fitzhardinge
  0 siblings, 0 replies; 10+ messages in thread
From: Jeremy Fitzhardinge @ 2008-12-27 11:20 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Brian Gerst, linux-kernel, Christoph Lameter, Mike Travis,
	Thomas Gleixner, H. Peter Anvin, Alexander van Heukelum

Ingo Molnar wrote:
> * Jeremy Fitzhardinge <jeremy@goop.org> wrote:
>
>   
>> Brian Gerst wrote:
>>     
>>> Merge the 32-bit and 64-bit versions of these functions.  Unlike 32-bit,
>>> the segment base is the current cpu's PDA instead of the offset from the
>>> original per-cpu area.  This is because GCC hardcodes the stackprotector
>>> canary at %gs:40.  Since the assembler is incapable of relocating against
>>> multiple symbols, the code ends up looking like:
>>>
>>> 	movq $per_cpu__var, reg
>>> 	subq $per_cpu__pda, reg
>>> 	movq %gs:(reg), reg
>>>
>>> This is still atomic since the offset is a constant (just calculated at
>>> runtime) and not dependent on the cpu number.
>>>   
>>>       
>> Yeah, it's a real pity we can't convince the linker to do this simple 
>> computation as a single %gs:ADDR addressing mode.  On the other hand, if 
>> the compiler can reuse the computation of %reg 2-3 times, then the 
>> generated code could well end up being denser.
>>     
>
> There's a nice project for linker hackers?
>
> I'd like to see some kernel image size measurements done on x86 defconfig 
> to see how much real impact this has on code density. Unless the impact is 
> horribly unacceptable, removing ~200 lines of weird x86-specific APIs is a 
> definitive plus.

Yep, I'm all for it.  I don't think there'll be much of a size impact at 
all.

    J


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/3] x86-64: Convert the PDA to percpu.
  2008-12-27 10:41 ` [PATCH 1/3] x86-64: Convert the PDA to percpu Ingo Molnar
@ 2008-12-27 15:30   ` Brian Gerst
  2008-12-27 15:53     ` Ingo Molnar
  0 siblings, 1 reply; 10+ messages in thread
From: Brian Gerst @ 2008-12-27 15:30 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Christoph Lameter, Thomas Gleixner, H. Peter Anvin,
	Jeremy Fitzhardinge, Alexander van Heukelum, linux-kernel

On Sat, Dec 27, 2008 at 5:41 AM, Ingo Molnar <mingo@elte.hu> wrote:
>
> (Cc:-ed a few more people who might be interested in this)
>
> * Brian Gerst <brgerst@gmail.com> wrote:
>
>> This patch makes the PDA a normal per-cpu variable, allowing the
>> removal of the special allocator code.  %gs still points to the
>> base of the PDA.
>>
>> Tested on a dual-core AMD64 system.
>>
>> Signed-off-by: Brian Gerst <brgerst@gmail.com>
>> ---
>>  arch/x86/include/asm/pda.h     |    3 --
>>  arch/x86/include/asm/percpu.h  |    3 --
>>  arch/x86/include/asm/setup.h   |    1 -
>>  arch/x86/kernel/cpu/common.c   |    6 ++--
>>  arch/x86/kernel/dumpstack_64.c |    8 ++--
>>  arch/x86/kernel/head64.c       |   23 +------------
>>  arch/x86/kernel/irq.c          |    2 +-
>>  arch/x86/kernel/nmi.c          |    2 +-
>>  arch/x86/kernel/setup_percpu.c |   70 ++++++++--------------------------------
>>  arch/x86/kernel/smpboot.c      |   58 +--------------------------------
>>  arch/x86/xen/enlighten.c       |    2 +-
>>  arch/x86/xen/smp.c             |   12 +------
>>  12 files changed, 27 insertions(+), 163 deletions(-)
>
> the simplification factor is significant. I'm wondering, have you measured
> the code size impact of this on say the defconfig x86 kernel? That will
> generally tell us how much worse optimizations the compiler does under
> this scheme.
>
>        Ingo
>

Patch #1 by itself doesn't change how the PDA is accessed, only how it
is allocated.  The text size goes down significantly with patch #1,
but data goes up.  Changing the PDA to cacheline-aligned (1a) brings
it back in line.

   text	   data	    bss	    dec	    hex	filename
7033648	1754476	 758508	9546632	 91ab88	vmlinux.0   (vanilla 2.6.28)
7029563	1758428	 758508	9546499	 91ab03	vmlinux.1   (with patch #1)
7029563	1754460	 758508	9542531	 919b83	vmlinux.1a  (with patch #1 cache align)
7036694	1758428	 758508	9553630	 91c6de	vmlinux.3   (with all three patches)

I think the first patch (with the alignment fix) is a clear win.  As
for the other patches, they add about 8 bytes per use of a PDA
variable.  cpu_number is used 903 times in this compile, so this is
likely the most extreme example.  I have an idea to optimize this
particular case further that I'd like to look at which would lessen
the impact.

--
Brian Gerst

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/3] x86-64: Convert the PDA to percpu.
  2008-12-27 15:30   ` Brian Gerst
@ 2008-12-27 15:53     ` Ingo Molnar
  2008-12-27 17:16       ` Brian Gerst
  0 siblings, 1 reply; 10+ messages in thread
From: Ingo Molnar @ 2008-12-27 15:53 UTC (permalink / raw)
  To: Brian Gerst
  Cc: Christoph Lameter, Thomas Gleixner, H. Peter Anvin,
	Jeremy Fitzhardinge, Alexander van Heukelum, linux-kernel


* Brian Gerst <brgerst@gmail.com> wrote:

> On Sat, Dec 27, 2008 at 5:41 AM, Ingo Molnar <mingo@elte.hu> wrote:
> >
> > (Cc:-ed a few more people who might be interested in this)
> >
> > * Brian Gerst <brgerst@gmail.com> wrote:
> >
> >> This patch makes the PDA a normal per-cpu variable, allowing the
> >> removal of the special allocator code.  %gs still points to the
> >> base of the PDA.
> >>
> >> Tested on a dual-core AMD64 system.
> >>
> >> Signed-off-by: Brian Gerst <brgerst@gmail.com>
> >> ---
> >>  arch/x86/include/asm/pda.h     |    3 --
> >>  arch/x86/include/asm/percpu.h  |    3 --
> >>  arch/x86/include/asm/setup.h   |    1 -
> >>  arch/x86/kernel/cpu/common.c   |    6 ++--
> >>  arch/x86/kernel/dumpstack_64.c |    8 ++--
> >>  arch/x86/kernel/head64.c       |   23 +------------
> >>  arch/x86/kernel/irq.c          |    2 +-
> >>  arch/x86/kernel/nmi.c          |    2 +-
> >>  arch/x86/kernel/setup_percpu.c |   70 ++++++++--------------------------------
> >>  arch/x86/kernel/smpboot.c      |   58 +--------------------------------
> >>  arch/x86/xen/enlighten.c       |    2 +-
> >>  arch/x86/xen/smp.c             |   12 +------
> >>  12 files changed, 27 insertions(+), 163 deletions(-)
> >
> > the simplification factor is significant. I'm wondering, have you measured
> > the code size impact of this on say the defconfig x86 kernel? That will
> > generally tell us how much worse optimizations the compiler does under
> > this scheme.
> >
> >        Ingo
> >
> 
> Patch #1 by itself doesn't change how the PDA is accessed, only how it
> is allocated.  The text size goes down significantly with patch #1,
> but data goes up.  Changing the PDA to cacheline-aligned (1a) brings
> it back in line.
> 
>    text	   data	    bss	    dec	    hex	filename
> 7033648	1754476	 758508	9546632	 91ab88	vmlinux.0   (vanilla 2.6.28)
> 7029563	1758428	 758508	9546499	 91ab03	vmlinux.1   (with patch #1)
> 7029563	1754460	 758508	9542531	 919b83	vmlinux.1a  (with patch #1 cache align)
> 7036694	1758428	 758508	9553630	 91c6de	vmlinux.3   (with all three patches)
> 
> I think the first patch (with the alignment fix) is a clear win.  As for 
> the other patches, they add about 8 bytes per use of a PDA variable.  
> cpu_number is used 903 times in this compile, so this is likely the most 
> extreme example.  I have an idea to optimize this particular case 
> further that I'd like to look at which would lessen the impact.

curious, what idea is that?

	Ingo

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/3] x86-64: Convert the PDA to percpu.
  2008-12-27 15:53     ` Ingo Molnar
@ 2008-12-27 17:16       ` Brian Gerst
  0 siblings, 0 replies; 10+ messages in thread
From: Brian Gerst @ 2008-12-27 17:16 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Christoph Lameter, Thomas Gleixner, H. Peter Anvin,
	Jeremy Fitzhardinge, Alexander van Heukelum, linux-kernel

On Sat, Dec 27, 2008 at 10:53 AM, Ingo Molnar <mingo@elte.hu> wrote:
>
> * Brian Gerst <brgerst@gmail.com> wrote:
>
>> On Sat, Dec 27, 2008 at 5:41 AM, Ingo Molnar <mingo@elte.hu> wrote:
>> >
>> > (Cc:-ed a few more people who might be interested in this)
>> >
>> > * Brian Gerst <brgerst@gmail.com> wrote:
>> >
>> >> This patch makes the PDA a normal per-cpu variable, allowing the
>> >> removal of the special allocator code.  %gs still points to the
>> >> base of the PDA.
>> >>
>> >> Tested on a dual-core AMD64 system.
>> >>
>> >> Signed-off-by: Brian Gerst <brgerst@gmail.com>
>> >> ---
>> >>  arch/x86/include/asm/pda.h     |    3 --
>> >>  arch/x86/include/asm/percpu.h  |    3 --
>> >>  arch/x86/include/asm/setup.h   |    1 -
>> >>  arch/x86/kernel/cpu/common.c   |    6 ++--
>> >>  arch/x86/kernel/dumpstack_64.c |    8 ++--
>> >>  arch/x86/kernel/head64.c       |   23 +------------
>> >>  arch/x86/kernel/irq.c          |    2 +-
>> >>  arch/x86/kernel/nmi.c          |    2 +-
>> >>  arch/x86/kernel/setup_percpu.c |   70 ++++++++--------------------------------
>> >>  arch/x86/kernel/smpboot.c      |   58 +--------------------------------
>> >>  arch/x86/xen/enlighten.c       |    2 +-
>> >>  arch/x86/xen/smp.c             |   12 +------
>> >>  12 files changed, 27 insertions(+), 163 deletions(-)
>> >
>> > the simplification factor is significant. I'm wondering, have you measured
>> > the code size impact of this on say the defconfig x86 kernel? That will
>> > generally tell us how much worse optimizations the compiler does under
>> > this scheme.
>> >
>> >        Ingo
>> >
>>
>> Patch #1 by itself doesn't change how the PDA is accessed, only how it
>> is allocated.  The text size goes down significantly with patch #1,
>> but data goes up.  Changing the PDA to cacheline-aligned (1a) brings
>> it back in line.
>>
>>    text          data     bss     dec     hex filename
>> 7033648       1754476  758508 9546632  91ab88 vmlinux.0   (vanilla 2.6.28)
>> 7029563       1758428  758508 9546499  91ab03 vmlinux.1   (with patch #1)
>> 7029563       1754460  758508 9542531  919b83 vmlinux.1a  (with patch #1 cache align)
>> 7036694       1758428  758508 9553630  91c6de vmlinux.3   (with all three patches)
>>
>> I think the first patch (with the alignment fix) is a clear win.  As for
>> the other patches, they add about 8 bytes per use of a PDA variable.
>> cpu_number is used 903 times in this compile, so this is likely the most
>> extreme example.  I have an idea to optimize this particular case
>> further that I'd like to look at which would lessen the impact.
>
> curious, what idea is that?
>
>        Ingo
>

Something like this:
+#define raw_smp_processor_id()                                         \
+({                                                                     \
+       extern int gsoff__cpu_number;                                   \
+       int cpu;                                                        \
+       __asm__("movl %%gs:%1, %0" : "=r" (cpu)                         \
+                                  : "m" (gsoff__cpu_number));          \
+       cpu;                                                            \
+})

And add this to vmlinux_64.lds.S:
+#define GSOFF(x) gsoff__##x = per_cpu__##x - per_cpu__pda
+  GSOFF(cpu_number);

The trick is that the linker can calculate against multiple symbols,
but it must be done in the final link.  The problem with this approach
is that only a limited set of symbols can be used.  There isn't a
simple solution for all per-cpu variables.  Some post-processing would
have to be done, similar to kallsyms.

Looking some more at the usage statistics of the PDA members, there
are four heavy hitters:
pda->pcurrent (2719)
pda->kernelstack (1055)
pda->cpunumber (933)
pda->data_offset (327)

The rest of the PDA members have an insignificant number of accesses.
I think for now I'll avoid converting the above four fields until an
optimal solution can be agreed on, but the others (primarily the TLB
and irqstat fields) can be converted without bloating the kernel code
a lot.

--
Brian Gerst

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2008-12-27 17:16 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-12-23 17:15 [PATCH 1/3] x86-64: Convert the PDA to percpu Brian Gerst
2008-12-23 17:15 ` [PATCH 2/3] x86-64: Unify x86_*_percpu() functions Brian Gerst
2008-12-23 17:15   ` [PATCH 3/3] x86-64: Move cpu number from PDA to per-cpu and consolidate with 32-bit Brian Gerst
2008-12-27 11:03   ` [PATCH 2/3] x86-64: Unify x86_*_percpu() functions Jeremy Fitzhardinge
2008-12-27 11:09     ` Ingo Molnar
2008-12-27 11:20       ` Jeremy Fitzhardinge
2008-12-27 10:41 ` [PATCH 1/3] x86-64: Convert the PDA to percpu Ingo Molnar
2008-12-27 15:30   ` Brian Gerst
2008-12-27 15:53     ` Ingo Molnar
2008-12-27 17:16       ` Brian Gerst

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.