* [PATCH] x86/i386: make sure stack-protector segment base is cache aligned
@ 2009-09-03 19:27 Jeremy Fitzhardinge
  2009-09-03 19:47 ` Eric Dumazet
  2009-09-03 20:03 ` [tip:x86/asm] x86/i386: Make sure stack-protector segment base is cache aligned tip-bot for Jeremy Fitzhardinge
  0 siblings, 2 replies; 29+ messages in thread
From: Jeremy Fitzhardinge @ 2009-09-03 19:27 UTC (permalink / raw)
  To: H. Peter Anvin; +Cc: the arch/x86 maintainers, Linux Kernel Mailing List

The Intel Optimization Reference Guide says:

	In Intel Atom microarchitecture, the address generation unit
	assumes that the segment base will be 0 by default. Non-zero
	segment base will cause load and store operations to experience
	a delay.
		- If the segment base isn't aligned to a cache line
		  boundary, the max throughput of memory operations is
		  reduced to one [e]very 9 cycles.
	[...]
	Assembly/Compiler Coding Rule 15. (H impact, ML generality)
	For Intel Atom processors, use segments with base set to 0
	whenever possible; avoid non-zero segment base address that is
	not aligned to cache line boundary at all cost.

We can't avoid having a non-zero base for the stack-protector segment, but
we can make it cache-aligned.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 0bfcf7e..f7d2c8f 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -403,7 +403,17 @@ extern unsigned long kernel_eflags;
 extern asmlinkage void ignore_sysret(void);
 #else	/* X86_64 */
 #ifdef CONFIG_CC_STACKPROTECTOR
-DECLARE_PER_CPU(unsigned long, stack_canary);
+/*
+ * Make sure stack canary segment base is cached-aligned:
+ *   "For Intel Atom processors, avoid non zero segment base address
+ *    that is not aligned to cache line boundary at all cost."
+ * (Optim Ref Manual Assembly/Compiler Coding Rule 15.)
+ */
+struct stack_canary {
+	char __pad[20];		/* canary at %gs:20 */
+	unsigned long canary;
+};
+DECLARE_PER_CPU(struct stack_canary, stack_canary) ____cacheline_aligned;
 #endif
 #endif	/* X86_64 */
 
diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
index c2d742c..decad97 100644
--- a/arch/x86/include/asm/stackprotector.h
+++ b/arch/x86/include/asm/stackprotector.h
@@ -78,14 +78,14 @@ static __always_inline void boot_init_stack_canary(void)
 #ifdef CONFIG_X86_64
 	percpu_write(irq_stack_union.stack_canary, canary);
 #else
-	percpu_write(stack_canary, canary);
+	percpu_write(stack_canary.canary, canary);
 #endif
 }
 
 static inline void setup_stack_canary_segment(int cpu)
 {
 #ifdef CONFIG_X86_32
-	unsigned long canary = (unsigned long)&per_cpu(stack_canary, cpu) - 20;
+	unsigned long canary = (unsigned long)&per_cpu(stack_canary, cpu);
 	struct desc_struct *gdt_table = get_cpu_gdt_table(cpu);
 	struct desc_struct desc;
 
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index 643c59b..5bd119b 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -31,7 +31,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 	"movl %P[task_canary](%[next]), %%ebx\n\t"			\
 	"movl %%ebx, "__percpu_arg([stack_canary])"\n\t"
 #define __switch_canary_oparam						\
-	, [stack_canary] "=m" (per_cpu_var(stack_canary))
+	, [stack_canary] "=m" (per_cpu_var(stack_canary.canary))
 #define __switch_canary_iparam						\
 	, [task_canary] "i" (offsetof(struct task_struct, stack_canary))
 #else	/* CC_STACKPROTECTOR */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 5ce60a8..e338b5c 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1043,7 +1043,7 @@ DEFINE_PER_CPU(struct orig_ist, orig_ist);
 #else	/* CONFIG_X86_64 */
 
 #ifdef CONFIG_CC_STACKPROTECTOR
-DEFINE_PER_CPU(unsigned long, stack_canary);
+DEFINE_PER_CPU(struct stack_canary, stack_canary) ____cacheline_aligned;
 #endif
 
 /* Make sure %fs and %gs are initialized properly in idle threads */
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index cc827ac..7ffec6b 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -439,7 +439,6 @@ is386:	movl $2,%ecx		# set MP
 	jne 1f
 	movl $per_cpu__gdt_page,%eax
 	movl $per_cpu__stack_canary,%ecx
-	subl $20, %ecx
 	movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
 	shrl $16, %ecx
 	movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
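
The layout the patch relies on can be checked in isolation: gcc's i386
-fstack-protector code hard-codes the canary load as %gs:20, so the %gs base
must point at a cache-aligned object whose member at offset 20 is the canary.
Below is a minimal userspace sketch of that invariant (illustrative only, not
kernel code; the struct name is made up, and it assumes a 32-bit C11 build
such as gcc -m32 -std=c11):

#include <assert.h>
#include <stddef.h>

/* Mirrors the layout added by the patch: 20 bytes of padding so the
 * canary lands at offset 20 from the segment base, with the whole
 * object aligned to a 64-byte cache line. */
struct canary_layout {
	char pad[20];			/* keeps the canary at %gs:20 */
	unsigned long canary;
} __attribute__((aligned(64)));

int main(void)
{
	/* gcc emits "movl %gs:20, ..." for the canary on i386 */
	static_assert(offsetof(struct canary_layout, canary) == 20,
		      "canary must sit at offset 20");
	/* the segment base (start of the struct) is cache-line aligned */
	static_assert(_Alignof(struct canary_layout) == 64,
		      "segment base must be cache-line aligned");
	return 0;
}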




* Re: [PATCH] x86/i386: make sure stack-protector segment base is cache aligned
  2009-09-03 19:27 [PATCH] x86/i386: make sure stack-protector segment base is cache aligned Jeremy Fitzhardinge
@ 2009-09-03 19:47 ` Eric Dumazet
  2009-09-03 20:41   ` Jeremy Fitzhardinge
  2009-09-03 20:03 ` [tip:x86/asm] x86/i386: Make sure stack-protector segment base is cache aligned tip-bot for Jeremy Fitzhardinge
  1 sibling, 1 reply; 29+ messages in thread
From: Eric Dumazet @ 2009-09-03 19:47 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: H. Peter Anvin, the arch/x86 maintainers, Linux Kernel Mailing List

Jeremy Fitzhardinge wrote:
> The Intel Optimization Reference Guide says:
> 
> 	In Intel Atom microarchitecture, the address generation unit
> 	assumes that the segment base will be 0 by default. Non-zero
> 	segment base will cause load and store operations to experience
> 	a delay.
> 		- If the segment base isn't aligned to a cache line
> 		  boundary, the max throughput of memory operations is
> 		  reduced to one [e]very 9 cycles.
> 	[...]
> 	Assembly/Compiler Coding Rule 15. (H impact, ML generality)
> 	For Intel Atom processors, use segments with base set to 0
> 	whenever possible; avoid non-zero segment base address that is
> 	not aligned to cache line boundary at all cost.
> 
> We can't avoid having a non-zero base for the stack-protector segment, but
> we can make it cache-aligned.
> 
> Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
> 
> diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
> index 0bfcf7e..f7d2c8f 100644
> --- a/arch/x86/include/asm/processor.h
> +++ b/arch/x86/include/asm/processor.h
> @@ -403,7 +403,17 @@ extern unsigned long kernel_eflags;
>  extern asmlinkage void ignore_sysret(void);
>  #else	/* X86_64 */
>  #ifdef CONFIG_CC_STACKPROTECTOR
> -DECLARE_PER_CPU(unsigned long, stack_canary);
> +/*
> + * Make sure stack canary segment base is cached-aligned:
> + *   "For Intel Atom processors, avoid non zero segment base address
> + *    that is not aligned to cache line boundary at all cost."
> + * (Optim Ref Manual Assembly/Compiler Coding Rule 15.)
> + */
> +struct stack_canary {
> +	char __pad[20];		/* canary at %gs:20 */
> +	unsigned long canary;
> +};
> +DECLARE_PER_CPU(struct stack_canary, stack_canary) ____cacheline_aligned;

DECLARE_PER_CPU_SHARED_ALIGNED()

Or else we'll have many holes in the percpu section, because of how the linker propagates section alignment.

>  #endif
>  #endif	/* X86_64 */
>  
> diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
> index c2d742c..decad97 100644
> --- a/arch/x86/include/asm/stackprotector.h
> +++ b/arch/x86/include/asm/stackprotector.h
> @@ -78,14 +78,14 @@ static __always_inline void boot_init_stack_canary(void)
>  #ifdef CONFIG_X86_64
>  	percpu_write(irq_stack_union.stack_canary, canary);
>  #else
> -	percpu_write(stack_canary, canary);
> +	percpu_write(stack_canary.canary, canary);
>  #endif
>  }
>  
>  static inline void setup_stack_canary_segment(int cpu)
>  {
>  #ifdef CONFIG_X86_32
> -	unsigned long canary = (unsigned long)&per_cpu(stack_canary, cpu) - 20;
> +	unsigned long canary = (unsigned long)&per_cpu(stack_canary, cpu);
>  	struct desc_struct *gdt_table = get_cpu_gdt_table(cpu);
>  	struct desc_struct desc;
>  
> diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
> index 643c59b..5bd119b 100644
> --- a/arch/x86/include/asm/system.h
> +++ b/arch/x86/include/asm/system.h
> @@ -31,7 +31,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
>  	"movl %P[task_canary](%[next]), %%ebx\n\t"			\
>  	"movl %%ebx, "__percpu_arg([stack_canary])"\n\t"
>  #define __switch_canary_oparam						\
> -	, [stack_canary] "=m" (per_cpu_var(stack_canary))
> +	, [stack_canary] "=m" (per_cpu_var(stack_canary.canary))
>  #define __switch_canary_iparam						\
>  	, [task_canary] "i" (offsetof(struct task_struct, stack_canary))
>  #else	/* CC_STACKPROTECTOR */
> diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
> index 5ce60a8..e338b5c 100644
> --- a/arch/x86/kernel/cpu/common.c
> +++ b/arch/x86/kernel/cpu/common.c
> @@ -1043,7 +1043,7 @@ DEFINE_PER_CPU(struct orig_ist, orig_ist);
>  #else	/* CONFIG_X86_64 */
>  
>  #ifdef CONFIG_CC_STACKPROTECTOR
> -DEFINE_PER_CPU(unsigned long, stack_canary);
> +DEFINE_PER_CPU(struct stack_canary, stack_canary) ____cacheline_aligned;

same here, DEFINE_PER_CPU_SHARED_ALIGNED()

>  #endif
>  
>  /* Make sure %fs and %gs are initialized properly in idle threads */
> diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
> index cc827ac..7ffec6b 100644
> --- a/arch/x86/kernel/head_32.S
> +++ b/arch/x86/kernel/head_32.S
> @@ -439,7 +439,6 @@ is386:	movl $2,%ecx		# set MP
>  	jne 1f
>  	movl $per_cpu__gdt_page,%eax
>  	movl $per_cpu__stack_canary,%ecx
> -	subl $20, %ecx
>  	movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
>  	shrl $16, %ecx
>  	movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
> 
> 
> --



* [tip:x86/asm] x86/i386: Make sure stack-protector segment base is cache aligned
  2009-09-03 19:27 [PATCH] x86/i386: make sure stack-protector segment base is cache aligned Jeremy Fitzhardinge
  2009-09-03 19:47 ` Eric Dumazet
@ 2009-09-03 20:03 ` tip-bot for Jeremy Fitzhardinge
  2009-09-03 20:26   ` H. Peter Anvin
  1 sibling, 1 reply; 29+ messages in thread
From: tip-bot for Jeremy Fitzhardinge @ 2009-09-03 20:03 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, hpa, mingo, jeremy.fitzhardinge, jeremy, stable,
	tglx, mingo

Commit-ID:  1ea0d14e480c245683927eecc03a70faf06e80c8
Gitweb:     http://git.kernel.org/tip/1ea0d14e480c245683927eecc03a70faf06e80c8
Author:     Jeremy Fitzhardinge <jeremy@goop.org>
AuthorDate: Thu, 3 Sep 2009 12:27:15 -0700
Committer:  Ingo Molnar <mingo@elte.hu>
CommitDate: Thu, 3 Sep 2009 21:30:51 +0200

x86/i386: Make sure stack-protector segment base is cache aligned

The Intel Optimization Reference Guide says:

	In Intel Atom microarchitecture, the address generation unit
	assumes that the segment base will be 0 by default. Non-zero
	segment base will cause load and store operations to experience
	a delay.
		- If the segment base isn't aligned to a cache line
		  boundary, the max throughput of memory operations is
		  reduced to one [e]very 9 cycles.
	[...]
	Assembly/Compiler Coding Rule 15. (H impact, ML generality)
	For Intel Atom processors, use segments with base set to 0
	whenever possible; avoid non-zero segment base address that is
	not aligned to cache line boundary at all cost.

We can't avoid having a non-zero base for the stack-protector
segment, but we can make it cache-aligned.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: <stable@kernel.org>
LKML-Reference: <4AA01893.6000507@goop.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>


---
 arch/x86/include/asm/processor.h      |   12 +++++++++++-
 arch/x86/include/asm/stackprotector.h |    4 ++--
 arch/x86/include/asm/system.h         |    2 +-
 arch/x86/kernel/cpu/common.c          |    2 +-
 arch/x86/kernel/head_32.S             |    1 -
 5 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index c776826..e597ecc 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -403,7 +403,17 @@ extern unsigned long kernel_eflags;
 extern asmlinkage void ignore_sysret(void);
 #else	/* X86_64 */
 #ifdef CONFIG_CC_STACKPROTECTOR
-DECLARE_PER_CPU(unsigned long, stack_canary);
+/*
+ * Make sure stack canary segment base is cached-aligned:
+ *   "For Intel Atom processors, avoid non zero segment base address
+ *    that is not aligned to cache line boundary at all cost."
+ * (Optim Ref Manual Assembly/Compiler Coding Rule 15.)
+ */
+struct stack_canary {
+	char __pad[20];		/* canary at %gs:20 */
+	unsigned long canary;
+};
+DECLARE_PER_CPU(struct stack_canary, stack_canary) ____cacheline_aligned;
 #endif
 #endif	/* X86_64 */
 
diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
index 44efdff..1575177 100644
--- a/arch/x86/include/asm/stackprotector.h
+++ b/arch/x86/include/asm/stackprotector.h
@@ -78,14 +78,14 @@ static __always_inline void boot_init_stack_canary(void)
 #ifdef CONFIG_X86_64
 	percpu_write(irq_stack_union.stack_canary, canary);
 #else
-	percpu_write(stack_canary, canary);
+	percpu_write(stack_canary.canary, canary);
 #endif
 }
 
 static inline void setup_stack_canary_segment(int cpu)
 {
 #ifdef CONFIG_X86_32
-	unsigned long canary = (unsigned long)&per_cpu(stack_canary, cpu) - 20;
+	unsigned long canary = (unsigned long)&per_cpu(stack_canary, cpu);
 	struct desc_struct *gdt_table = get_cpu_gdt_table(cpu);
 	struct desc_struct desc;
 
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index 75c49c7..f08f973 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -31,7 +31,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 	"movl %P[task_canary](%[next]), %%ebx\n\t"			\
 	"movl %%ebx, "__percpu_arg([stack_canary])"\n\t"
 #define __switch_canary_oparam						\
-	, [stack_canary] "=m" (per_cpu_var(stack_canary))
+	, [stack_canary] "=m" (per_cpu_var(stack_canary.canary))
 #define __switch_canary_iparam						\
 	, [task_canary] "i" (offsetof(struct task_struct, stack_canary))
 #else	/* CC_STACKPROTECTOR */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index ced07ba..7d84bc4 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1043,7 +1043,7 @@ DEFINE_PER_CPU(struct orig_ist, orig_ist);
 #else	/* CONFIG_X86_64 */
 
 #ifdef CONFIG_CC_STACKPROTECTOR
-DEFINE_PER_CPU(unsigned long, stack_canary);
+DEFINE_PER_CPU(struct stack_canary, stack_canary) ____cacheline_aligned;
 #endif
 
 /* Make sure %fs and %gs are initialized properly in idle threads */
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index cc827ac..7ffec6b 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -439,7 +439,6 @@ is386:	movl $2,%ecx		# set MP
 	jne 1f
 	movl $per_cpu__gdt_page,%eax
 	movl $per_cpu__stack_canary,%ecx
-	subl $20, %ecx
 	movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
 	shrl $16, %ecx
 	movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)


* Re: [tip:x86/asm] x86/i386: Make sure stack-protector segment base is cache aligned
  2009-09-03 20:03 ` [tip:x86/asm] x86/i386: Make sure stack-protector segment base is cache aligned tip-bot for Jeremy Fitzhardinge
@ 2009-09-03 20:26   ` H. Peter Anvin
  2009-09-03 20:45     ` Jeremy Fitzhardinge
  0 siblings, 1 reply; 29+ messages in thread
From: H. Peter Anvin @ 2009-09-03 20:26 UTC (permalink / raw)
  To: mingo, hpa, linux-kernel, jeremy.fitzhardinge, jeremy, stable,
	tglx, mingo
  Cc: linux-tip-commits, Tejun Heo

On 09/03/2009 01:03 PM, tip-bot for Jeremy Fitzhardinge wrote:
> Commit-ID:  1ea0d14e480c245683927eecc03a70faf06e80c8
> Gitweb:     http://git.kernel.org/tip/1ea0d14e480c245683927eecc03a70faf06e80c8
> Author:     Jeremy Fitzhardinge <jeremy@goop.org>
> AuthorDate: Thu, 3 Sep 2009 12:27:15 -0700
> Committer:  Ingo Molnar <mingo@elte.hu>
> CommitDate: Thu, 3 Sep 2009 21:30:51 +0200
> 
> x86/i386: Make sure stack-protector segment base is cache aligned
> 
> The Intel Optimization Reference Guide says:
> 
> 	In Intel Atom microarchitecture, the address generation unit
> 	assumes that the segment base will be 0 by default. Non-zero
> 	segment base will cause load and store operations to experience
> 	a delay.
> 		- If the segment base isn't aligned to a cache line
> 		  boundary, the max throughput of memory operations is
> 		  reduced to one [e]very 9 cycles.
> 	[...]
> 	Assembly/Compiler Coding Rule 15. (H impact, ML generality)
> 	For Intel Atom processors, use segments with base set to 0
> 	whenever possible; avoid non-zero segment base address that is
> 	not aligned to cache line boundary at all cost.
> 
> We can't avoid having a non-zero base for the stack-protector
> segment, but we can make it cache-aligned.
> 

With the new zero-based percpu segment, it seems we should be able to
subsume the stack protector into the percpu segment and reference both
via %gs -- we just have to reserve the first 24 bytes of the segment,
and being able to reduce the number of segments we need in the kernel is
good for multiple reasons.

Tejun - am I missing something why that would be hard or impossible?

	-hpa
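
For reference, 64-bit already does roughly what is suggested above: the canary
lives inside the %gs-based per-cpu area, with the first 40 bytes reserved so
that gcc's hard-coded %gs:40 reference still works.  Sketched from the
2.6.3x-era arch/x86/include/asm/processor.h (details may differ slightly; the
IRQ_STACK_SIZE value is filled in for completeness):

#define IRQ_STACK_SIZE	(4096 << 2)	/* era value: PAGE_SIZE << IRQ_STACK_ORDER */

/* x86-64: the per-cpu irq stack doubles as the home of the stack canary.
 * gcc hardcodes the 64-bit canary as %gs:40, so the object at %gs:0
 * reserves 40 bytes before the canary. */
union irq_stack_union {
	char irq_stack[IRQ_STACK_SIZE];
	struct {
		char gs_base[40];	/* reserved: keeps the canary at %gs:40 */
		unsigned long stack_canary;
	};
};

The 32-bit analogue would reserve 24 bytes (20 bytes of padding plus the
4-byte canary), which matches the 24 bytes mentioned above.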


* Re: [PATCH] x86/i386: make sure stack-protector segment base is cache aligned
  2009-09-03 19:47 ` Eric Dumazet
@ 2009-09-03 20:41   ` Jeremy Fitzhardinge
  2009-09-03 21:07     ` Eric Dumazet
  0 siblings, 1 reply; 29+ messages in thread
From: Jeremy Fitzhardinge @ 2009-09-03 20:41 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: H. Peter Anvin, the arch/x86 maintainers, Linux Kernel Mailing List

On 09/03/09 12:47, Eric Dumazet wrote:
> Jeremy Fitzhardinge wrote:
>   
>> The Intel Optimization Reference Guide says:
>>
>> 	In Intel Atom microarchitecture, the address generation unit
>> 	assumes that the segment base will be 0 by default. Non-zero
>> 	segment base will cause load and store operations to experience
>> 	a delay.
>> 		- If the segment base isn't aligned to a cache line
>> 		  boundary, the max throughput of memory operations is
>> 		  reduced to one [e]very 9 cycles.
>> 	[...]
>> 	Assembly/Compiler Coding Rule 15. (H impact, ML generality)
>> 	For Intel Atom processors, use segments with base set to 0
>> 	whenever possible; avoid non-zero segment base address that is
>> 	not aligned to cache line boundary at all cost.
>>
>> We can't avoid having a non-zero base for the stack-protector segment, but
>> we can make it cache-aligned.
>>
>> Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
>>
>> diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
>> index 0bfcf7e..f7d2c8f 100644
>> --- a/arch/x86/include/asm/processor.h
>> +++ b/arch/x86/include/asm/processor.h
>> @@ -403,7 +403,17 @@ extern unsigned long kernel_eflags;
>>  extern asmlinkage void ignore_sysret(void);
>>  #else	/* X86_64 */
>>  #ifdef CONFIG_CC_STACKPROTECTOR
>> -DECLARE_PER_CPU(unsigned long, stack_canary);
>> +/*
>> + * Make sure stack canary segment base is cached-aligned:
>> + *   "For Intel Atom processors, avoid non zero segment base address
>> + *    that is not aligned to cache line boundary at all cost."
>> + * (Optim Ref Manual Assembly/Compiler Coding Rule 15.)
>> + */
>> +struct stack_canary {
>> +	char __pad[20];		/* canary at %gs:20 */
>> +	unsigned long canary;
>> +};
>> +DECLARE_PER_CPU(struct stack_canary, stack_canary) ____cacheline_aligned;
>>     
> DECLARE_PER_CPU_SHARED_ALIGNED()
>
> Or else, we'll have many holes in percpu section, because of linker encapsulation
>   

That's only cache aligned when SMP is enabled, to avoid false cacheline
sharing.  In this case we need it unconditionally cache-aligned.

    J
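
The distinction comes from include/linux/cache.h: ____cacheline_aligned always
pads to a cache line, while the _in_smp variant used by
DEFINE_PER_CPU_SHARED_ALIGNED compiles away on UP builds.  Roughly (a
simplified sketch of the generic definitions, not the verbatim header):

/* include/linux/cache.h, simplified */
#define ____cacheline_aligned	__attribute__((__aligned__(SMP_CACHE_BYTES)))

#ifdef CONFIG_SMP
#define ____cacheline_aligned_in_smp	____cacheline_aligned
#else
#define ____cacheline_aligned_in_smp	/* empty: no alignment on UP */
#endif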


* Re: [tip:x86/asm] x86/i386: Make sure stack-protector segment base is cache aligned
  2009-09-03 20:26   ` H. Peter Anvin
@ 2009-09-03 20:45     ` Jeremy Fitzhardinge
  2009-09-03 21:15       ` H. Peter Anvin
  0 siblings, 1 reply; 29+ messages in thread
From: Jeremy Fitzhardinge @ 2009-09-03 20:45 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: mingo, linux-kernel, jeremy.fitzhardinge, stable, tglx, mingo,
	linux-tip-commits, Tejun Heo

On 09/03/09 13:26, H. Peter Anvin wrote:
> With the new zero-based percpu segment, it seems we should be able to
> subsume the stack protector into the percpu segment and reference both
> via %gs -- we just have to reserve the first 24 bytes of the segment,
> and being able to reduce the number of segments we need in the kernel is
> good for multiple reasons.
>
> Tejun - am I missing something why that would be hard or impossible?
>   

Two problems:

    * gcc generates %gs: references for stack-protector, but we use %fs
      for percpu data (because restoring %fs is faster if it's a null
      selector; TLS uses %gs).  I guess we could use %fs if
      !CONFIG_CC_STACKPROTECTOR, or %gs if we are using it (though that
      has some fiddly ramifications for things like ptrace).
    * The i386 percpu %fs base is offset by -__per_cpu_start from the
      percpu variables, so we can directly refer to %fs:per_cpu__foo. 
      I'm not sure what it would take to unify i386 to use the same
      scheme as x86-64.

Neither looks insoluble.

    J
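
To make the second point concrete: the i386 scheme works because the segment
base absorbs the link-time offset of the percpu section, so the linker-visible
symbol can be used directly as the displacement.  A toy calculation (all
addresses are made up, for illustration only):

#include <stdio.h>

int main(void)
{
	/* Hypothetical link-time and run-time addresses. */
	unsigned long per_cpu_start = 0xc1a00000;  /* start of .data.percpu        */
	unsigned long per_cpu_foo   = 0xc1a00123;  /* link address of a percpu var */
	unsigned long cpu_copy      = 0xdf800000;  /* this CPU's percpu area       */

	/* i386 sets the %fs base to the CPU's area minus __per_cpu_start ... */
	unsigned long fs_base = cpu_copy - per_cpu_start;

	/* ... so a plain "%fs:per_cpu__foo" reference resolves to the variable
	 * inside this CPU's copy of the section. */
	unsigned long effective = fs_base + per_cpu_foo;

	printf("effective %#lx, expected %#lx\n",
	       effective, cpu_copy + (per_cpu_foo - per_cpu_start));
	return 0;
}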


* Re: [PATCH] x86/i386: make sure stack-protector segment base is cache aligned
  2009-09-03 20:41   ` Jeremy Fitzhardinge
@ 2009-09-03 21:07     ` Eric Dumazet
  2009-09-03 21:31       ` Jeremy Fitzhardinge
  0 siblings, 1 reply; 29+ messages in thread
From: Eric Dumazet @ 2009-09-03 21:07 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: H. Peter Anvin, the arch/x86 maintainers, Linux Kernel Mailing List

Jeremy Fitzhardinge wrote:
> On 09/03/09 12:47, Eric Dumazet wrote:
>> Jeremy Fitzhardinge wrote:
>>   
>>> The Intel Optimization Reference Guide says:
>>>
>>> 	In Intel Atom microarchitecture, the address generation unit
>>> 	assumes that the segment base will be 0 by default. Non-zero
>>> 	segment base will cause load and store operations to experience
>>> 	a delay.
>>> 		- If the segment base isn't aligned to a cache line
>>> 		  boundary, the max throughput of memory operations is
>>> 		  reduced to one [e]very 9 cycles.
>>> 	[...]
>>> 	Assembly/Compiler Coding Rule 15. (H impact, ML generality)
>>> 	For Intel Atom processors, use segments with base set to 0
>>> 	whenever possible; avoid non-zero segment base address that is
>>> 	not aligned to cache line boundary at all cost.
>>>
>>> We can't avoid having a non-zero base for the stack-protector segment, but
>>> we can make it cache-aligned.
>>>
>>> Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
>>>
>>> diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
>>> index 0bfcf7e..f7d2c8f 100644
>>> --- a/arch/x86/include/asm/processor.h
>>> +++ b/arch/x86/include/asm/processor.h
>>> @@ -403,7 +403,17 @@ extern unsigned long kernel_eflags;
>>>  extern asmlinkage void ignore_sysret(void);
>>>  #else	/* X86_64 */
>>>  #ifdef CONFIG_CC_STACKPROTECTOR
>>> -DECLARE_PER_CPU(unsigned long, stack_canary);
>>> +/*
>>> + * Make sure stack canary segment base is cached-aligned:
>>> + *   "For Intel Atom processors, avoid non zero segment base address
>>> + *    that is not aligned to cache line boundary at all cost."
>>> + * (Optim Ref Manual Assembly/Compiler Coding Rule 15.)
>>> + */
>>> +struct stack_canary {
>>> +	char __pad[20];		/* canary at %gs:20 */
>>> +	unsigned long canary;
>>> +};
>>> +DECLARE_PER_CPU(struct stack_canary, stack_canary) ____cacheline_aligned;
>>>     
>> DECLARE_PER_CPU_SHARED_ALIGNED()
>>
>> Or else, we'll have many holes in percpu section, because of linker encapsulation
>>   
> 
> That's only cache aligned when SMP is enabled, to avoid false cacheline
> sharing.  In this case we need it unconditionally cache-aligned.

I was referring to .data.percpu alignment requirements, not to false sharing.

When we put an object with an align(64) requirement into a section, the linker
has to apply this 2**6 alignment to the resulting section.

When several .o files are linked together, the linker has to take the biggest
alignment, and has to insert holes.

Check the .data.percpu size in vmlinux before and after your patch, to make
sure it doesn't grow too much :)

Therefore, ____cacheline_aligned objects should be placed in
.data.percpu.shared_aligned.

I suggest running the following script and checking that the .data.percpu
alignments are 2**2:

# find . -name built-in.o|xargs objdump -h |grep percpu
 17 .data.percpu  00000018  00000000  00000000  0001db40  2**2
 12 .data.percpu  00000018  00000000  00000000  00015e80  2**2
 15 .data.percpu  000010a8  00000000  00000000  00012ec4  2**2
 31 .data.percpu.shared_aligned 0000055c  00000000  00000000  00085740  2**6
 33 .data.percpu  0000178c  00000000  00000000  00086880  2**2
 19 .data.percpu  000000bc  00000000  00000000  00006c64  2**2
 21 .data.percpu  00000010  00000000  00000000  00018990  2**2
 19 .data.percpu  0000000c  00000000  00000000  00008fb4  2**2
 30 .data.percpu  000018c0  00000000  00000000  0003cd50  2**3
 32 .data.percpu.shared_aligned 00000100  00000000  00000000  0003ee40  2**6
 43 .data.percpu.page_aligned 00001000  00000000  00000000  00048000  2**12
 14 .data.percpu  0000005c  00000000  00000000  0000a8a0  2**2
 22 .data.percpu  0000134c  00000000  00000000  0000d7a8  2**3
 23 .data.percpu.page_aligned 00001000  00000000  00000000  0000f000  2**12
 11 .data.percpu  00000014  00000000  00000000  00001428  2**2
 31 .data.percpu  000020b8  00000000  00000000  00045660  2**3
 33 .data.percpu.shared_aligned 00000108  00000000  00000000  00047f40  2**6
 44 .data.percpu.page_aligned 00001000  00000000  00000000  00052000  2**12
 21 .data.percpu  000007f8  00000000  00000000  00006b94  2**2
 25 .data.percpu.shared_aligned 00000008  00000000  00000000  00007400  2**6
 11 .data.percpu  000000e0  00000000  00000000  0000146c  2**2
  6 .data.percpu  000000dc  00000000  00000000  000003c0  2**2
 41 .data.percpu  000001c4  00000000  00000000  00261d00  2**2
 18 .data.percpu  00000004  00000000  00000000  00009f6c  2**2
 18 .data.percpu  000000bc  00000000  00000000  00003e4c  2**2
 23 .data.percpu  00000014  00000000  00000000  00027814  2**2
 16 .data.percpu  00000004  00000000  00000000  000012f0  2**2
 27 .data.percpu  0000000c  00000000  00000000  0003d97c  2**2
 28 .data.percpu  00000a68  00000000  00000000  000324e0  2**2
 18 .data.percpu  00000008  00000000  00000000  00013b44  2**2
 20 .data.percpu  000004fc  00000000  00000000  001263d4  2**2
 18 .data.percpu  00000188  00000000  00000000  0001e0d8  2**2
 19 .data.percpu  00000194  00000000  00000000  00030840  2**2
 19 .data.percpu  000001d4  00000000  00000000  0004a508  2**2
 26 .data.percpu  00000040  00000000  00000000  000bc370  2**2
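
The effect can be reproduced in miniature outside the kernel: a single
64-byte-aligned object raises the alignment of the section it lives in, and
every object file contributing such a section forces the linker to pad up to
that alignment when the sections are concatenated.  A toy example (hypothetical
section names; build with "gcc -c" and inspect with "objdump -h"):

/* holes.c -- gcc -c holes.c && objdump -h holes.o | grep demo */

/* A plain 4-byte variable: its section stays 2**2 aligned. */
int plain_var __attribute__((section(".demo.plain"))) = 1;

/* The same-sized variable with a 64-byte alignment requirement: its section
 * becomes 2**6 aligned, and when such sections from many .o files are
 * concatenated the linker has to insert padding holes. */
int aligned_var __attribute__((section(".demo.aligned"), aligned(64))) = 2;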


* Re: [tip:x86/asm] x86/i386: Make sure stack-protector segment base is cache aligned
  2009-09-03 20:45     ` Jeremy Fitzhardinge
@ 2009-09-03 21:15       ` H. Peter Anvin
  2009-09-03 21:18         ` Ingo Molnar
                           ` (2 more replies)
  0 siblings, 3 replies; 29+ messages in thread
From: H. Peter Anvin @ 2009-09-03 21:15 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: mingo, linux-kernel, jeremy.fitzhardinge, stable, tglx, mingo,
	linux-tip-commits, Tejun Heo

On 09/03/2009 01:45 PM, Jeremy Fitzhardinge wrote:
> 
> Two problems:
> 
>     * gcc generates %gs: references for stack-protector, but we use %fs
>       for percpu data (because restoring %fs is faster if it's a null
>       selector; TLS uses %gs).  I guess we could use %fs if
>       !CONFIG_CC_STACKPROTECTOR, or %gs if we are using it (though that
>       has some fiddly ramifications for things like ptrace).

Well, by touching two segments we're getting the worst of both worlds,
so at least assuming some significant number of real-world deployments
use CC_STACKPROTECTOR, we really don't want to pessimize that case too much.

>     * The i386 percpu %fs base is offset by -__per_cpu_start from the
>       percpu variables, so we can directly refer to %fs:per_cpu__foo. 
>       I'm not sure what it would take to unify i386 to use the same
>       scheme as x86-64.

OK, I was under the impression that that had already been done (and no,
I didn't bother to look at the code.)  I guess I was wrong (and yes,
this is an absolute precondition.)

> Neither looks insoluble.

Agreed.  Looks like something that can and probably should be done but
is a bit further out.

	-hpa


* Re: [tip:x86/asm] x86/i386: Make sure stack-protector segment base is cache aligned
  2009-09-03 21:15       ` H. Peter Anvin
@ 2009-09-03 21:18         ` Ingo Molnar
  2009-09-03 21:21           ` H. Peter Anvin
  2009-09-04 14:15           ` Arjan van de Ven
  2009-09-03 21:28         ` Jeremy Fitzhardinge
  2009-09-04  2:51         ` Tejun Heo
  2 siblings, 2 replies; 29+ messages in thread
From: Ingo Molnar @ 2009-09-03 21:18 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Jeremy Fitzhardinge, mingo, linux-kernel, jeremy.fitzhardinge,
	stable, tglx, linux-tip-commits, Tejun Heo, Arjan van de Ven


* H. Peter Anvin <hpa@zytor.com> wrote:

> On 09/03/2009 01:45 PM, Jeremy Fitzhardinge wrote:
> > 
> > Two problems:
> > 
> >     * gcc generates %gs: references for stack-protector, but we use %fs
> >       for percpu data (because restoring %fs is faster if it's a null
> >       selector; TLS uses %gs).  I guess we could use %fs if
> >       !CONFIG_CC_STACKPROTECTOR, or %gs if we are using it (though that
> >       has some fiddly ramifications for things like ptrace).
> 
> Well, by touching two segments we're getting the worst of both 
> worlds, so at least assuming some significant number of real-world 
> deployments use CC_STACKPROTECTOR, we really don't want to 
> pessimize that case too much.

Fedora has stackprotector enabled so it's used in a widespread way.

	Ingo


* Re: [tip:x86/asm] x86/i386: Make sure stack-protector segment base is cache aligned
  2009-09-03 21:18         ` Ingo Molnar
@ 2009-09-03 21:21           ` H. Peter Anvin
  2009-09-04 14:15           ` Arjan van de Ven
  1 sibling, 0 replies; 29+ messages in thread
From: H. Peter Anvin @ 2009-09-03 21:21 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Jeremy Fitzhardinge, mingo, linux-kernel, jeremy.fitzhardinge,
	stable, tglx, linux-tip-commits, Tejun Heo, Arjan van de Ven

On 09/03/2009 02:18 PM, Ingo Molnar wrote:
> 
> * H. Peter Anvin <hpa@zytor.com> wrote:
> 
>> On 09/03/2009 01:45 PM, Jeremy Fitzhardinge wrote:
>>>
>>> Two problems:
>>>
>>>     * gcc generates %gs: references for stack-protector, but we use %fs
>>>       for percpu data (because restoring %fs is faster if it's a null
>>>       selector; TLS uses %gs).  I guess we could use %fs if
>>>       !CONFIG_CC_STACKPROTECTOR, or %gs if we are using it (though that
>>>       has some fiddly ramifications for things like ptrace).
>>
>> Well, by touching two segments we're getting the worst of both 
>> worlds, so at least assuming some significant number of real-world 
>> deployments use CC_STACKPROTECTOR, we really don't want to 
>> pessimize that case too much.
> 
> Fedora has stackprotector enabled so it's used in a widespread way.
> 
> 	Ingo

I'm guessing most distros do, except perhaps embedded ones.

	-hpa


* Re: [tip:x86/asm] x86/i386: Make sure stack-protector segment base is cache aligned
  2009-09-03 21:15       ` H. Peter Anvin
  2009-09-03 21:18         ` Ingo Molnar
@ 2009-09-03 21:28         ` Jeremy Fitzhardinge
  2009-09-04  2:51         ` Tejun Heo
  2 siblings, 0 replies; 29+ messages in thread
From: Jeremy Fitzhardinge @ 2009-09-03 21:28 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: mingo, linux-kernel, jeremy.fitzhardinge, stable, tglx, mingo,
	linux-tip-commits, Tejun Heo

On 09/03/09 14:15, H. Peter Anvin wrote:
> On 09/03/2009 01:45 PM, Jeremy Fitzhardinge wrote:
>   
>> Two problems:
>>
>>     * gcc generates %gs: references for stack-protector, but we use %fs
>>       for percpu data (because restoring %fs is faster if it's a null
>>       selector; TLS uses %gs).  I guess we could use %fs if
>>       !CONFIG_CC_STACKPROTECTOR, or %gs if we are using it (though that
>>       has some fiddly ramifications for things like ptrace).
>>     
> Well, by touching two segments we're getting the worst of both worlds,
> so at least assuming some significant number of real-world deployments
> use CC_STACKPROTECTOR, we really don't want to pessimize that case too much.
>   

I'm assuming that stack-protector has a fairly serious performance impact
anyway, so a bit of extra entry/exit cost is acceptable.  But I agree
that there's no point in making it gratuitously bad.

    J


* Re: [PATCH] x86/i386: make sure stack-protector segment base is cache aligned
  2009-09-03 21:07     ` Eric Dumazet
@ 2009-09-03 21:31       ` Jeremy Fitzhardinge
  2009-09-04  7:58         ` [tip:x86/asm] x86/i386: Put aligned stack-canary in percpu shared_aligned section tip-bot for Jeremy Fitzhardinge
  0 siblings, 1 reply; 29+ messages in thread
From: Jeremy Fitzhardinge @ 2009-09-03 21:31 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: H. Peter Anvin, the arch/x86 maintainers, Linux Kernel Mailing List

On 09/03/09 14:07, Eric Dumazet wrote:
> I was referring to .data.percpu alignement requirements, not to false sharing.
>   

Yes, I know.  But the intent of DECLARE_PER_CPU_SHARED_ALIGNED is to
avoid false sharing, and so it does no alignment when CONFIG_SMP isn't
enabled.  We need alignment regardless.

> When we put a object with an align(64) requirement into a section, linker
> has to put this 2**6 alignment in resulting section.
>
> When several .o are linked together, linker has to take the biggest alignement,
> and has to put holes.
>
> Check .data.percpu size in vmlinux before and after your patch, to make sure
> it doesnt grow too much :)
>
> therefore, ____cacheline_aligned objects should be placed in
> .data.percpu.shared_aligned
>   

That section doesn't exist without SMP.  (Well,
PER_CPU_SHARED_ALIGNED_SECTION isn't defined.)

Anyway, this should sort it out.

    J

From 69022dd952e65608afe33372e902b41cb94ff126 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Thu, 3 Sep 2009 14:17:27 -0700
Subject: [PATCH] x86/32: put aligned stack-canary in percpu shared_aligned section

Pack aligned things together into a special section to minimize padding
holes.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: Eric Dumazet <eric.dumazet@gmail.com>

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index e597ecc..ac7e796 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -413,7 +413,7 @@ struct stack_canary {
 	char __pad[20];		/* canary at %gs:20 */
 	unsigned long canary;
 };
-DECLARE_PER_CPU(struct stack_canary, stack_canary) ____cacheline_aligned;
+DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
 #endif
 #endif	/* X86_64 */
 
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index e338b5c..1e8181c 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1043,7 +1043,7 @@ DEFINE_PER_CPU(struct orig_ist, orig_ist);
 #else	/* CONFIG_X86_64 */
 
 #ifdef CONFIG_CC_STACKPROTECTOR
-DEFINE_PER_CPU(struct stack_canary, stack_canary) ____cacheline_aligned;
+DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
 #endif
 
 /* Make sure %fs and %gs are initialized properly in idle threads */
diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h
index aa00800..90079c3 100644
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -81,14 +81,17 @@ extern void setup_per_cpu_areas(void);
 
 #ifdef MODULE
 #define PER_CPU_SHARED_ALIGNED_SECTION ""
+#define PER_CPU_ALIGNED_SECTION ""
 #else
 #define PER_CPU_SHARED_ALIGNED_SECTION ".shared_aligned"
+#define PER_CPU_ALIGNED_SECTION ".shared_aligned"
 #endif
 #define PER_CPU_FIRST_SECTION ".first"
 
 #else
 
 #define PER_CPU_SHARED_ALIGNED_SECTION ""
+#define PER_CPU_ALIGNED_SECTION ".shared_aligned"
 #define PER_CPU_FIRST_SECTION ""
 
 #endif
diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index 68438e1..3058cf9 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -66,6 +66,14 @@
 	DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
 	____cacheline_aligned_in_smp
 
+#define DECLARE_PER_CPU_ALIGNED(type, name)				\
+	DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION)	\
+	____cacheline_aligned
+
+#define DEFINE_PER_CPU_ALIGNED(type, name)				\
+	DEFINE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION)	\
+	____cacheline_aligned
+
 /*
  * Declaration/definition used for per-CPU variables that must be page aligned.
  */
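
With this in place, the i386 canary definition from the patch expands, via
DEFINE_PER_CPU_SECTION, into roughly the following (a sketch of the era's
macro machinery, not the literal preprocessor output; the 64 stands in for
SMP_CACHE_BYTES):

struct stack_canary {
	char __pad[20];		/* canary at %gs:20 */
	unsigned long canary;
};

/* DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary) becomes,
 * approximately, a per_cpu__-prefixed variable placed in the packed
 * .data.percpu.shared_aligned section and padded to a cache line: */
__attribute__((section(".data.percpu.shared_aligned")))
struct stack_canary per_cpu__stack_canary
	__attribute__((__aligned__(64)));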




* Re: [tip:x86/asm] x86/i386: Make sure stack-protector segment base is cache aligned
  2009-09-03 21:15       ` H. Peter Anvin
  2009-09-03 21:18         ` Ingo Molnar
  2009-09-03 21:28         ` Jeremy Fitzhardinge
@ 2009-09-04  2:51         ` Tejun Heo
  2009-09-04  2:59           ` Tejun Heo
  2 siblings, 1 reply; 29+ messages in thread
From: Tejun Heo @ 2009-09-04  2:51 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Jeremy Fitzhardinge, mingo, linux-kernel, jeremy.fitzhardinge,
	stable, tglx, mingo, linux-tip-commits

Hello,

H. Peter Anvin wrote:
> On 09/03/2009 01:45 PM, Jeremy Fitzhardinge wrote:
>> Two problems:
>>
>>     * gcc generates %gs: references for stack-protector, but we use %fs
>>       for percpu data (because restoring %fs is faster if it's a null
>>       selector; TLS uses %gs).  I guess we could use %fs if
>>       !CONFIG_CC_STACKPROTECTOR, or %gs if we are using it (though that
>>       has some fiddly ramifications for things like ptrace).
> 
> Well, by touching two segments we're getting the worst of both worlds,
> so at least assuming some significant number of real-world deployments
> use CC_STACKPROTECTOR, we really don't want to pessimize that case too much.

Yes, this one definitely seems doable.  BTW, how much performance does
CC_STACKPROTECTOR cost?  That's an ambiguous question, but really any
number would help to develop a general sense.  Considering Fedora is
doing it by default, I assume it isn't too high?

>>     * The i386 percpu %fs base is offset by -__per_cpu_start from the
>>       percpu variables, so we can directly refer to %fs:per_cpu__foo. 
>>       I'm not sure what it would take to unify i386 to use the same
>>       scheme as x86-64.
> 
> OK, I was under the impression that that had already been done (and no,
> I didn't bother to look at the code.)  I guess I was wrong (and yes,
> this is an absolute precondition.)

I tried this a while ago but hit an obstacle (I don't remember exactly
what it was now) and decided the conversion wasn't worth the trouble.
IIRC, it was something substantial.  I'll dig through my trees.

Thanks.

-- 
tejun


* Re: [tip:x86/asm] x86/i386: Make sure stack-protector segment base is cache aligned
  2009-09-04  2:51         ` Tejun Heo
@ 2009-09-04  2:59           ` Tejun Heo
  2009-09-04  3:35             ` H. Peter Anvin
  0 siblings, 1 reply; 29+ messages in thread
From: Tejun Heo @ 2009-09-04  2:59 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Jeremy Fitzhardinge, mingo, linux-kernel, jeremy.fitzhardinge,
	stable, tglx, mingo, linux-tip-commits

Tejun Heo wrote:
> Hello,
> 
> H. Peter Anvin wrote:
>> On 09/03/2009 01:45 PM, Jeremy Fitzhardinge wrote:
>>> Two problems:
>>>
>>>     * gcc generates %gs: references for stack-protector, but we use %fs
>>>       for percpu data (because restoring %fs is faster if it's a null
>>>       selector; TLS uses %gs).  I guess we could use %fs if
>>>       !CONFIG_CC_STACKPROTECTOR, or %gs if we are using it (though that
>>>       has some fiddly ramifications for things like ptrace).
>> Well, by touching two segments we're getting the worst of both worlds,
>> so at least assuming some significant number of real-world deployments
>> use CC_STACKPROTECTOR, we really don't want to pessimize that case too much.
> 
> Yes, this one definitely seems doable.  BTW, how much performance does
> CC_STACKPROTECTOR cost?  That's an ambiguous question but really any
> number would help to develop a general sense.  Considering fedora is
> doing it by default, I assume it isn't too high?

Another question.  Other than saving and loading an extra segment
register on kernel entry/exit, it doesn't look like using the same or
different segment registers would make a difference performance-wise.
If I'm interpreting the wording in the optimization manual correctly,
it means that each non-zero segment based memory access will be costly
regardless of which specific segment register is in use, and there's no
way we can merge segment based dereferences for stackprotector and
percpu variables.

Thanks.

-- 
tejun


* Re: [tip:x86/asm] x86/i386: Make sure stack-protector segment base is cache aligned
  2009-09-04  2:59           ` Tejun Heo
@ 2009-09-04  3:35             ` H. Peter Anvin
  2009-09-04  3:47               ` Tejun Heo
  2009-09-04 16:01               ` Jeremy Fitzhardinge
  0 siblings, 2 replies; 29+ messages in thread
From: H. Peter Anvin @ 2009-09-04  3:35 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Jeremy Fitzhardinge, mingo, linux-kernel, jeremy.fitzhardinge,
	stable, tglx, mingo, linux-tip-commits

On 09/03/2009 07:59 PM, Tejun Heo wrote:
> 
> Another question.  Other than saving and loading an extra segment
> register on kernel entry/exit, whether using the same or different
> segment registers doesn't look like would make difference
> performance-wise.  If I'm interpreting the wording in the optimization
> manual correctly, it means that each non-zero segment based memory
> access will be costly regardless of which specific segment register is
> in use and there's no way we can merge segment based dereferences for
> stackprotector and percpu variables.
> 

It's correct that it doesn't make any difference for access, only for load.

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.



* Re: [tip:x86/asm] x86/i386: Make sure stack-protector segment base is cache aligned
  2009-09-04  3:35             ` H. Peter Anvin
@ 2009-09-04  3:47               ` Tejun Heo
  2009-09-04  3:51                 ` H. Peter Anvin
  2009-09-04 16:01               ` Jeremy Fitzhardinge
  1 sibling, 1 reply; 29+ messages in thread
From: Tejun Heo @ 2009-09-04  3:47 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Jeremy Fitzhardinge, mingo, linux-kernel, jeremy.fitzhardinge,
	stable, tglx, mingo, linux-tip-commits

H. Peter Anvin wrote:
> On 09/03/2009 07:59 PM, Tejun Heo wrote:
>> Another question.  Other than saving and loading an extra segment
>> register on kernel entry/exit, whether using the same or different
>> segment registers doesn't look like would make difference
>> performance-wise.  If I'm interpreting the wording in the optimization
>> manual correctly, it means that each non-zero segment based memory
>> access will be costly regardless of which specific segment register is
>> in use and there's no way we can merge segment based dereferences for
>> stackprotector and percpu variables.
>>
> 
> It's correct that it doesn't make any difference for access, only for load.

Heh... here's a naive and hopeful plan.  How about we beg the gcc
developers to allow a different segment register and offset in newer
gcc versions, and then use the same one when building with the new gcc?
This should solve the i386 problem too.  It would be best, as we get to
keep a segment register separate from userland.  Too hopeful?

Thanks.

-- 
tejun


* Re: [tip:x86/asm] x86/i386: Make sure stack-protector segment base is cache aligned
  2009-09-04  3:47               ` Tejun Heo
@ 2009-09-04  3:51                 ` H. Peter Anvin
  2009-09-04  5:06                   ` Tejun Heo
  0 siblings, 1 reply; 29+ messages in thread
From: H. Peter Anvin @ 2009-09-04  3:51 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Jeremy Fitzhardinge, mingo, linux-kernel, jeremy.fitzhardinge,
	stable, tglx, mingo, linux-tip-commits

On 09/03/2009 08:47 PM, Tejun Heo wrote:
> H. Peter Anvin wrote:
>> On 09/03/2009 07:59 PM, Tejun Heo wrote:
>>> Another question.  Other than saving and loading an extra segment
>>> register on kernel entry/exit, whether using the same or different
>>> segment registers doesn't look like would make difference
>>> performance-wise.  If I'm interpreting the wording in the optimization
>>> manual correctly, it means that each non-zero segment based memory
>>> access will be costly regardless of which specific segment register is
>>> in use and there's no way we can merge segment based dereferences for
>>> stackprotector and percpu variables.
>>>
>> It's correct that it doesn't make any difference for access, only for load.
> 
> Heh... here's a naive and hopeful plan.  How about we beg gcc
> developers to allow different segment register and offset in newer gcc
> versions and then use the same one when building with the new gcc?
> This should solve the i386 problem too.  It would be the best as we
> get to keep the separate segment register from the userland.  Too
> hopeful?

I think it's possible to set the register in more recent gcc.  Doing the
sane thing and having a symbol for an offset is probably worse.

I can talk to H.J. Lu about this tomorrow.

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.



* Re: [tip:x86/asm] x86/i386: Make sure stack-protector segment base is cache aligned
  2009-09-04  3:51                 ` H. Peter Anvin
@ 2009-09-04  5:06                   ` Tejun Heo
  2009-09-04  5:12                     ` Ingo Molnar
  2009-09-04 16:04                     ` Jeremy Fitzhardinge
  0 siblings, 2 replies; 29+ messages in thread
From: Tejun Heo @ 2009-09-04  5:06 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Jeremy Fitzhardinge, mingo, linux-kernel, jeremy.fitzhardinge,
	stable, tglx, mingo, linux-tip-commits

H. Peter Anvin wrote:
> On 09/03/2009 08:47 PM, Tejun Heo wrote:
>> H. Peter Anvin wrote:
>>> On 09/03/2009 07:59 PM, Tejun Heo wrote:
>>>> Another question.  Other than saving and loading an extra segment
>>>> register on kernel entry/exit, whether using the same or different
>>>> segment registers doesn't look like would make difference
>>>> performance-wise.  If I'm interpreting the wording in the optimization
>>>> manual correctly, it means that each non-zero segment based memory
>>>> access will be costly regardless of which specific segment register is
>>>> in use and there's no way we can merge segment based dereferences for
>>>> stackprotector and percpu variables.
>>>>
>>> It's correct that it doesn't make any difference for access, only for load.
>> Heh... here's a naive and hopeful plan.  How about we beg gcc
>> developers to allow different segment register and offset in newer gcc
>> versions and then use the same one when building with the new gcc?
>> This should solve the i386 problem too.  It would be the best as we
>> get to keep the separate segment register from the userland.  Too
>> hopeful?
> 
> I think it's possible to set the register in more recent gcc.  Doing the
> sane thing and having a symbol for an offset is probably worse.

I was thinking about altering the build process so that we can use sed
to substitute %gs:40 with %fs:40 while compiling.  If it's already
possible to override the register in more recent gcc, no need to go
into that horror.

> I can talk to H.J. Lu about this tomorrow.

Great, please keep us posted.

Thanks.

-- 
tejun


* Re: [tip:x86/asm] x86/i386: Make sure stack-protector segment base is cache aligned
  2009-09-04  5:06                   ` Tejun Heo
@ 2009-09-04  5:12                     ` Ingo Molnar
  2009-09-04 16:04                     ` Jeremy Fitzhardinge
  1 sibling, 0 replies; 29+ messages in thread
From: Ingo Molnar @ 2009-09-04  5:12 UTC (permalink / raw)
  To: Tejun Heo
  Cc: H. Peter Anvin, Jeremy Fitzhardinge, mingo, linux-kernel,
	jeremy.fitzhardinge, stable, tglx, linux-tip-commits


* Tejun Heo <tj@kernel.org> wrote:

> H. Peter Anvin wrote:
> > On 09/03/2009 08:47 PM, Tejun Heo wrote:
> >> H. Peter Anvin wrote:
> >>> On 09/03/2009 07:59 PM, Tejun Heo wrote:
> >>>> Another question.  Other than saving and loading an extra segment
> >>>> register on kernel entry/exit, whether using the same or different
> >>>> segment registers doesn't look like would make difference
> >>>> performance-wise.  If I'm interpreting the wording in the optimization
> >>>> manual correctly, it means that each non-zero segment based memory
> >>>> access will be costly regardless of which specific segment register is
> >>>> in use and there's no way we can merge segment based dereferences for
> >>>> stackprotector and percpu variables.
> >>>>
> >>> It's correct that it doesn't make any difference for access, only for load.
> >> Heh... here's a naive and hopeful plan.  How about we beg gcc
> >> developers to allow different segment register and offset in newer gcc
> >> versions and then use the same one when building with the new gcc?
> >> This should solve the i386 problem too.  It would be the best as we
> >> get to keep the separate segment register from the userland.  Too
> >> hopeful?
> > 
> > I think it's possible to set the register in more recent gcc.  
> > Doing the sane thing and having a symbol for an offset is 
> > probably worse.
> 
> I was thinking about altering the build process so that we can use 
> sed to substitute %gs:40 with %fs:40 while compiling.  If it's 
> already possible to override the register in more recent gcc, no 
> need to go into that horror.
> 
> > I can talk to H.J. Lu about this tomorrow.
> 
> Great, please keep us posted.

Yeah - if so, then this should definitely be handled in the compiler.

	Ingo


* [tip:x86/asm] x86/i386: Put aligned stack-canary in percpu shared_aligned section
  2009-09-03 21:31       ` Jeremy Fitzhardinge
@ 2009-09-04  7:58         ` tip-bot for Jeremy Fitzhardinge
  0 siblings, 0 replies; 29+ messages in thread
From: tip-bot for Jeremy Fitzhardinge @ 2009-09-04  7:58 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, hpa, mingo, eric.dumazet, jeremy.fitzhardinge,
	jeremy, tj, tglx, mingo

Commit-ID:  53f824520b6d84ca5b4a8fd71addc91dbf64357e
Gitweb:     http://git.kernel.org/tip/53f824520b6d84ca5b4a8fd71addc91dbf64357e
Author:     Jeremy Fitzhardinge <jeremy@goop.org>
AuthorDate: Thu, 3 Sep 2009 14:31:44 -0700
Committer:  Ingo Molnar <mingo@elte.hu>
CommitDate: Fri, 4 Sep 2009 07:10:31 +0200

x86/i386: Put aligned stack-canary in percpu shared_aligned section

Pack aligned things together into a special section to minimize
padding holes.

Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: Tejun Heo <tj@kernel.org>
LKML-Reference: <4AA035C0.9070202@goop.org>
[ queued up in tip:x86/asm because it depends on this commit:
  x86/i386: Make sure stack-protector segment base is cache aligned ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>


---
 arch/x86/include/asm/processor.h |    2 +-
 arch/x86/kernel/cpu/common.c     |    2 +-
 include/asm-generic/percpu.h     |    3 +++
 include/linux/percpu-defs.h      |    8 ++++++++
 4 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index e597ecc..ac7e796 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -413,7 +413,7 @@ struct stack_canary {
 	char __pad[20];		/* canary at %gs:20 */
 	unsigned long canary;
 };
-DECLARE_PER_CPU(struct stack_canary, stack_canary) ____cacheline_aligned;
+DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
 #endif
 #endif	/* X86_64 */
 
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 7d84bc4..f23e236 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1043,7 +1043,7 @@ DEFINE_PER_CPU(struct orig_ist, orig_ist);
 #else	/* CONFIG_X86_64 */
 
 #ifdef CONFIG_CC_STACKPROTECTOR
-DEFINE_PER_CPU(struct stack_canary, stack_canary) ____cacheline_aligned;
+DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
 #endif
 
 /* Make sure %fs and %gs are initialized properly in idle threads */
diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h
index aa00800..90079c3 100644
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -81,14 +81,17 @@ extern void setup_per_cpu_areas(void);
 
 #ifdef MODULE
 #define PER_CPU_SHARED_ALIGNED_SECTION ""
+#define PER_CPU_ALIGNED_SECTION ""
 #else
 #define PER_CPU_SHARED_ALIGNED_SECTION ".shared_aligned"
+#define PER_CPU_ALIGNED_SECTION ".shared_aligned"
 #endif
 #define PER_CPU_FIRST_SECTION ".first"
 
 #else
 
 #define PER_CPU_SHARED_ALIGNED_SECTION ""
+#define PER_CPU_ALIGNED_SECTION ".shared_aligned"
 #define PER_CPU_FIRST_SECTION ""
 
 #endif
diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index 68438e1..3058cf9 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -66,6 +66,14 @@
 	DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
 	____cacheline_aligned_in_smp
 
+#define DECLARE_PER_CPU_ALIGNED(type, name)				\
+	DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION)	\
+	____cacheline_aligned
+
+#define DEFINE_PER_CPU_ALIGNED(type, name)				\
+	DEFINE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION)	\
+	____cacheline_aligned
+
 /*
  * Declaration/definition used for per-CPU variables that must be page aligned.
  */


* Re: [tip:x86/asm] x86/i386: Make sure stack-protector segment base is cache aligned
  2009-09-03 21:18         ` Ingo Molnar
  2009-09-03 21:21           ` H. Peter Anvin
@ 2009-09-04 14:15           ` Arjan van de Ven
  2009-09-04 15:59             ` Jeremy Fitzhardinge
  2009-09-04 16:06             ` H. Peter Anvin
  1 sibling, 2 replies; 29+ messages in thread
From: Arjan van de Ven @ 2009-09-04 14:15 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: H. Peter Anvin, Jeremy Fitzhardinge, mingo, linux-kernel,
	jeremy.fitzhardinge, stable, tglx, linux-tip-commits, Tejun Heo

On Thu, 3 Sep 2009 23:18:05 +0200
Ingo Molnar <mingo@elte.hu> wrote:

> 
> * H. Peter Anvin <hpa@zytor.com> wrote:
> 
> > On 09/03/2009 01:45 PM, Jeremy Fitzhardinge wrote:
> > > 
> > > Two problems:
> > > 
> > >     * gcc generates %gs: references for stack-protector, but we
> > > use %fs for percpu data (because restoring %fs is faster if it's
> > > a null selector; TLS uses %gs).  I guess we could use %fs if
> > >       !CONFIG_CC_STACKPROTECTOR, or %gs if we are using it
> > > (though that has some fiddly ramifications for things like
> > > ptrace).
> > 
> > Well, by touching two segments we're getting the worst of both 
> > worlds, so at least assuming some significant number of real-world 
> > deployments use CC_STACKPROTECTOR, we really don't want to 
> > pessimize that case too much.
> 
> Fedora has stackprotector enabled so it's used in a widespread way.
> 
> 	Ingo

The other issue is that AFAIK we want the kernel to use a different
register than userspace does...


-- 
Arjan van de Ven 	Intel Open Source Technology Centre
For development, discussion and tips for power savings, 
visit http://www.lesswatts.org


* Re: [tip:x86/asm] x86/i386: Make sure stack-protector segment base is cache aligned
  2009-09-04 14:15           ` Arjan van de Ven
@ 2009-09-04 15:59             ` Jeremy Fitzhardinge
  2009-09-04 16:06             ` H. Peter Anvin
  1 sibling, 0 replies; 29+ messages in thread
From: Jeremy Fitzhardinge @ 2009-09-04 15:59 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Ingo Molnar, H. Peter Anvin, mingo, linux-kernel,
	jeremy.fitzhardinge, stable, tglx, linux-tip-commits, Tejun Heo

On 09/04/09 07:15, Arjan van de Ven wrote:
> On Thu, 3 Sep 2009 23:18:05 +0200
> Ingo Molnar <mingo@elte.hu> wrote:
>
>   
>> * H. Peter Anvin <hpa@zytor.com> wrote:
>>
>>     
>>> On 09/03/2009 01:45 PM, Jeremy Fitzhardinge wrote:
>>>       
>>>> Two problems:
>>>>
>>>>     * gcc generates %gs: references for stack-protector, but we
>>>> use %fs for percpu data (because restoring %fs is faster if it's
>>>> a null selector; TLS uses %gs).  I guess we could use %fs if
>>>>       !CONFIG_CC_STACKPROTECTOR, or %gs if we are using it
>>>> (though that has some fiddly ramifications for things like
>>>> ptrace).
>>>>         
>>> Well, by touching two segments we're getting the worst of both 
>>> worlds, so at least assuming some significant number of real-world 
>>> deployments use CC_STACKPROTECTOR, we really don't want to 
>>> pessimize that case too much.
>>>       
>> Fedora has stackprotector enabled so it's used in a widespread way.
>>
>> 	Ingo
>>     
> the other issue is that afaik we want the kernel to use a different
> register than userspace does...
>   

We do for percpu (%fs), but gcc always generates %gs references for
stack-protector.  The difference between "pop %seg" for a null vs
non-null selector was fairly small (a couple of cycles), so using %gs
when stack-protector is enabled isn't a huge deal.  To put it another
way, calling one stack-protected function in kernel mode would probably
make up the difference between using %fs vs %gs.

    J
>
>   
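
A rough, hypothetical illustration of the codegen in question (not from
this thread; exact frame offsets and register choices vary with the gcc
version): any function with a large enough local buffer, compiled with
"gcc -m32 -O2 -fstack-protector", gets a canary check against the
hard-coded address %gs:20, which is what ties 32-bit stack-protector to
%gs.

	#include <string.h>

	/*
	 * gcc's prologue/epilogue for this look roughly like:
	 *
	 *	movl	%gs:20, %eax		# load live canary
	 *	movl	%eax, 76(%esp)		# stash a copy in the frame
	 *	...
	 *	movl	76(%esp), %eax		# reload the copy
	 *	xorl	%gs:20, %eax		# compare with live canary
	 *	je	.Lok
	 *	call	__stack_chk_fail
	 * .Lok:
	 *	ret
	 */
	void copy_name(char *dst, const char *src)
	{
		char buf[64];

		strncpy(buf, src, sizeof(buf) - 1);
		buf[sizeof(buf) - 1] = '\0';
		strcpy(dst, buf);
	}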


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [tip:x86/asm] x86/i386: Make sure stack-protector segment base is cache aligned
  2009-09-04  3:35             ` H. Peter Anvin
  2009-09-04  3:47               ` Tejun Heo
@ 2009-09-04 16:01               ` Jeremy Fitzhardinge
  2009-09-04 16:52                 ` H. Peter Anvin
  1 sibling, 1 reply; 29+ messages in thread
From: Jeremy Fitzhardinge @ 2009-09-04 16:01 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Tejun Heo, mingo, linux-kernel, jeremy.fitzhardinge, stable,
	tglx, mingo, linux-tip-commits

On 09/03/09 20:35, H. Peter Anvin wrote:
> On 09/03/2009 07:59 PM, Tejun Heo wrote:
>   
>> Another question.  Other than saving and loading an extra segment
>> register on kernel entry/exit, using the same or different segment
>> registers doesn't look like it would make a difference
>> performance-wise.  If I'm interpreting the wording in the optimization
>> manual correctly, it means that each non-zero segment based memory
>> access will be costly regardless of which specific segment register is
>> in use and there's no way we can merge segment based dereferences for
>> stackprotector and percpu variables.
>>
>>     
> It's correct that it doesn't make any difference for access, only for load.
>   

Well, to be completely precise, restore.  When returning to usermode,
the "pop %seg" is slightly faster if you're restoring a null selector,
which is typically the case for %fs as 32-bit usermode doesn't use it.

    J

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [tip:x86/asm] x86/i386: Make sure stack-protector segment base is cache aligned
  2009-09-04  5:06                   ` Tejun Heo
  2009-09-04  5:12                     ` Ingo Molnar
@ 2009-09-04 16:04                     ` Jeremy Fitzhardinge
  2009-09-04 16:09                       ` Tejun Heo
  2009-09-04 16:13                       ` H. Peter Anvin
  1 sibling, 2 replies; 29+ messages in thread
From: Jeremy Fitzhardinge @ 2009-09-04 16:04 UTC (permalink / raw)
  To: Tejun Heo
  Cc: H. Peter Anvin, mingo, linux-kernel, jeremy.fitzhardinge, stable,
	tglx, mingo, linux-tip-commits

On 09/03/09 22:06, Tejun Heo wrote:
>>> Heh... here's a naive and hopeful plan.  How about we beg gcc
>>> developers to allow a different segment register and offset in newer
>>> gcc versions and then use the same one when building with the new gcc?
>>> This should solve the i386 problem too.  It would be best, as we'd get
>>> to keep a segment register separate from userland's.  Too
>>> hopeful?
>>>       
>> I think it's possible to set the register in more recent gcc.  Doing the
>> sane thing and having a symbol for an offset is probably worse.
>>     
> I was thinking about altering the build process so that we can use sed
> to substitute %gs:40 with %fs:40 while compiling.  If it's already
> possible to override the register in more recent gcc, no need to go
> into that horror.
>   

Ideally we'd like to get rid of the constant offset too.  If we could
change it to %[fg]s:__gcc_stack_canary_offset on both 32-bit and 64-bit,
it would give us a lot more flexibility.  __gcc_stack_canary_offset
could be weakly defined to 20/40 for backwards compatibility, but we
could override it to point to a normal percpu variable.

    J
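
Purely as an illustration of the mechanism (hypothetical -- no gcc at the
time of this thread emitted a symbolic canary offset): the default could
be a weak absolute symbol provided by the toolchain, and anything wanting
a different layout, such as the kernel, would link in a strong definition,
e.g.:

	/*
	 * Hypothetical strong override; the toolchain would carry the weak
	 * default "__gcc_stack_canary_offset = 20" (or 40 on 64-bit) in its
	 * own support objects.  The symbol name and value here are made up
	 * for illustration only.
	 */
	asm(".globl __gcc_stack_canary_offset\n\t"
	    ".set __gcc_stack_canary_offset, 0x200");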

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [tip:x86/asm] x86/i386: Make sure stack-protector segment base is cache aligned
  2009-09-04 14:15           ` Arjan van de Ven
  2009-09-04 15:59             ` Jeremy Fitzhardinge
@ 2009-09-04 16:06             ` H. Peter Anvin
  1 sibling, 0 replies; 29+ messages in thread
From: H. Peter Anvin @ 2009-09-04 16:06 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Ingo Molnar, Jeremy Fitzhardinge, mingo, linux-kernel,
	jeremy.fitzhardinge, stable, tglx, linux-tip-commits, Tejun Heo

On 09/04/2009 07:15 AM, Arjan van de Ven wrote:
> 
> the other issue is that afaik we want the kernel to use a different
> register than userspace does...
> 

Yes, although it's a pretty marginal win on 32 bits as far as I know.
On 64 bits, not so.

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [tip:x86/asm] x86/i386: Make sure stack-protector segment base is cache aligned
  2009-09-04 16:04                     ` Jeremy Fitzhardinge
@ 2009-09-04 16:09                       ` Tejun Heo
  2009-09-04 16:13                       ` H. Peter Anvin
  1 sibling, 0 replies; 29+ messages in thread
From: Tejun Heo @ 2009-09-04 16:09 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: H. Peter Anvin, mingo, linux-kernel, jeremy.fitzhardinge, stable,
	tglx, mingo, linux-tip-commits

Jeremy Fitzhardinge wrote:
> On 09/03/09 22:06, Tejun Heo wrote:
>>>> Heh... here's a naive and hopeful plan.  How about we beg gcc
>>>> developers to allow a different segment register and offset in newer
>>>> gcc versions and then use the same one when building with the new gcc?
>>>> This should solve the i386 problem too.  It would be best, as we'd get
>>>> to keep a segment register separate from userland's.  Too
>>>> hopeful?
>>>>       
>>> I think it's possible to set the register in more recent gcc.  Doing the
>>> sane thing and having a symbol for an offset is probably worse.
>>>     
>> I was thinking about altering the build process so that we can use sed
>> to substitute %gs:40 with %fs:40 while compiling.  If it's already
>> possible to override the register in more recent gcc, no need to go
>> into that horror.
>>   
> 
> Ideally we'd like to get rid of the constant offset too.  If we could
> change it to %[fg]s:__gcc_stack_canary_offset on both 32-bit and 64-bit,
> it would give us a lot more flexibility.  __gcc_stack_canary_offset
> could be weakly defined to 20/40 for backwards compatibility, but we
> could override it to point to a normal percpu variable.

Yeap, being able to do that will also allow using a single segment
register on i386.  But given that the only overhead we're talking
about here is a few more cycles when entering and leaving the kernel,
I don't think we need to do anything drastic to optimize this.  I
think converting when gcc provides the feature should be enough.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [tip:x86/asm] x86/i386: Make sure stack-protector segment base is cache aligned
  2009-09-04 16:04                     ` Jeremy Fitzhardinge
  2009-09-04 16:09                       ` Tejun Heo
@ 2009-09-04 16:13                       ` H. Peter Anvin
  1 sibling, 0 replies; 29+ messages in thread
From: H. Peter Anvin @ 2009-09-04 16:13 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Tejun Heo, mingo, linux-kernel, jeremy.fitzhardinge, stable,
	tglx, mingo, linux-tip-commits

On 09/04/2009 09:04 AM, Jeremy Fitzhardinge wrote:
> 
> Ideally we'd like to get rid of the constant offset too.  If we could
> change it to %[fg]s:__gcc_stack_canary_offset on both 32-bit and 64-bit,
> it would give us a lot more flexibility.  __gcc_stack_canary_offset
> could be weakly defined to 20/40 for backwards compatibility, but we
> could override it to point to a normal percpu variable.
> 

Yes, although that definitely means starting the gcc pipeline from scratch.

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [tip:x86/asm] x86/i386: Make sure stack-protector segment base is cache aligned
  2009-09-04 16:01               ` Jeremy Fitzhardinge
@ 2009-09-04 16:52                 ` H. Peter Anvin
  2009-09-04 16:57                   ` Jeremy Fitzhardinge
  0 siblings, 1 reply; 29+ messages in thread
From: H. Peter Anvin @ 2009-09-04 16:52 UTC (permalink / raw)
  To: Jeremy Fitzhardinge
  Cc: Tejun Heo, mingo, linux-kernel, jeremy.fitzhardinge, stable,
	tglx, mingo, linux-tip-commits

On 09/04/2009 09:01 AM, Jeremy Fitzhardinge wrote:
>>>     
>> It's correct that it doesn't make any difference for access, only for load.
>>   
> 
> Well, to be completely precise, restore.  When returning to usermode,
> the "pop %seg" is slightly faster if you're restoring a null selector,
> which is typically the case for %fs as 32-bit usermode doesn't use it.
> 

That's a segment load...

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.


^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [tip:x86/asm] x86/i386: Make sure stack-protector segment base is cache aligned
  2009-09-04 16:52                 ` H. Peter Anvin
@ 2009-09-04 16:57                   ` Jeremy Fitzhardinge
  0 siblings, 0 replies; 29+ messages in thread
From: Jeremy Fitzhardinge @ 2009-09-04 16:57 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Tejun Heo, mingo, linux-kernel, jeremy.fitzhardinge, stable,
	tglx, mingo, linux-tip-commits

On 09/04/09 09:52, H. Peter Anvin wrote:
> On 09/04/2009 09:01 AM, Jeremy Fitzhardinge wrote:
>   
>>>>     
>>>>         
>>> It's correct that it doesn't make any difference for access, only for load.
>>>   
>>>       
>> Well, to be completely precise, restore.  When returning to usermode,
>> the "pop %seg" is slightly faster if you're restoring a null selector,
>> which is typically the case for %fs as 32-bit usermode doesn't use it.
>>
>>     
> That's a segment load...

Yeah, OK.

    J


^ permalink raw reply	[flat|nested] 29+ messages in thread

end of thread, other threads:[~2009-09-04 17:32 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-09-03 19:27 [PATCH] x86/i386: make sure stack-protector segment base is cache aligned Jeremy Fitzhardinge
2009-09-03 19:47 ` Eric Dumazet
2009-09-03 20:41   ` Jeremy Fitzhardinge
2009-09-03 21:07     ` Eric Dumazet
2009-09-03 21:31       ` Jeremy Fitzhardinge
2009-09-04  7:58         ` [tip:x86/asm] x86/i386: Put aligned stack-canary in percpu shared_aligned section tip-bot for Jeremy Fitzhardinge
2009-09-03 20:03 ` [tip:x86/asm] x86/i386: Make sure stack-protector segment base is cache aligned tip-bot for Jeremy Fitzhardinge
2009-09-03 20:26   ` H. Peter Anvin
2009-09-03 20:45     ` Jeremy Fitzhardinge
2009-09-03 21:15       ` H. Peter Anvin
2009-09-03 21:18         ` Ingo Molnar
2009-09-03 21:21           ` H. Peter Anvin
2009-09-04 14:15           ` Arjan van de Ven
2009-09-04 15:59             ` Jeremy Fitzhardinge
2009-09-04 16:06             ` H. Peter Anvin
2009-09-03 21:28         ` Jeremy Fitzhardinge
2009-09-04  2:51         ` Tejun Heo
2009-09-04  2:59           ` Tejun Heo
2009-09-04  3:35             ` H. Peter Anvin
2009-09-04  3:47               ` Tejun Heo
2009-09-04  3:51                 ` H. Peter Anvin
2009-09-04  5:06                   ` Tejun Heo
2009-09-04  5:12                     ` Ingo Molnar
2009-09-04 16:04                     ` Jeremy Fitzhardinge
2009-09-04 16:09                       ` Tejun Heo
2009-09-04 16:13                       ` H. Peter Anvin
2009-09-04 16:01               ` Jeremy Fitzhardinge
2009-09-04 16:52                 ` H. Peter Anvin
2009-09-04 16:57                   ` Jeremy Fitzhardinge

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.