All of lore.kernel.org
 help / color / mirror / Atom feed
* arm: Only load TLS values when needed
@ 2013-07-15 17:14 ` André Hentschel
  0 siblings, 0 replies; 28+ messages in thread
From: André Hentschel @ 2013-07-15 17:14 UTC (permalink / raw)
  To: linux-arch
  Cc: Russell King - ARM Linux, linux-arm-kernel, linux-kernel,
	Will Deacon, Jonathan Austin

From: André Hentschel <nerv@dawncrow.de>

This patch intends to reduce loading instructions when the resulting value is not used.
It's a follow up on a4780adeefd042482f624f5e0d577bf9cdcbb760

Signed-off-by: André Hentschel <nerv@dawncrow.de>

---
This patch is against Linux 3.11-rc1 (ad81f0545ef01ea651886dddac4bef6cec930092)

Thanks to everyone who helped me with a4780adeefd042482f624f5e0d577bf9cdcbb760

diff --git a/arch/arm/include/asm/tls.h b/arch/arm/include/asm/tls.h
index 83259b8..3742722 100644
--- a/arch/arm/include/asm/tls.h
+++ b/arch/arm/include/asm/tls.h
@@ -3,29 +3,32 @@
 
 #ifdef __ASSEMBLY__
 #include <asm/asm-offsets.h>
-	.macro switch_tls_none, base, tp, tpuser, tmp1, tmp2
+	.macro switch_tls_none, prev, next, tp, tpuser, tmp1, tmp2
 	.endm
 
-	.macro switch_tls_v6k, base, tp, tpuser, tmp1, tmp2
+	.macro switch_tls_v6k, prev, next, tp, tpuser, tmp1, tmp2
+	ldrd	\tp, \tpuser, [\next, #TI_TP_VALUE]	@ get the next TLS and user r/w register
 	mrc	p15, 0, \tmp2, c13, c0, 2	@ get the user r/w register
 	mcr	p15, 0, \tp, c13, c0, 3		@ set TLS register
 	mcr	p15, 0, \tpuser, c13, c0, 2	@ and the user r/w register
-	str	\tmp2, [\base, #TI_TP_VALUE + 4] @ save it
+	str	\tmp2, [\prev, #TI_TP_VALUE + 4] @ save it
 	.endm
 
-	.macro switch_tls_v6, base, tp, tpuser, tmp1, tmp2
+	.macro switch_tls_v6, prev, next, tp, tpuser, tmp1, tmp2
 	ldr	\tmp1, =elf_hwcap
 	ldr	\tmp1, [\tmp1, #0]
 	mov	\tmp2, #0xffff0fff
+	ldr	\tp, [\next, #TI_TP_VALUE]	@ get the next TLS register
 	tst	\tmp1, #HWCAP_TLS		@ hardware TLS available?
 	streq	\tp, [\tmp2, #-15]		@ set TLS value at 0xffff0ff0
-	mrcne	p15, 0, \tmp2, c13, c0, 2	@ get the user r/w register
+	mrcne	p15, 0, \tmp2, c13, c0, 2	@ get the previous user r/w register
+	ldrne	\tpuser, [\next, #TI_TP_VALUE + 4]	@ get the next user r/w register
 	mcrne	p15, 0, \tp, c13, c0, 3		@ yes, set TLS register
 	mcrne	p15, 0, \tpuser, c13, c0, 2	@ set user r/w register
-	strne	\tmp2, [\base, #TI_TP_VALUE + 4] @ save it
+	strne	\tmp2, [\prev, #TI_TP_VALUE + 4] @ save it
 	.endm
 
-	.macro switch_tls_software, base, tp, tpuser, tmp1, tmp2
+	.macro switch_tls_software, prev, next, tp, tpuser, tmp1, tmp2
 	mov	\tmp1, #0xffff0fff
 	str	\tp, [\tmp1, #-15]		@ set TLS value at 0xffff0ff0
 	.endm
diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
index a39cfc2a1..1484b59 100644
--- a/arch/arm/kernel/entry-armv.S
+++ b/arch/arm/kernel/entry-armv.S
@@ -689,12 +689,10 @@ ENTRY(__switch_to)
  THUMB(	stmia	ip!, {r4 - sl, fp}	   )	@ Store most regs on stack
  THUMB(	str	sp, [ip], #4		   )
  THUMB(	str	lr, [ip], #4		   )
-	ldr	r4, [r2, #TI_TP_VALUE]
-	ldr	r5, [r2, #TI_TP_VALUE + 4]
 #ifdef CONFIG_CPU_USE_DOMAINS
 	ldr	r6, [r2, #TI_CPU_DOMAIN]
 #endif
-	switch_tls r1, r4, r5, r3, r7
+	switch_tls r1, r2, r4, r5, r3, r7
 #if defined(CONFIG_CC_STACKPROTECTOR) && !defined(CONFIG_SMP)
 	ldr	r7, [r2, #TI_TASK]
 	ldr	r8, =__stack_chk_guard

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* arm: Only load TLS values when needed
@ 2013-07-15 17:14 ` André Hentschel
  0 siblings, 0 replies; 28+ messages in thread
From: André Hentschel @ 2013-07-15 17:14 UTC (permalink / raw)
  To: linux-arch
  Cc: Russell King - ARM Linux, linux-arm-kernel, linux-kernel,
	Will Deacon, Jonathan Austin

From: André Hentschel <nerv@dawncrow.de>

This patch intends to reduce loading instructions when the resulting value is not used.
It's a follow up on a4780adeefd042482f624f5e0d577bf9cdcbb760

Signed-off-by: André Hentschel <nerv@dawncrow.de>

---
This patch is against Linux 3.11-rc1 (ad81f0545ef01ea651886dddac4bef6cec930092)

Thanks to everyone who helped me with a4780adeefd042482f624f5e0d577bf9cdcbb760

diff --git a/arch/arm/include/asm/tls.h b/arch/arm/include/asm/tls.h
index 83259b8..3742722 100644
--- a/arch/arm/include/asm/tls.h
+++ b/arch/arm/include/asm/tls.h
@@ -3,29 +3,32 @@
 
 #ifdef __ASSEMBLY__
 #include <asm/asm-offsets.h>
-	.macro switch_tls_none, base, tp, tpuser, tmp1, tmp2
+	.macro switch_tls_none, prev, next, tp, tpuser, tmp1, tmp2
 	.endm
 
-	.macro switch_tls_v6k, base, tp, tpuser, tmp1, tmp2
+	.macro switch_tls_v6k, prev, next, tp, tpuser, tmp1, tmp2
+	ldrd	\tp, \tpuser, [\next, #TI_TP_VALUE]	@ get the next TLS and user r/w register
 	mrc	p15, 0, \tmp2, c13, c0, 2	@ get the user r/w register
 	mcr	p15, 0, \tp, c13, c0, 3		@ set TLS register
 	mcr	p15, 0, \tpuser, c13, c0, 2	@ and the user r/w register
-	str	\tmp2, [\base, #TI_TP_VALUE + 4] @ save it
+	str	\tmp2, [\prev, #TI_TP_VALUE + 4] @ save it
 	.endm
 
-	.macro switch_tls_v6, base, tp, tpuser, tmp1, tmp2
+	.macro switch_tls_v6, prev, next, tp, tpuser, tmp1, tmp2
 	ldr	\tmp1, =elf_hwcap
 	ldr	\tmp1, [\tmp1, #0]
 	mov	\tmp2, #0xffff0fff
+	ldr	\tp, [\next, #TI_TP_VALUE]	@ get the next TLS register
 	tst	\tmp1, #HWCAP_TLS		@ hardware TLS available?
 	streq	\tp, [\tmp2, #-15]		@ set TLS value at 0xffff0ff0
-	mrcne	p15, 0, \tmp2, c13, c0, 2	@ get the user r/w register
+	mrcne	p15, 0, \tmp2, c13, c0, 2	@ get the previous user r/w register
+	ldrne	\tpuser, [\next, #TI_TP_VALUE + 4]	@ get the next user r/w register
 	mcrne	p15, 0, \tp, c13, c0, 3		@ yes, set TLS register
 	mcrne	p15, 0, \tpuser, c13, c0, 2	@ set user r/w register
-	strne	\tmp2, [\base, #TI_TP_VALUE + 4] @ save it
+	strne	\tmp2, [\prev, #TI_TP_VALUE + 4] @ save it
 	.endm
 
-	.macro switch_tls_software, base, tp, tpuser, tmp1, tmp2
+	.macro switch_tls_software, prev, next, tp, tpuser, tmp1, tmp2
 	mov	\tmp1, #0xffff0fff
 	str	\tp, [\tmp1, #-15]		@ set TLS value at 0xffff0ff0
 	.endm
diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
index a39cfc2a1..1484b59 100644
--- a/arch/arm/kernel/entry-armv.S
+++ b/arch/arm/kernel/entry-armv.S
@@ -689,12 +689,10 @@ ENTRY(__switch_to)
  THUMB(	stmia	ip!, {r4 - sl, fp}	   )	@ Store most regs on stack
  THUMB(	str	sp, [ip], #4		   )
  THUMB(	str	lr, [ip], #4		   )
-	ldr	r4, [r2, #TI_TP_VALUE]
-	ldr	r5, [r2, #TI_TP_VALUE + 4]
 #ifdef CONFIG_CPU_USE_DOMAINS
 	ldr	r6, [r2, #TI_CPU_DOMAIN]
 #endif
-	switch_tls r1, r4, r5, r3, r7
+	switch_tls r1, r2, r4, r5, r3, r7
 #if defined(CONFIG_CC_STACKPROTECTOR) && !defined(CONFIG_SMP)
 	ldr	r7, [r2, #TI_TASK]
 	ldr	r8, =__stack_chk_guard

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* arm: Only load TLS values when needed
@ 2013-07-15 17:14 ` André Hentschel
  0 siblings, 0 replies; 28+ messages in thread
From: André Hentschel @ 2013-07-15 17:14 UTC (permalink / raw)
  To: linux-arm-kernel

From: André Hentschel <nerv@dawncrow.de>

This patch intends to reduce loading instructions when the resulting value is not used.
It's a follow up on a4780adeefd042482f624f5e0d577bf9cdcbb760

Signed-off-by: André Hentschel <nerv@dawncrow.de>

---
This patch is against Linux 3.11-rc1 (ad81f0545ef01ea651886dddac4bef6cec930092)

Thanks to everyone who helped me with a4780adeefd042482f624f5e0d577bf9cdcbb760

diff --git a/arch/arm/include/asm/tls.h b/arch/arm/include/asm/tls.h
index 83259b8..3742722 100644
--- a/arch/arm/include/asm/tls.h
+++ b/arch/arm/include/asm/tls.h
@@ -3,29 +3,32 @@
 
 #ifdef __ASSEMBLY__
 #include <asm/asm-offsets.h>
-	.macro switch_tls_none, base, tp, tpuser, tmp1, tmp2
+	.macro switch_tls_none, prev, next, tp, tpuser, tmp1, tmp2
 	.endm
 
-	.macro switch_tls_v6k, base, tp, tpuser, tmp1, tmp2
+	.macro switch_tls_v6k, prev, next, tp, tpuser, tmp1, tmp2
+	ldrd	\tp, \tpuser, [\next, #TI_TP_VALUE]	@ get the next TLS and user r/w register
 	mrc	p15, 0, \tmp2, c13, c0, 2	@ get the user r/w register
 	mcr	p15, 0, \tp, c13, c0, 3		@ set TLS register
 	mcr	p15, 0, \tpuser, c13, c0, 2	@ and the user r/w register
-	str	\tmp2, [\base, #TI_TP_VALUE + 4] @ save it
+	str	\tmp2, [\prev, #TI_TP_VALUE + 4] @ save it
 	.endm
 
-	.macro switch_tls_v6, base, tp, tpuser, tmp1, tmp2
+	.macro switch_tls_v6, prev, next, tp, tpuser, tmp1, tmp2
 	ldr	\tmp1, =elf_hwcap
 	ldr	\tmp1, [\tmp1, #0]
 	mov	\tmp2, #0xffff0fff
+	ldr	\tp, [\next, #TI_TP_VALUE]	@ get the next TLS register
 	tst	\tmp1, #HWCAP_TLS		@ hardware TLS available?
 	streq	\tp, [\tmp2, #-15]		@ set TLS value at 0xffff0ff0
-	mrcne	p15, 0, \tmp2, c13, c0, 2	@ get the user r/w register
+	mrcne	p15, 0, \tmp2, c13, c0, 2	@ get the previous user r/w register
+	ldrne	\tpuser, [\next, #TI_TP_VALUE + 4]	@ get the next user r/w register
 	mcrne	p15, 0, \tp, c13, c0, 3		@ yes, set TLS register
 	mcrne	p15, 0, \tpuser, c13, c0, 2	@ set user r/w register
-	strne	\tmp2, [\base, #TI_TP_VALUE + 4] @ save it
+	strne	\tmp2, [\prev, #TI_TP_VALUE + 4] @ save it
 	.endm
 
-	.macro switch_tls_software, base, tp, tpuser, tmp1, tmp2
+	.macro switch_tls_software, prev, next, tp, tpuser, tmp1, tmp2
 	mov	\tmp1, #0xffff0fff
 	str	\tp, [\tmp1, #-15]		@ set TLS value at 0xffff0ff0
 	.endm
diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
index a39cfc2a1..1484b59 100644
--- a/arch/arm/kernel/entry-armv.S
+++ b/arch/arm/kernel/entry-armv.S
@@ -689,12 +689,10 @@ ENTRY(__switch_to)
  THUMB(	stmia	ip!, {r4 - sl, fp}	   )	@ Store most regs on stack
  THUMB(	str	sp, [ip], #4		   )
  THUMB(	str	lr, [ip], #4		   )
-	ldr	r4, [r2, #TI_TP_VALUE]
-	ldr	r5, [r2, #TI_TP_VALUE + 4]
 #ifdef CONFIG_CPU_USE_DOMAINS
 	ldr	r6, [r2, #TI_CPU_DOMAIN]
 #endif
-	switch_tls r1, r4, r5, r3, r7
+	switch_tls r1, r2, r4, r5, r3, r7
 #if defined(CONFIG_CC_STACKPROTECTOR) && !defined(CONFIG_SMP)
 	ldr	r7, [r2, #TI_TASK]
 	ldr	r8, =__stack_chk_guard

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: arm: Only load TLS values when needed
  2013-07-15 17:14 ` André Hentschel
  (?)
@ 2013-07-16 17:31   ` Jonathan Austin
  -1 siblings, 0 replies; 28+ messages in thread
From: Jonathan Austin @ 2013-07-16 17:31 UTC (permalink / raw)
  To: André Hentschel
  Cc: linux-arch, Russell King - ARM Linux, linux-arm-kernel,
	linux-kernel, Will Deacon

Hi André,

On 15/07/13 18:14, André Hentschel wrote:
> From: André Hentschel <nerv@dawncrow.de>
>
> This patch intends to reduce loading instructions when the resulting value is not used.
> It's a follow up on a4780adeefd042482f624f5e0d577bf9cdcbb760
>

Have you done any benchmarking to see that this has any real impact? Or 
tested on a !v6k system? It looks possible that the only case where 
this will perform better is where we're using switch_tls_none or 
switch_tls_software (both rare cases, I think) and there's some chance 
it will make things worse in other cases?

One of the reasons for Russell's suggestion of placing the ldrd (which 
became the two ldr instructions you've removed from __switch_to, in 
order to maintain building for older cores) where it is was in order to 
reduce the chance of pipeline stalls.

As I've pointed out below, there is some risk that changing that has 
implications for the v6 only case below (and the v6k case is now more 
prone to stalls with !CONFIG_CPU_USE_DOMAINS, but newer cores should 
have more advanced scheduling to avoid such issues anyway...)

> Signed-off-by: André Hentschel <nerv@dawncrow.de>
>
> ---
> This patch is against Linux 3.11-rc1 (ad81f0545ef01ea651886dddac4bef6cec930092)
>
> Thanks to everyone who helped me with a4780adeefd042482f624f5e0d577bf9cdcbb760
>
> diff --git a/arch/arm/include/asm/tls.h b/arch/arm/include/asm/tls.h
> index 83259b8..3742722 100644
> --- a/arch/arm/include/asm/tls.h
> +++ b/arch/arm/include/asm/tls.h
> @@ -3,29 +3,32 @@
>
>   #ifdef __ASSEMBLY__
>   #include <asm/asm-offsets.h>
> -	.macro switch_tls_none, base, tp, tpuser, tmp1, tmp2
> +	.macro switch_tls_none, prev, next, tp, tpuser, tmp1, tmp2
>   	.endm
>
> -	.macro switch_tls_v6k, base, tp, tpuser, tmp1, tmp2
> +	.macro switch_tls_v6k, prev, next, tp, tpuser, tmp1, tmp2
> +	ldrd	\tp, \tpuser, [\next, #TI_TP_VALUE]	@ get the next TLS and user r/w register
>   	mrc	p15, 0, \tmp2, c13, c0, 2	@ get the user r/w register
>   	mcr	p15, 0, \tp, c13, c0, 3		@ set TLS register
>   	mcr	p15, 0, \tpuser, c13, c0, 2	@ and the user r/w register
> -	str	\tmp2, [\base, #TI_TP_VALUE + 4] @ save it
> +	str	\tmp2, [\prev, #TI_TP_VALUE + 4] @ save it
>   	.endm
>
> -	.macro switch_tls_v6, base, tp, tpuser, tmp1, tmp2
> +	.macro switch_tls_v6, prev, next, tp, tpuser, tmp1, tmp2
>   	ldr	\tmp1, =elf_hwcap
>   	ldr	\tmp1, [\tmp1, #0]
>   	mov	\tmp2, #0xffff0fff
> +	ldr	\tp, [\next, #TI_TP_VALUE]	@ get the next TLS register
>   	tst	\tmp1, #HWCAP_TLS		@ hardware TLS available?
>   	streq	\tp, [\tmp2, #-15]		@ set TLS value at 0xffff0ff0
> -	mrcne	p15, 0, \tmp2, c13, c0, 2	@ get the user r/w register
> +	mrcne	p15, 0, \tmp2, c13, c0, 2	@ get the previous user r/w register
> +	ldrne	\tpuser, [\next, #TI_TP_VALUE + 4]	@ get the next user r/w register
>   	mcrne	p15, 0, \tp, c13, c0, 3		@ yes, set TLS register
>   	mcrne	p15, 0, \tpuser, c13, c0, 2	@ set user r/w register

Now we've only got one instruction between the store and the load and 
risk stalling the pipeline...

Dave M cautiously says "The ancient advice was that one instruction was 
enough" but this is very core dependent... I wonder if anyone has a good 
idea about whether this is an issue here...?

> -	strne	\tmp2, [\base, #TI_TP_VALUE + 4] @ save it
> +	strne	\tmp2, [\prev, #TI_TP_VALUE + 4] @ save it
>   	.endm
>
> -	.macro switch_tls_software, base, tp, tpuser, tmp1, tmp2
> +	.macro switch_tls_software, prev, next, tp, tpuser, tmp1, tmp2
>   	mov	\tmp1, #0xffff0fff
>   	str	\tp, [\tmp1, #-15]		@ set TLS value at 0xffff0ff0
>   	.endm
> diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
> index a39cfc2a1..1484b59 100644
> --- a/arch/arm/kernel/entry-armv.S
> +++ b/arch/arm/kernel/entry-armv.S
> @@ -689,12 +689,10 @@ ENTRY(__switch_to)
>    THUMB(	stmia	ip!, {r4 - sl, fp}	   )	@ Store most regs on stack
>    THUMB(	str	sp, [ip], #4		   )
>    THUMB(	str	lr, [ip], #4		   )
> -	ldr	r4, [r2, #TI_TP_VALUE]
> -	ldr	r5, [r2, #TI_TP_VALUE + 4]
>   #ifdef CONFIG_CPU_USE_DOMAINS
>   	ldr	r6, [r2, #TI_CPU_DOMAIN]
>   #endif
> -	switch_tls r1, r4, r5, r3, r7
> +	switch_tls r1, r2, r4, r5, r3, r7
>   #if defined(CONFIG_CC_STACKPROTECTOR) && !defined(CONFIG_SMP)
>   	ldr	r7, [r2, #TI_TASK]
>   	ldr	r8, =__stack_chk_guard
>

Jonny


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: arm: Only load TLS values when needed
@ 2013-07-16 17:31   ` Jonathan Austin
  0 siblings, 0 replies; 28+ messages in thread
From: Jonathan Austin @ 2013-07-16 17:31 UTC (permalink / raw)
  To: André Hentschel
  Cc: linux-arch, Russell King - ARM Linux, linux-arm-kernel,
	linux-kernel, Will Deacon

Hi André,

On 15/07/13 18:14, André Hentschel wrote:
> From: André Hentschel <nerv@dawncrow.de>
>
> This patch intends to reduce loading instructions when the resulting value is not used.
> It's a follow up on a4780adeefd042482f624f5e0d577bf9cdcbb760
>

Have you done any benchmarking to see that this has any real impact? Or 
tested on a !v6k system? It looks possible that the only case where 
this will perform better is where we're using switch_tls_none or 
switch_tls_software (both rare cases, I think) and there's some chance 
it will make things worse in other cases?

One of the reasons for Russell's suggestion of placing the ldrd (which 
became the two ldr instructions you've removed from __switch_to, in 
order to maintain building for older cores) where it is was in order to 
reduce the chance of pipeline stalls.

As I've pointed out below, there is some risk that changing that has 
implications for the v6 only case below (and the v6k case is now more 
prone to stalls with !CONFIG_CPU_USE_DOMAINS, but newer cores should 
have more advanced scheduling to avoid such issues anyway...)

> Signed-off-by: André Hentschel <nerv@dawncrow.de>
>
> ---
> This patch is against Linux 3.11-rc1 (ad81f0545ef01ea651886dddac4bef6cec930092)
>
> Thanks to everyone who helped me with a4780adeefd042482f624f5e0d577bf9cdcbb760
>
> diff --git a/arch/arm/include/asm/tls.h b/arch/arm/include/asm/tls.h
> index 83259b8..3742722 100644
> --- a/arch/arm/include/asm/tls.h
> +++ b/arch/arm/include/asm/tls.h
> @@ -3,29 +3,32 @@
>
>   #ifdef __ASSEMBLY__
>   #include <asm/asm-offsets.h>
> -	.macro switch_tls_none, base, tp, tpuser, tmp1, tmp2
> +	.macro switch_tls_none, prev, next, tp, tpuser, tmp1, tmp2
>   	.endm
>
> -	.macro switch_tls_v6k, base, tp, tpuser, tmp1, tmp2
> +	.macro switch_tls_v6k, prev, next, tp, tpuser, tmp1, tmp2
> +	ldrd	\tp, \tpuser, [\next, #TI_TP_VALUE]	@ get the next TLS and user r/w register
>   	mrc	p15, 0, \tmp2, c13, c0, 2	@ get the user r/w register
>   	mcr	p15, 0, \tp, c13, c0, 3		@ set TLS register
>   	mcr	p15, 0, \tpuser, c13, c0, 2	@ and the user r/w register
> -	str	\tmp2, [\base, #TI_TP_VALUE + 4] @ save it
> +	str	\tmp2, [\prev, #TI_TP_VALUE + 4] @ save it
>   	.endm
>
> -	.macro switch_tls_v6, base, tp, tpuser, tmp1, tmp2
> +	.macro switch_tls_v6, prev, next, tp, tpuser, tmp1, tmp2
>   	ldr	\tmp1, =elf_hwcap
>   	ldr	\tmp1, [\tmp1, #0]
>   	mov	\tmp2, #0xffff0fff
> +	ldr	\tp, [\next, #TI_TP_VALUE]	@ get the next TLS register
>   	tst	\tmp1, #HWCAP_TLS		@ hardware TLS available?
>   	streq	\tp, [\tmp2, #-15]		@ set TLS value at 0xffff0ff0
> -	mrcne	p15, 0, \tmp2, c13, c0, 2	@ get the user r/w register
> +	mrcne	p15, 0, \tmp2, c13, c0, 2	@ get the previous user r/w register
> +	ldrne	\tpuser, [\next, #TI_TP_VALUE + 4]	@ get the next user r/w register
>   	mcrne	p15, 0, \tp, c13, c0, 3		@ yes, set TLS register
>   	mcrne	p15, 0, \tpuser, c13, c0, 2	@ set user r/w register

Now we've only got one instruction between the store and the load and 
risk stalling the pipeline...

Dave M cautiously says "The ancient advice was that one instruction was 
enough" but this is very core dependent... I wonder if anyone has a good 
idea about whether this is an issue here...?

> -	strne	\tmp2, [\base, #TI_TP_VALUE + 4] @ save it
> +	strne	\tmp2, [\prev, #TI_TP_VALUE + 4] @ save it
>   	.endm
>
> -	.macro switch_tls_software, base, tp, tpuser, tmp1, tmp2
> +	.macro switch_tls_software, prev, next, tp, tpuser, tmp1, tmp2
>   	mov	\tmp1, #0xffff0fff
>   	str	\tp, [\tmp1, #-15]		@ set TLS value at 0xffff0ff0
>   	.endm
> diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
> index a39cfc2a1..1484b59 100644
> --- a/arch/arm/kernel/entry-armv.S
> +++ b/arch/arm/kernel/entry-armv.S
> @@ -689,12 +689,10 @@ ENTRY(__switch_to)
>    THUMB(	stmia	ip!, {r4 - sl, fp}	   )	@ Store most regs on stack
>    THUMB(	str	sp, [ip], #4		   )
>    THUMB(	str	lr, [ip], #4		   )
> -	ldr	r4, [r2, #TI_TP_VALUE]
> -	ldr	r5, [r2, #TI_TP_VALUE + 4]
>   #ifdef CONFIG_CPU_USE_DOMAINS
>   	ldr	r6, [r2, #TI_CPU_DOMAIN]
>   #endif
> -	switch_tls r1, r4, r5, r3, r7
> +	switch_tls r1, r2, r4, r5, r3, r7
>   #if defined(CONFIG_CC_STACKPROTECTOR) && !defined(CONFIG_SMP)
>   	ldr	r7, [r2, #TI_TASK]
>   	ldr	r8, =__stack_chk_guard
>

Jonny

^ permalink raw reply	[flat|nested] 28+ messages in thread

* arm: Only load TLS values when needed
@ 2013-07-16 17:31   ` Jonathan Austin
  0 siblings, 0 replies; 28+ messages in thread
From: Jonathan Austin @ 2013-07-16 17:31 UTC (permalink / raw)
  To: linux-arm-kernel

Hi André,

On 15/07/13 18:14, André Hentschel wrote:
> From: André Hentschel <nerv@dawncrow.de>
>
> This patch intends to reduce loading instructions when the resulting value is not used.
> It's a follow up on a4780adeefd042482f624f5e0d577bf9cdcbb760
>

Have you done any benchmarking to see that this has any real impact? Or 
tested on a !v6k system? It looks possible that the only case where 
this will perform better is where we're using switch_tls_none or 
switch_tls_software (both rare cases, I think) and there's some chance 
it will make things worse in other cases?

One of the reasons for Russell's suggestion of placing the ldrd (which 
became the two ldr instructions you've removed from __switch_to, in 
order to maintain building for older cores) where it is was in order to 
reduce the chance of pipeline stalls.

As I've pointed out below, there is some risk that changing that has 
implications for the v6 only case below (and the v6k case is now more 
prone to stalls with !CONFIG_CPU_USE_DOMAINS, but newer cores should 
have more advanced scheduling to avoid such issues anyway...)

> Signed-off-by: André Hentschel <nerv@dawncrow.de>
>
> ---
> This patch is against Linux 3.11-rc1 (ad81f0545ef01ea651886dddac4bef6cec930092)
>
> Thanks to everyone who helped me with a4780adeefd042482f624f5e0d577bf9cdcbb760
>
> diff --git a/arch/arm/include/asm/tls.h b/arch/arm/include/asm/tls.h
> index 83259b8..3742722 100644
> --- a/arch/arm/include/asm/tls.h
> +++ b/arch/arm/include/asm/tls.h
> @@ -3,29 +3,32 @@
>
>   #ifdef __ASSEMBLY__
>   #include <asm/asm-offsets.h>
> -	.macro switch_tls_none, base, tp, tpuser, tmp1, tmp2
> +	.macro switch_tls_none, prev, next, tp, tpuser, tmp1, tmp2
>   	.endm
>
> -	.macro switch_tls_v6k, base, tp, tpuser, tmp1, tmp2
> +	.macro switch_tls_v6k, prev, next, tp, tpuser, tmp1, tmp2
> +	ldrd	\tp, \tpuser, [\next, #TI_TP_VALUE]	@ get the next TLS and user r/w register
>   	mrc	p15, 0, \tmp2, c13, c0, 2	@ get the user r/w register
>   	mcr	p15, 0, \tp, c13, c0, 3		@ set TLS register
>   	mcr	p15, 0, \tpuser, c13, c0, 2	@ and the user r/w register
> -	str	\tmp2, [\base, #TI_TP_VALUE + 4] @ save it
> +	str	\tmp2, [\prev, #TI_TP_VALUE + 4] @ save it
>   	.endm
>
> -	.macro switch_tls_v6, base, tp, tpuser, tmp1, tmp2
> +	.macro switch_tls_v6, prev, next, tp, tpuser, tmp1, tmp2
>   	ldr	\tmp1, =elf_hwcap
>   	ldr	\tmp1, [\tmp1, #0]
>   	mov	\tmp2, #0xffff0fff
> +	ldr	\tp, [\next, #TI_TP_VALUE]	@ get the next TLS register
>   	tst	\tmp1, #HWCAP_TLS		@ hardware TLS available?
>   	streq	\tp, [\tmp2, #-15]		@ set TLS value at 0xffff0ff0
> -	mrcne	p15, 0, \tmp2, c13, c0, 2	@ get the user r/w register
> +	mrcne	p15, 0, \tmp2, c13, c0, 2	@ get the previous user r/w register
> +	ldrne	\tpuser, [\next, #TI_TP_VALUE + 4]	@ get the next user r/w register
>   	mcrne	p15, 0, \tp, c13, c0, 3		@ yes, set TLS register
>   	mcrne	p15, 0, \tpuser, c13, c0, 2	@ set user r/w register

Now we've only got one instruction between the store and the load and 
risk stalling the pipeline...

Dave M cautiously says "The ancient advice was that one instruction was 
enough" but this is very core dependent... I wonder if anyone has a good 
idea about whether this is an issue here...?

> -	strne	\tmp2, [\base, #TI_TP_VALUE + 4] @ save it
> +	strne	\tmp2, [\prev, #TI_TP_VALUE + 4] @ save it
>   	.endm
>
> -	.macro switch_tls_software, base, tp, tpuser, tmp1, tmp2
> +	.macro switch_tls_software, prev, next, tp, tpuser, tmp1, tmp2
>   	mov	\tmp1, #0xffff0fff
>   	str	\tp, [\tmp1, #-15]		@ set TLS value at 0xffff0ff0
>   	.endm
> diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
> index a39cfc2a1..1484b59 100644
> --- a/arch/arm/kernel/entry-armv.S
> +++ b/arch/arm/kernel/entry-armv.S
> @@ -689,12 +689,10 @@ ENTRY(__switch_to)
>    THUMB(	stmia	ip!, {r4 - sl, fp}	   )	@ Store most regs on stack
>    THUMB(	str	sp, [ip], #4		   )
>    THUMB(	str	lr, [ip], #4		   )
> -	ldr	r4, [r2, #TI_TP_VALUE]
> -	ldr	r5, [r2, #TI_TP_VALUE + 4]
>   #ifdef CONFIG_CPU_USE_DOMAINS
>   	ldr	r6, [r2, #TI_CPU_DOMAIN]
>   #endif
> -	switch_tls r1, r4, r5, r3, r7
> +	switch_tls r1, r2, r4, r5, r3, r7
>   #if defined(CONFIG_CC_STACKPROTECTOR) && !defined(CONFIG_SMP)
>   	ldr	r7, [r2, #TI_TASK]
>   	ldr	r8, =__stack_chk_guard
>

Jonny

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: arm: Only load TLS values when needed
  2013-07-16 17:31   ` Jonathan Austin
  (?)
@ 2013-07-16 19:27     ` André Hentschel
  -1 siblings, 0 replies; 28+ messages in thread
From: André Hentschel @ 2013-07-16 19:27 UTC (permalink / raw)
  To: Jonathan Austin
  Cc: linux-arch, Russell King - ARM Linux, linux-arm-kernel,
	linux-kernel, Will Deacon

Hi Jonathan,
First, thank you for your review.

Am 16.07.2013 19:31, schrieb Jonathan Austin:
> Hi André,
> 
> On 15/07/13 18:14, André Hentschel wrote:
>> From: André Hentschel <nerv@dawncrow.de>
>>
>> This patch intends to reduce loading instructions when the resulting value is not used.
>> It's a follow up on a4780adeefd042482f624f5e0d577bf9cdcbb760
>>
> 
> Have you done any benchmarking to see that this has any real impact? Or tested on a !v6k system? It looks possible that the only case where this will perform better is where we're using switch_tls_none or switch_tls_software (both rare cases, I think) and there's some chance it will make things worse in other cases?

I have to admit that i only tested it on v6k and did no benchmark.

> One of the reasons for Russell's suggestion of placing the ldrd (which became the two ldr instructions you've removed from __switch_to, in order to maintain building for older cores) where it is was in order to reduce the chance of pipeline stalls.
> 
> As I've pointed out below, there is some risk that changing that has implications for the v6 only case below (and the v6k case is now more prone to stalls with !CONFIG_CPU_USE_DOMAINS, but newer cores should have more advanced scheduling to avoid such issues anyway...)

I'm not sure how this could make things worse on v6k, could you elaborate please?
Besides of the ldr and str being too close to each other i thought this patch is a good idea, because it removes two ldr which are always executed.
(Continuing below...)

>> Signed-off-by: André Hentschel <nerv@dawncrow.de>
>>
>> ---
>> This patch is against Linux 3.11-rc1 (ad81f0545ef01ea651886dddac4bef6cec930092)
>>
>> Thanks to everyone who helped me with a4780adeefd042482f624f5e0d577bf9cdcbb760
>>
>> diff --git a/arch/arm/include/asm/tls.h b/arch/arm/include/asm/tls.h
>> index 83259b8..3742722 100644
>> --- a/arch/arm/include/asm/tls.h
>> +++ b/arch/arm/include/asm/tls.h
>> @@ -3,29 +3,32 @@
>>
>>   #ifdef __ASSEMBLY__
>>   #include <asm/asm-offsets.h>
>> -    .macro switch_tls_none, base, tp, tpuser, tmp1, tmp2
>> +    .macro switch_tls_none, prev, next, tp, tpuser, tmp1, tmp2
>>       .endm
>>
>> -    .macro switch_tls_v6k, base, tp, tpuser, tmp1, tmp2
>> +    .macro switch_tls_v6k, prev, next, tp, tpuser, tmp1, tmp2
>> +    ldrd    \tp, \tpuser, [\next, #TI_TP_VALUE]    @ get the next TLS and user r/w register
>>       mrc    p15, 0, \tmp2, c13, c0, 2    @ get the user r/w register
>>       mcr    p15, 0, \tp, c13, c0, 3        @ set TLS register
>>       mcr    p15, 0, \tpuser, c13, c0, 2    @ and the user r/w register
>> -    str    \tmp2, [\base, #TI_TP_VALUE + 4] @ save it
>> +    str    \tmp2, [\prev, #TI_TP_VALUE + 4] @ save it
>>       .endm
>>
>> -    .macro switch_tls_v6, base, tp, tpuser, tmp1, tmp2
>> +    .macro switch_tls_v6, prev, next, tp, tpuser, tmp1, tmp2
>>       ldr    \tmp1, =elf_hwcap
>>       ldr    \tmp1, [\tmp1, #0]
>>       mov    \tmp2, #0xffff0fff
>> +    ldr    \tp, [\next, #TI_TP_VALUE]    @ get the next TLS register
>>       tst    \tmp1, #HWCAP_TLS        @ hardware TLS available?
>>       streq    \tp, [\tmp2, #-15]        @ set TLS value at 0xffff0ff0
>> -    mrcne    p15, 0, \tmp2, c13, c0, 2    @ get the user r/w register
>> +    mrcne    p15, 0, \tmp2, c13, c0, 2    @ get the previous user r/w register
>> +    ldrne    \tpuser, [\next, #TI_TP_VALUE + 4]    @ get the next user r/w register
>>       mcrne    p15, 0, \tp, c13, c0, 3        @ yes, set TLS register
>>       mcrne    p15, 0, \tpuser, c13, c0, 2    @ set user r/w register
> 
> Now we've only got one instruction between the store and the load and risk stalling the pipeline...
> 
> Dave M cautiously says "The ancient advice was that one instruction was enough" but this is very core dependent... I wonder if anyone has a good idea about whether this is an issue here...?

We could use a ldrd at the top, that'd be nearly what we have right now, don't we?



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: arm: Only load TLS values when needed
@ 2013-07-16 19:27     ` André Hentschel
  0 siblings, 0 replies; 28+ messages in thread
From: André Hentschel @ 2013-07-16 19:27 UTC (permalink / raw)
  To: Jonathan Austin
  Cc: linux-arch, Russell King - ARM Linux, linux-arm-kernel,
	linux-kernel, Will Deacon

Hi Jonathan,
First, thank you for your review.

Am 16.07.2013 19:31, schrieb Jonathan Austin:
> Hi André,
> 
> On 15/07/13 18:14, André Hentschel wrote:
>> From: André Hentschel <nerv@dawncrow.de>
>>
>> This patch intends to reduce loading instructions when the resulting value is not used.
>> It's a follow up on a4780adeefd042482f624f5e0d577bf9cdcbb760
>>
> 
> Have you done any benchmarking to see that this has any real impact? Or tested on a !v6k system? It looks possible that the only case where this will perform better is where we're using switch_tls_none or switch_tls_software (both rare cases, I think) and there's some chance it will make things worse in other cases?

I have to admit that i only tested it on v6k and did no benchmark.

> One of the reasons for Russell's suggestion of placing the ldrd (which became the two ldr instructions you've removed from __switch_to, in order to maintain building for older cores) where it is was in order to reduce the chance of pipeline stalls.
> 
> As I've pointed out below, there is some risk that changing that has implications for the v6 only case below (and the v6k case is now more prone to stalls with !CONFIG_CPU_USE_DOMAINS, but newer cores should have more advanced scheduling to avoid such issues anyway...)

I'm not sure how this could make things worse on v6k, could you elaborate please?
Besides of the ldr and str being too close to each other i thought this patch is a good idea, because it removes two ldr which are always executed.
(Continuing below...)

>> Signed-off-by: André Hentschel <nerv@dawncrow.de>
>>
>> ---
>> This patch is against Linux 3.11-rc1 (ad81f0545ef01ea651886dddac4bef6cec930092)
>>
>> Thanks to everyone who helped me with a4780adeefd042482f624f5e0d577bf9cdcbb760
>>
>> diff --git a/arch/arm/include/asm/tls.h b/arch/arm/include/asm/tls.h
>> index 83259b8..3742722 100644
>> --- a/arch/arm/include/asm/tls.h
>> +++ b/arch/arm/include/asm/tls.h
>> @@ -3,29 +3,32 @@
>>
>>   #ifdef __ASSEMBLY__
>>   #include <asm/asm-offsets.h>
>> -    .macro switch_tls_none, base, tp, tpuser, tmp1, tmp2
>> +    .macro switch_tls_none, prev, next, tp, tpuser, tmp1, tmp2
>>       .endm
>>
>> -    .macro switch_tls_v6k, base, tp, tpuser, tmp1, tmp2
>> +    .macro switch_tls_v6k, prev, next, tp, tpuser, tmp1, tmp2
>> +    ldrd    \tp, \tpuser, [\next, #TI_TP_VALUE]    @ get the next TLS and user r/w register
>>       mrc    p15, 0, \tmp2, c13, c0, 2    @ get the user r/w register
>>       mcr    p15, 0, \tp, c13, c0, 3        @ set TLS register
>>       mcr    p15, 0, \tpuser, c13, c0, 2    @ and the user r/w register
>> -    str    \tmp2, [\base, #TI_TP_VALUE + 4] @ save it
>> +    str    \tmp2, [\prev, #TI_TP_VALUE + 4] @ save it
>>       .endm
>>
>> -    .macro switch_tls_v6, base, tp, tpuser, tmp1, tmp2
>> +    .macro switch_tls_v6, prev, next, tp, tpuser, tmp1, tmp2
>>       ldr    \tmp1, =elf_hwcap
>>       ldr    \tmp1, [\tmp1, #0]
>>       mov    \tmp2, #0xffff0fff
>> +    ldr    \tp, [\next, #TI_TP_VALUE]    @ get the next TLS register
>>       tst    \tmp1, #HWCAP_TLS        @ hardware TLS available?
>>       streq    \tp, [\tmp2, #-15]        @ set TLS value at 0xffff0ff0
>> -    mrcne    p15, 0, \tmp2, c13, c0, 2    @ get the user r/w register
>> +    mrcne    p15, 0, \tmp2, c13, c0, 2    @ get the previous user r/w register
>> +    ldrne    \tpuser, [\next, #TI_TP_VALUE + 4]    @ get the next user r/w register
>>       mcrne    p15, 0, \tp, c13, c0, 3        @ yes, set TLS register
>>       mcrne    p15, 0, \tpuser, c13, c0, 2    @ set user r/w register
> 
> Now we've only got one instruction between the store and the load and risk stalling the pipeline...
> 
> Dave M cautiously says "The ancient advice was that one instruction was enough" but this is very core dependent... I wonder if anyone has a good idea about whether this is an issue here...?

We could use a ldrd at the top, that'd be nearly what we have right now, don't we?

^ permalink raw reply	[flat|nested] 28+ messages in thread

* arm: Only load TLS values when needed
@ 2013-07-16 19:27     ` André Hentschel
  0 siblings, 0 replies; 28+ messages in thread
From: André Hentschel @ 2013-07-16 19:27 UTC (permalink / raw)
  To: linux-arm-kernel

Hi Jonathan,
First, thank you for your review.

Am 16.07.2013 19:31, schrieb Jonathan Austin:
> Hi André,
> 
> On 15/07/13 18:14, André Hentschel wrote:
>> From: André Hentschel <nerv@dawncrow.de>
>>
>> This patch intends to reduce loading instructions when the resulting value is not used.
>> It's a follow up on a4780adeefd042482f624f5e0d577bf9cdcbb760
>>
> 
> Have you done any benchmarking to see that this has any real impact? Or tested on a !v6k system? It looks possible that the only case where this will perform better is where we're using switch_tls_none or switch_tls_software (both rare cases, I think) and there's some chance it will make things worse in other cases?

I have to admit that i only tested it on v6k and did no benchmark.

> One of the reasons for Russell's suggestion of placing the ldrd (which became the two ldr instructions you've removed from __switch_to, in order to maintain building for older cores) where it is was in order to reduce the chance of pipeline stalls.
> 
> As I've pointed out below, there is some risk that changing that has implications for the v6 only case below (and the v6k case is now more prone to stalls with !CONFIG_CPU_USE_DOMAINS, but newer cores should have more advanced scheduling to avoid such issues anyway...)

I'm not sure how this could make things worse on v6k, could you elaborate please?
Besides of the ldr and str being too close to each other i thought this patch is a good idea, because it removes two ldr which are always executed.
(Continuing below...)

>> Signed-off-by: Andr? Hentschel <nerv@dawncrow.de>
>>
>> ---
>> This patch is against Linux 3.11-rc1 (ad81f0545ef01ea651886dddac4bef6cec930092)
>>
>> Thanks to everyone who helped me with a4780adeefd042482f624f5e0d577bf9cdcbb760
>>
>> diff --git a/arch/arm/include/asm/tls.h b/arch/arm/include/asm/tls.h
>> index 83259b8..3742722 100644
>> --- a/arch/arm/include/asm/tls.h
>> +++ b/arch/arm/include/asm/tls.h
>> @@ -3,29 +3,32 @@
>>
>>   #ifdef __ASSEMBLY__
>>   #include <asm/asm-offsets.h>
>> -    .macro switch_tls_none, base, tp, tpuser, tmp1, tmp2
>> +    .macro switch_tls_none, prev, next, tp, tpuser, tmp1, tmp2
>>       .endm
>>
>> -    .macro switch_tls_v6k, base, tp, tpuser, tmp1, tmp2
>> +    .macro switch_tls_v6k, prev, next, tp, tpuser, tmp1, tmp2
>> +    ldrd    \tp, \tpuser, [\next, #TI_TP_VALUE]    @ get the next TLS and user r/w register
>>       mrc    p15, 0, \tmp2, c13, c0, 2    @ get the user r/w register
>>       mcr    p15, 0, \tp, c13, c0, 3        @ set TLS register
>>       mcr    p15, 0, \tpuser, c13, c0, 2    @ and the user r/w register
>> -    str    \tmp2, [\base, #TI_TP_VALUE + 4] @ save it
>> +    str    \tmp2, [\prev, #TI_TP_VALUE + 4] @ save it
>>       .endm
>>
>> -    .macro switch_tls_v6, base, tp, tpuser, tmp1, tmp2
>> +    .macro switch_tls_v6, prev, next, tp, tpuser, tmp1, tmp2
>>       ldr    \tmp1, =elf_hwcap
>>       ldr    \tmp1, [\tmp1, #0]
>>       mov    \tmp2, #0xffff0fff
>> +    ldr    \tp, [\next, #TI_TP_VALUE]    @ get the next TLS register
>>       tst    \tmp1, #HWCAP_TLS        @ hardware TLS available?
>>       streq    \tp, [\tmp2, #-15]        @ set TLS value at 0xffff0ff0
>> -    mrcne    p15, 0, \tmp2, c13, c0, 2    @ get the user r/w register
>> +    mrcne    p15, 0, \tmp2, c13, c0, 2    @ get the previous user r/w register
>> +    ldrne    \tpuser, [\next, #TI_TP_VALUE + 4]    @ get the next user r/w register
>>       mcrne    p15, 0, \tp, c13, c0, 3        @ yes, set TLS register
>>       mcrne    p15, 0, \tpuser, c13, c0, 2    @ set user r/w register
> 
> Now we've only got one instruction between the store and the load and risk stalling the pipeline...
> 
> Dave M cautiously says "The ancient advice was that one instruction was enough" but this is very core dependent... I wonder if anyone has a good idea about whether this is an issue here...?

We could use a ldrd at the top, that'd be nearly what we have right now, don't we?

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: arm: Only load TLS values when needed
  2013-07-16 19:27     ` André Hentschel
  (?)
@ 2013-07-17 11:10       ` Jonathan Austin
  -1 siblings, 0 replies; 28+ messages in thread
From: Jonathan Austin @ 2013-07-17 11:10 UTC (permalink / raw)
  To: André Hentschel
  Cc: linux-arch, Russell King - ARM Linux, linux-arm-kernel,
	linux-kernel, Will Deacon

Hi André,

On 16/07/13 20:27, André Hentschel wrote:
> Hi Jonathan, First, thank you for your review.
>
> Am 16.07.2013 19:31, schrieb Jonathan Austin:
>> Hi André,
>>
>> On 15/07/13 18:14, André Hentschel wrote:
>>> From: André Hentschel <nerv@dawncrow.de>
>>>
>>> This patch intends to reduce loading instructions when the
>>> resulting value is not used. It's a follow up on
>>> a4780adeefd042482f624f5e0d577bf9cdcbb760
>>>
>>
>> Have you done any benchmarking to see that this has any real
>> impact? Or tested on a !v6k system? It looks possible that the
>> only case where this will perform better is where we're using
>> switch_tls_none or switch_tls_software (both rare cases, I think)
>> and there's some chance it will make things worse in other cases?
>
> I have to admit that i only tested it on v6k and did no benchmark.
>
Do you have access to anything v6-NOT-k-ish? If not I can try and test 
this on something appropriate. How does your test-case access tpidrurw? 
If it uses inline asm then it won't work on v6-not-k, as those 
instructions aren't defined...

>> One of the reasons for Russell's suggestion of placing the ldrd
>> (which became the two ldr instructions you've removed from
>> __switch_to, in order to maintain building for older cores) where
>> it is was in order to reduce the chance of pipeline stalls.
>>
>> As I've pointed out below, there is some risk that changing that
>> has implications for the v6 only case below (and the v6k case is
>> now more prone to stalls with !CONFIG_CPU_USE_DOMAINS, but newer
>> cores should have more advanced scheduling to avoid such issues
>> anyway...)
>
> I'm not sure how this could make things worse on v6k, could you
> elaborate please? Besides of the ldr and str being too close to each
> other

Yea, that's the only issue, and in the !CONFIG_CPU_USE_DOMAINS case 
things are slightly worse than they were before

> i thought this patch is a good idea, because it removes two ldr
> which are always executed. (Continuing below...)

Indeed, as long as it doesn't cause pipeline stalls then that's a gain 
for some cases :)

[...]
>> Now we've only got one instruction between the store and the load
>> and risk stalling the pipeline...
>>
>> Dave M cautiously says "The ancient advice was that one instruction
>> was enough" but this is very core dependent... I wonder if anyone
>> has a good idea about whether this is an issue here...?
>
> We could use a ldrd at the top, that'd be nearly what we have right
> now, don't we?

Yea, that'd be good - as far as I can see from an 1136 TRM, the ldrd 
*may* be two cycles (depending on alignment of the words) but the ldr 
and ldrne will always be two cycles. Ahhh, the joys of modifying the 
fast path ;)

Jonny
>
>
>



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: arm: Only load TLS values when needed
@ 2013-07-17 11:10       ` Jonathan Austin
  0 siblings, 0 replies; 28+ messages in thread
From: Jonathan Austin @ 2013-07-17 11:10 UTC (permalink / raw)
  To: André Hentschel
  Cc: linux-arch, Russell King - ARM Linux, linux-arm-kernel,
	linux-kernel, Will Deacon

Hi André,

On 16/07/13 20:27, André Hentschel wrote:
> Hi Jonathan, First, thank you for your review.
>
> Am 16.07.2013 19:31, schrieb Jonathan Austin:
>> Hi André,
>>
>> On 15/07/13 18:14, André Hentschel wrote:
>>> From: André Hentschel <nerv@dawncrow.de>
>>>
>>> This patch intends to reduce loading instructions when the
>>> resulting value is not used. It's a follow up on
>>> a4780adeefd042482f624f5e0d577bf9cdcbb760
>>>
>>
>> Have you done any benchmarking to see that this has any real
>> impact? Or tested on a !v6k system? It looks possible that the
>> only case where this will perform better is where we're using
>> switch_tls_none or switch_tls_software (both rare cases, I think)
>> and there's some chance it will make things worse in other cases?
>
> I have to admit that i only tested it on v6k and did no benchmark.
>
Do you have access to anything v6-NOT-k-ish? If not I can try and test 
this on something appropriate. How does your test-case access tpidrurw? 
If it uses inline asm then it won't work on v6-not-k, as those 
instructions aren't defined...

>> One of the reasons for Russell's suggestion of placing the ldrd
>> (which became the two ldr instructions you've removed from
>> __switch_to, in order to maintain building for older cores) where
>> it is was in order to reduce the chance of pipeline stalls.
>>
>> As I've pointed out below, there is some risk that changing that
>> has implications for the v6 only case below (and the v6k case is
>> now more prone to stalls with !CONFIG_CPU_USE_DOMAINS, but newer
>> cores should have more advanced scheduling to avoid such issues
>> anyway...)
>
> I'm not sure how this could make things worse on v6k, could you
> elaborate please? Besides of the ldr and str being too close to each
> other

Yea, that's the only issue, and in the !CONFIG_CPU_USE_DOMAINS case 
things are slightly worse than they were before

> i thought this patch is a good idea, because it removes two ldr
> which are always executed. (Continuing below...)

Indeed, as long as it doesn't cause pipeline stalls then that's a gain 
for some cases :)

[...]
>> Now we've only got one instruction between the store and the load
>> and risk stalling the pipeline...
>>
>> Dave M cautiously says "The ancient advice was that one instruction
>> was enough" but this is very core dependent... I wonder if anyone
>> has a good idea about whether this is an issue here...?
>
> We could use a ldrd at the top, that'd be nearly what we have right
> now, don't we?

Yea, that'd be good - as far as I can see from an 1136 TRM, the ldrd 
*may* be two cycles (depending on alignment of the words) but the ldr 
and ldrne will always be two cycles. Ahhh, the joys of modifying the 
fast path ;)

Jonny
>
>
>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* arm: Only load TLS values when needed
@ 2013-07-17 11:10       ` Jonathan Austin
  0 siblings, 0 replies; 28+ messages in thread
From: Jonathan Austin @ 2013-07-17 11:10 UTC (permalink / raw)
  To: linux-arm-kernel

Hi André,

On 16/07/13 20:27, André Hentschel wrote:
> Hi Jonathan, First, thank you for your review.
>
> Am 16.07.2013 19:31, schrieb Jonathan Austin:
>> Hi André,
>>
>> On 15/07/13 18:14, André Hentschel wrote:
>>> From: André Hentschel <nerv@dawncrow.de>
>>>
>>> This patch intends to reduce loading instructions when the
>>> resulting value is not used. It's a follow up on
>>> a4780adeefd042482f624f5e0d577bf9cdcbb760
>>>
>>
>> Have you done any benchmarking to see that this has any real
>> impact? Or tested on a !v6k system? It looks possible that the
>> only case where this will perform better is where we're using
>> switch_tls_none or switch_tls_software (both rare cases, I think)
>> and there's some chance it will make things worse in other cases?
>
> I have to admit that i only tested it on v6k and did no benchmark.
>
Do you have access to anything v6-NOT-k-ish? If not I can try and test 
this on something appropriate. How does your test-case access tpidrurw? 
If it uses inline asm then it won't work on v6-not-k, as those 
instructions aren't defined...

>> One of the reasons for Russell's suggestion of placing the ldrd
>> (which became the two ldr instructions you've removed from
>> __switch_to, in order to maintain building for older cores) where
>> it is was in order to reduce the chance of pipeline stalls.
>>
>> As I've pointed out below, there is some risk that changing that
>> has implications for the v6 only case below (and the v6k case is
>> now more prone to stalls with !CONFIG_CPU_USE_DOMAINS, but newer
>> cores should have more advanced scheduling to avoid such issues
>> anyway...)
>
> I'm not sure how this could make things worse on v6k, could you
> elaborate please? Besides of the ldr and str being too close to each
> other

Yea, that's the only issue, and in the !CONFIG_CPU_USE_DOMAINS case 
things are slightly worse than they were before

> i thought this patch is a good idea, because it removes two ldr
> which are always executed. (Continuing below...)

Indeed, as long as it doesn't cause pipeline stalls then that's a gain 
for some cases :)

[...]
>> Now we've only got one instruction between the store and the load
>> and risk stalling the pipeline...
>>
>> Dave M cautiously says "The ancient advice was that one instruction
>> was enough" but this is very core dependent... I wonder if anyone
>> has a good idea about whether this is an issue here...?
>
> We could use a ldrd at the top, that'd be nearly what we have right
> now, don't we?

Yea, that'd be good - as far as I can see from an 1136 TRM, the ldrd 
*may* be two cycles (depending on alignment of the words) but the ldr 
and ldrne will always be two cycles. Ahhh, the joys of modifying the 
fast path ;)

Jonny
>
>
>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: arm: Only load TLS values when needed
  2013-07-17 11:10       ` Jonathan Austin
  (?)
@ 2013-07-17 19:49         ` André Hentschel
  -1 siblings, 0 replies; 28+ messages in thread
From: André Hentschel @ 2013-07-17 19:49 UTC (permalink / raw)
  To: Jonathan Austin
  Cc: linux-arch, Russell King - ARM Linux, linux-arm-kernel,
	linux-kernel, Will Deacon

Am 17.07.2013 13:10, schrieb Jonathan Austin:
> Hi André,
> 
> On 16/07/13 20:27, André Hentschel wrote:
>> Hi Jonathan, First, thank you for your review.
>>
>> Am 16.07.2013 19:31, schrieb Jonathan Austin:
>>> Hi André,
>>>
>>> On 15/07/13 18:14, André Hentschel wrote:
>>>> From: André Hentschel <nerv@dawncrow.de>
>>>>
>>>> This patch intends to reduce loading instructions when the
>>>> resulting value is not used. It's a follow up on
>>>> a4780adeefd042482f624f5e0d577bf9cdcbb760
>>>>
>>>
>>> Have you done any benchmarking to see that this has any real
>>> impact? Or tested on a !v6k system? It looks possible that the
>>> only case where this will perform better is where we're using
>>> switch_tls_none or switch_tls_software (both rare cases, I think)
>>> and there's some chance it will make things worse in other cases?
>>
>> I have to admit that i only tested it on v6k and did no benchmark.
>>
> Do you have access to anything v6-NOT-k-ish? If not I can try and test this on something appropriate. How does your test-case access tpidrurw? If it uses inline asm then it won't work on v6-not-k, as those instructions aren't defined...

I don't, so it'd be nice if you could do that. I could imagine you have a good choice of devices at ARM :)

In my crappy test application i do it similar to Wine:
https://github.com/AndreRH/tpidrurw-test/blob/master/main.c#L29

but Wine code won't work out of the box on v6:
http://source.winehq.org/git/wine.git/blob/HEAD:/dlls/ntdll/signal_arm.c#l851

>>> One of the reasons for Russell's suggestion of placing the ldrd
>>> (which became the two ldr instructions you've removed from
>>> __switch_to, in order to maintain building for older cores) where
>>> it is was in order to reduce the chance of pipeline stalls.
>>>
>>> As I've pointed out below, there is some risk that changing that
>>> has implications for the v6 only case below (and the v6k case is
>>> now more prone to stalls with !CONFIG_CPU_USE_DOMAINS, but newer
>>> cores should have more advanced scheduling to avoid such issues
>>> anyway...)
>>
>> I'm not sure how this could make things worse on v6k, could you
>> elaborate please? Besides of the ldr and str being too close to each
>> other
> 
> Yea, that's the only issue, and in the !CONFIG_CPU_USE_DOMAINS case things are slightly worse than they were before
> 
>> i thought this patch is a good idea, because it removes two ldr
>> which are always executed. (Continuing below...)
> 
> Indeed, as long as it doesn't cause pipeline stalls then that's a gain for some cases :)
> 
> [...]
>>> Now we've only got one instruction between the store and the load
>>> and risk stalling the pipeline...
>>>
>>> Dave M cautiously says "The ancient advice was that one instruction
>>> was enough" but this is very core dependent... I wonder if anyone
>>> has a good idea about whether this is an issue here...?
>>
>> We could use a ldrd at the top, that'd be nearly what we have right
>> now, don't we?
> 
> Yea, that'd be good - as far as I can see from an 1136 TRM, the ldrd *may* be two cycles (depending on alignment of the words) but the ldr and ldrne will always be two cycles. Ahhh, the joys of modifying the fast path ;)


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: arm: Only load TLS values when needed
@ 2013-07-17 19:49         ` André Hentschel
  0 siblings, 0 replies; 28+ messages in thread
From: André Hentschel @ 2013-07-17 19:49 UTC (permalink / raw)
  To: Jonathan Austin
  Cc: linux-arch, Russell King - ARM Linux, linux-arm-kernel,
	linux-kernel, Will Deacon

Am 17.07.2013 13:10, schrieb Jonathan Austin:
> Hi André,
> 
> On 16/07/13 20:27, André Hentschel wrote:
>> Hi Jonathan, First, thank you for your review.
>>
>> Am 16.07.2013 19:31, schrieb Jonathan Austin:
>>> Hi André,
>>>
>>> On 15/07/13 18:14, André Hentschel wrote:
>>>> From: André Hentschel <nerv@dawncrow.de>
>>>>
>>>> This patch intends to reduce loading instructions when the
>>>> resulting value is not used. It's a follow up on
>>>> a4780adeefd042482f624f5e0d577bf9cdcbb760
>>>>
>>>
>>> Have you done any benchmarking to see that this has any real
>>> impact? Or tested on a !v6k system? It looks possible that the
>>> only case where this will perform better is where we're using
>>> switch_tls_none or switch_tls_software (both rare cases, I think)
>>> and there's some chance it will make things worse in other cases?
>>
>> I have to admit that i only tested it on v6k and did no benchmark.
>>
> Do you have access to anything v6-NOT-k-ish? If not I can try and test this on something appropriate. How does your test-case access tpidrurw? If it uses inline asm then it won't work on v6-not-k, as those instructions aren't defined...

I don't, so it'd be nice if you could do that. I could imagine you have a good choice of devices at ARM :)

In my crappy test application i do it similar to Wine:
https://github.com/AndreRH/tpidrurw-test/blob/master/main.c#L29

but Wine code won't work out of the box on v6:
http://source.winehq.org/git/wine.git/blob/HEAD:/dlls/ntdll/signal_arm.c#l851

>>> One of the reasons for Russell's suggestion of placing the ldrd
>>> (which became the two ldr instructions you've removed from
>>> __switch_to, in order to maintain building for older cores) where
>>> it is was in order to reduce the chance of pipeline stalls.
>>>
>>> As I've pointed out below, there is some risk that changing that
>>> has implications for the v6 only case below (and the v6k case is
>>> now more prone to stalls with !CONFIG_CPU_USE_DOMAINS, but newer
>>> cores should have more advanced scheduling to avoid such issues
>>> anyway...)
>>
>> I'm not sure how this could make things worse on v6k, could you
>> elaborate please? Besides of the ldr and str being too close to each
>> other
> 
> Yea, that's the only issue, and in the !CONFIG_CPU_USE_DOMAINS case things are slightly worse than they were before
> 
>> i thought this patch is a good idea, because it removes two ldr
>> which are always executed. (Continuing below...)
> 
> Indeed, as long as it doesn't cause pipeline stalls then that's a gain for some cases :)
> 
> [...]
>>> Now we've only got one instruction between the store and the load
>>> and risk stalling the pipeline...
>>>
>>> Dave M cautiously says "The ancient advice was that one instruction
>>> was enough" but this is very core dependent... I wonder if anyone
>>> has a good idea about whether this is an issue here...?
>>
>> We could use a ldrd at the top, that'd be nearly what we have right
>> now, don't we?
> 
> Yea, that'd be good - as far as I can see from an 1136 TRM, the ldrd *may* be two cycles (depending on alignment of the words) but the ldr and ldrne will always be two cycles. Ahhh, the joys of modifying the fast path ;)

^ permalink raw reply	[flat|nested] 28+ messages in thread

* arm: Only load TLS values when needed
@ 2013-07-17 19:49         ` André Hentschel
  0 siblings, 0 replies; 28+ messages in thread
From: André Hentschel @ 2013-07-17 19:49 UTC (permalink / raw)
  To: linux-arm-kernel

Am 17.07.2013 13:10, schrieb Jonathan Austin:
> Hi André,
> 
> On 16/07/13 20:27, André Hentschel wrote:
>> Hi Jonathan, First, thank you for your review.
>>
>> Am 16.07.2013 19:31, schrieb Jonathan Austin:
>>> Hi André,
>>>
>>> On 15/07/13 18:14, André Hentschel wrote:
>>>> From: André Hentschel <nerv@dawncrow.de>
>>>>
>>>> This patch intends to reduce loading instructions when the
>>>> resulting value is not used. It's a follow up on
>>>> a4780adeefd042482f624f5e0d577bf9cdcbb760
>>>>
>>>
>>> Have you done any benchmarking to see that this has any real
>>> impact? Or tested on a !v6k system? It looks possible that the
>>> only case where this will perform better is where we're using
>>> switch_tls_none or switch_tls_software (both rare cases, I think)
>>> and there's some chance it will make things worse in other cases?
>>
>> I have to admit that i only tested it on v6k and did no benchmark.
>>
> Do you have access to anything v6-NOT-k-ish? If not I can try and test this on something appropriate. How does your test-case access tpidrurw? If it uses inline asm then it won't work on v6-not-k, as those instructions aren't defined...

I don't, so it'd be nice if you could do that. I could imagine you have a good choice of devices at ARM :)

In my crappy test application i do it similar to Wine:
https://github.com/AndreRH/tpidrurw-test/blob/master/main.c#L29

but Wine code won't work out of the box on v6:
http://source.winehq.org/git/wine.git/blob/HEAD:/dlls/ntdll/signal_arm.c#l851

>>> One of the reasons for Russell's suggestion of placing the ldrd
>>> (which became the two ldr instructions you've removed from
>>> __switch_to, in order to maintain building for older cores) where
>>> it is was in order to reduce the chance of pipeline stalls.
>>>
>>> As I've pointed out below, there is some risk that changing that
>>> has implications for the v6 only case below (and the v6k case is
>>> now more prone to stalls with !CONFIG_CPU_USE_DOMAINS, but newer
>>> cores should have more advanced scheduling to avoid such issues
>>> anyway...)
>>
>> I'm not sure how this could make things worse on v6k, could you
>> elaborate please? Besides of the ldr and str being too close to each
>> other
> 
> Yea, that's the only issue, and in the !CONFIG_CPU_USE_DOMAINS case things are slightly worse than they were before
> 
>> i thought this patch is a good idea, because it removes two ldr
>> which are always executed. (Continuing below...)
> 
> Indeed, as long as it doesn't cause pipeline stalls then that's a gain for some cases :)
> 
> [...]
>>> Now we've only got one instruction between the store and the load
>>> and risk stalling the pipeline...
>>>
>>> Dave M cautiously says "The ancient advice was that one instruction
>>> was enough" but this is very core dependent... I wonder if anyone
>>> has a good idea about whether this is an issue here...?
>>
>> We could use a ldrd at the top, that'd be nearly what we have right
>> now, don't we?
> 
> Yea, that'd be good - as far as I can see from an 1136 TRM, the ldrd *may* be two cycles (depending on alignment of the words) but the ldr and ldrne will always be two cycles. Ahhh, the joys of modifying the fast path ;)

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: arm: Only load TLS values when needed
  2013-07-17 19:49         ` André Hentschel
  (?)
  (?)
@ 2013-08-14 14:07         ` André Hentschel
  2013-08-14 16:20           ` Jonathan Austin
  -1 siblings, 1 reply; 28+ messages in thread
From: André Hentschel @ 2013-08-14 14:07 UTC (permalink / raw)
  To: Jonathan Austin; +Cc: linux-kernel, Will Deacon

Hi Jonathan,
Any updates on this?

Am 17.07.2013 21:49, schrieb André Hentschel:
> Am 17.07.2013 13:10, schrieb Jonathan Austin:
>> Hi André,

>> Do you have access to anything v6-NOT-k-ish? If not I can try and test this on something appropriate. How does your test-case access tpidrurw? If it uses inline asm then it won't work on v6-not-k, as those instructions aren't defined...
> 
> I don't, so it'd be nice if you could do that. I could imagine you have a good choice of devices at ARM :)
> 
> In my crappy test application i do it similar to Wine:
> https://github.com/AndreRH/tpidrurw-test/blob/master/main.c#L29
> 
> but Wine code won't work out of the box on v6:
> http://source.winehq.org/git/wine.git/blob/HEAD:/dlls/ntdll/signal_arm.c#l851
> 

>>> I'm not sure how this could make things worse on v6k, could you
>>> elaborate please? Besides of the ldr and str being too close to each
>>> other
>>
>> Yea, that's the only issue, and in the !CONFIG_CPU_USE_DOMAINS case things are slightly worse than they were before
>>
>>> i thought this patch is a good idea, because it removes two ldr
>>> which are always executed. (Continuing below...)
>>
>> Indeed, as long as it doesn't cause pipeline stalls then that's a gain for some cases :)
>>
>> [...]
>>>> Now we've only got one instruction between the store and the load
>>>> and risk stalling the pipeline...
>>>>
>>>> Dave M cautiously says "The ancient advice was that one instruction
>>>> was enough" but this is very core dependent... I wonder if anyone
>>>> has a good idea about whether this is an issue here...?
>>>
>>> We could use a ldrd at the top, that'd be nearly what we have right
>>> now, don't we?
>>
>> Yea, that'd be good - as far as I can see from an 1136 TRM, the ldrd *may* be two cycles (depending on alignment of the words) but the ldr and ldrne will always be two cycles. Ahhh, the joys of modifying the fast path ;)


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: arm: Only load TLS values when needed
  2013-08-14 14:07         ` André Hentschel
@ 2013-08-14 16:20           ` Jonathan Austin
  2013-08-14 21:21             ` André Hentschel
  0 siblings, 1 reply; 28+ messages in thread
From: Jonathan Austin @ 2013-08-14 16:20 UTC (permalink / raw)
  To: André Hentschel; +Cc: linux-kernel, Will Deacon

Hi André,

On 14/08/13 15:07, André Hentschel wrote:
> Hi Jonathan,
> Any updates on this?
>

I was holding out to see the version with ldrd at the top, as discussed 
below - I never saw a version with that change? I'd meant to ping you to 
see if that was coming, sorry.


[...]
>>> [...]
>>>>> Now we've only got one instruction between the store and the load
>>>>> and risk stalling the pipeline...
>>>>>
>>>>> Dave M cautiously says "The ancient advice was that one instruction
>>>>> was enough" but this is very core dependent... I wonder if anyone
>>>>> has a good idea about whether this is an issue here...?
>>>>
>>>> We could use a ldrd at the top, that'd be nearly what we have right
>>>> now, don't we?
>>>
>>> Yea, that'd be good - as far as I can see from an 1136 TRM, the ldrd *may* be two cycles (depending on alignment of the words) but the ldr and ldrne will always be two cycles. Ahhh, the joys of modifying the fast path ;)

Was expecting to see something that reflected this discussion,

Jonny


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: arm: Only load TLS values when needed
  2013-08-14 16:20           ` Jonathan Austin
@ 2013-08-14 21:21             ` André Hentschel
  2013-08-15 17:29                 ` Jonathan Austin
  0 siblings, 1 reply; 28+ messages in thread
From: André Hentschel @ 2013-08-14 21:21 UTC (permalink / raw)
  To: Jonathan Austin; +Cc: linux-kernel, Will Deacon, linux-arch

Am 14.08.2013 18:20, schrieb Jonathan Austin:
> Hi André,
> 
> On 14/08/13 15:07, André Hentschel wrote:
>> Hi Jonathan,
>> Any updates on this?
>>
> 
> I was holding out to see the version with ldrd at the top, as discussed below - I never saw a version with that change? I'd meant to ping you to see if that was coming, sorry.
> 
> 
> [...]
>>>> [...]
>>>>>> Now we've only got one instruction between the store and the load
>>>>>> and risk stalling the pipeline...
>>>>>>
>>>>>> Dave M cautiously says "The ancient advice was that one instruction
>>>>>> was enough" but this is very core dependent... I wonder if anyone
>>>>>> has a good idea about whether this is an issue here...?
>>>>>
>>>>> We could use a ldrd at the top, that'd be nearly what we have right
>>>>> now, don't we?
>>>>
>>>> Yea, that'd be good - as far as I can see from an 1136 TRM, the ldrd *may* be two cycles (depending on alignment of the words) but the ldr and ldrne will always be two cycles. Ahhh, the joys of modifying the fast path ;)
> 
> Was expecting to see something that reflected this discussion,

Ah ok, i misunderstood that, sry.
Something like that?

From: André Hentschel <nerv@dawncrow.de>

This patch intends to reduce loading instructions when the resulting value is not used.
It's a follow up on a4780adeefd042482f624f5e0d577bf9cdcbb760

Signed-off-by: André Hentschel <nerv@dawncrow.de>

---
This patch is against 28fbc8b6a29c849a3f03a6b05010d4b584055665

diff --git a/arch/arm/include/asm/tls.h b/arch/arm/include/asm/tls.h
index 83259b8..31743f7 100644
--- a/arch/arm/include/asm/tls.h
+++ b/arch/arm/include/asm/tls.h
@@ -3,29 +3,31 @@
 
 #ifdef __ASSEMBLY__
 #include <asm/asm-offsets.h>
-	.macro switch_tls_none, base, tp, tpuser, tmp1, tmp2
+	.macro switch_tls_none, prev, next, tp, tpuser, tmp1, tmp2
 	.endm
 
-	.macro switch_tls_v6k, base, tp, tpuser, tmp1, tmp2
+	.macro switch_tls_v6k, prev, next, tp, tpuser, tmp1, tmp2
+	ldrd	\tp, \tpuser, [\next, #TI_TP_VALUE]	@ get the next TLS and user r/w register
 	mrc	p15, 0, \tmp2, c13, c0, 2	@ get the user r/w register
 	mcr	p15, 0, \tp, c13, c0, 3		@ set TLS register
 	mcr	p15, 0, \tpuser, c13, c0, 2	@ and the user r/w register
-	str	\tmp2, [\base, #TI_TP_VALUE + 4] @ save it
+	str	\tmp2, [\prev, #TI_TP_VALUE + 4] @ save it
 	.endm
 
-	.macro switch_tls_v6, base, tp, tpuser, tmp1, tmp2
+	.macro switch_tls_v6, prev, next, tp, tpuser, tmp1, tmp2
+	ldrd	\tp, \tpuser, [\next, #TI_TP_VALUE]
 	ldr	\tmp1, =elf_hwcap
 	ldr	\tmp1, [\tmp1, #0]
 	mov	\tmp2, #0xffff0fff
 	tst	\tmp1, #HWCAP_TLS		@ hardware TLS available?
 	streq	\tp, [\tmp2, #-15]		@ set TLS value at 0xffff0ff0
-	mrcne	p15, 0, \tmp2, c13, c0, 2	@ get the user r/w register
+	mrcne	p15, 0, \tmp2, c13, c0, 2	@ get the previous user r/w register
 	mcrne	p15, 0, \tp, c13, c0, 3		@ yes, set TLS register
 	mcrne	p15, 0, \tpuser, c13, c0, 2	@ set user r/w register
-	strne	\tmp2, [\base, #TI_TP_VALUE + 4] @ save it
+	strne	\tmp2, [\prev, #TI_TP_VALUE + 4] @ save it
 	.endm
 
-	.macro switch_tls_software, base, tp, tpuser, tmp1, tmp2
+	.macro switch_tls_software, prev, next, tp, tpuser, tmp1, tmp2
 	mov	\tmp1, #0xffff0fff
 	str	\tp, [\tmp1, #-15]		@ set TLS value at 0xffff0ff0
 	.endm
diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
index d40d0ef..11112de 100644
--- a/arch/arm/kernel/entry-armv.S
+++ b/arch/arm/kernel/entry-armv.S
@@ -689,12 +689,10 @@ ENTRY(__switch_to)
  THUMB(	stmia	ip!, {r4 - sl, fp}	   )	@ Store most regs on stack
  THUMB(	str	sp, [ip], #4		   )
  THUMB(	str	lr, [ip], #4		   )
-	ldr	r4, [r2, #TI_TP_VALUE]
-	ldr	r5, [r2, #TI_TP_VALUE + 4]
 #ifdef CONFIG_CPU_USE_DOMAINS
 	ldr	r6, [r2, #TI_CPU_DOMAIN]
 #endif
-	switch_tls r1, r4, r5, r3, r7
+	switch_tls r1, r2, r4, r5, r3, r7
 #if defined(CONFIG_CC_STACKPROTECTOR) && !defined(CONFIG_SMP)
 	ldr	r7, [r2, #TI_TASK]
 	ldr	r8, =__stack_chk_guard



^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: arm: Only load TLS values when needed
  2013-08-14 21:21             ` André Hentschel
  2013-08-15 17:29                 ` Jonathan Austin
@ 2013-08-15 17:29                 ` Jonathan Austin
  0 siblings, 0 replies; 28+ messages in thread
From: Jonathan Austin @ 2013-08-15 17:29 UTC (permalink / raw)
  To: André Hentschel
  Cc: linux-kernel, Will Deacon, linux-arch, linux-arm-kernel, Russell King

Hi André

(I've put RMK and LAKML back on Cc: they got dropped somewhere along the 
way)

On 14/08/13 22:21, André Hentschel wrote:
>>>>> [...]
>>>>>>> Now we've only got one instruction between the store and the load
>>>>>>> and risk stalling the pipeline...
>>>>>>>
>>>>>>> Dave M cautiously says "The ancient advice was that one instruction
>>>>>>> was enough" but this is very core dependent... I wonder if anyone
>>>>>>> has a good idea about whether this is an issue here...?
>>>>>>
>>>>>> We could use a ldrd at the top, that'd be nearly what we have right
>>>>>> now, don't we?
>>>>>
>>>>> Yea, that'd be good - as far as I can see from an 1136 TRM, the ldrd *may* be two cycles (depending on alignment of the words) but the ldr and ldrne will always be two cycles. Ahhh, the joys of modifying the fast path ;)
>>
>> Was expecting to see something that reflected this discussion,
>
> Ah ok, i misunderstood that, sry.
> Something like that?
>
> From: André Hentschel <nerv@dawncrow.de>
>
> This patch intends to reduce loading instructions when the resulting value is not used.
> It's a follow up on a4780adeefd042482f624f5e0d577bf9cdcbb760
>

You caught me just before running away for a week of holiday, but I've 
managed to squeeze in a bit of testing - hope this helps - I'll try to 
be clear about what I have/haven't tested so you can judge for yourself 
how much weight to give this.

What I've tested:

* Apply your code to 3.11-rc5
* Boot on Integrator CP (1176 r1p0) and run a full Debian Wheezy init

--> It works!

I also applied the following diff and rebuilt to check that we were 
really testing the V6 NOT V6K path here:

diff --git a/arch/arm/include/asm/tls.h b/arch/arm/include/asm/tls.h
index 31743f7..71dfe82 100644
--- a/arch/arm/include/asm/tls.h
+++ b/arch/arm/include/asm/tls.h
@@ -43,6 +43,7 @@
  #define switch_tls     switch_tls_v6
  #elif defined(CONFIG_CPU_32v6K)
  #define tls_emu                0
+#error
  #define has_tls_reg            1
  #define switch_tls     switch_tls_v6k
  #else

What I've *not* tested:
* As this CPU does not have the tls register, I haven't run any tests 
that try to read/write it, but we wouldn't expect that to work, right?
* Haven't tried a V7 platform, as I understand you've already done that.
* Haven't verified it still builds for v4 (which was the problem for 
ldrd in the past and now we've put it back, so that's important)
* Performance impact

I think it'd be good if you can verify the bottom two points - 
especially the ldrd one: perhaps Russell can tell you which platform it 
broke on before?

So, that's a
Tested-by: Jonathan Austin <jonathan.austin@arm.com>

If you want it :)
Hope that helps,

Jonny

> Signed-off-by: André Hentschel <nerv@dawncrow.de>
>
> ---
> This patch is against 28fbc8b6a29c849a3f03a6b05010d4b584055665
>
> diff --git a/arch/arm/include/asm/tls.h b/arch/arm/include/asm/tls.h
> index 83259b8..31743f7 100644
> --- a/arch/arm/include/asm/tls.h
> +++ b/arch/arm/include/asm/tls.h
> @@ -3,29 +3,31 @@
>
>   #ifdef __ASSEMBLY__
>   #include <asm/asm-offsets.h>
> -	.macro switch_tls_none, base, tp, tpuser, tmp1, tmp2
> +	.macro switch_tls_none, prev, next, tp, tpuser, tmp1, tmp2
>   	.endm
>
> -	.macro switch_tls_v6k, base, tp, tpuser, tmp1, tmp2
> +	.macro switch_tls_v6k, prev, next, tp, tpuser, tmp1, tmp2
> +	ldrd	\tp, \tpuser, [\next, #TI_TP_VALUE]	@ get the next TLS and user r/w register
>   	mrc	p15, 0, \tmp2, c13, c0, 2	@ get the user r/w register
>   	mcr	p15, 0, \tp, c13, c0, 3		@ set TLS register
>   	mcr	p15, 0, \tpuser, c13, c0, 2	@ and the user r/w register
> -	str	\tmp2, [\base, #TI_TP_VALUE + 4] @ save it
> +	str	\tmp2, [\prev, #TI_TP_VALUE + 4] @ save it
>   	.endm
>
> -	.macro switch_tls_v6, base, tp, tpuser, tmp1, tmp2
> +	.macro switch_tls_v6, prev, next, tp, tpuser, tmp1, tmp2
> +	ldrd	\tp, \tpuser, [\next, #TI_TP_VALUE]
>   	ldr	\tmp1, =elf_hwcap
>   	ldr	\tmp1, [\tmp1, #0]
>   	mov	\tmp2, #0xffff0fff
>   	tst	\tmp1, #HWCAP_TLS		@ hardware TLS available?
>   	streq	\tp, [\tmp2, #-15]		@ set TLS value at 0xffff0ff0
> -	mrcne	p15, 0, \tmp2, c13, c0, 2	@ get the user r/w register
> +	mrcne	p15, 0, \tmp2, c13, c0, 2	@ get the previous user r/w register
>   	mcrne	p15, 0, \tp, c13, c0, 3		@ yes, set TLS register
>   	mcrne	p15, 0, \tpuser, c13, c0, 2	@ set user r/w register
> -	strne	\tmp2, [\base, #TI_TP_VALUE + 4] @ save it
> +	strne	\tmp2, [\prev, #TI_TP_VALUE + 4] @ save it
>   	.endm
>
> -	.macro switch_tls_software, base, tp, tpuser, tmp1, tmp2
> +	.macro switch_tls_software, prev, next, tp, tpuser, tmp1, tmp2
>   	mov	\tmp1, #0xffff0fff
>   	str	\tp, [\tmp1, #-15]		@ set TLS value at 0xffff0ff0
>   	.endm
> diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
> index d40d0ef..11112de 100644
> --- a/arch/arm/kernel/entry-armv.S
> +++ b/arch/arm/kernel/entry-armv.S
> @@ -689,12 +689,10 @@ ENTRY(__switch_to)
>    THUMB(	stmia	ip!, {r4 - sl, fp}	   )	@ Store most regs on stack
>    THUMB(	str	sp, [ip], #4		   )
>    THUMB(	str	lr, [ip], #4		   )
> -	ldr	r4, [r2, #TI_TP_VALUE]
> -	ldr	r5, [r2, #TI_TP_VALUE + 4]
>   #ifdef CONFIG_CPU_USE_DOMAINS
>   	ldr	r6, [r2, #TI_CPU_DOMAIN]
>   #endif
> -	switch_tls r1, r4, r5, r3, r7
> +	switch_tls r1, r2, r4, r5, r3, r7
>   #if defined(CONFIG_CC_STACKPROTECTOR) && !defined(CONFIG_SMP)
>   	ldr	r7, [r2, #TI_TASK]
>   	ldr	r8, =__stack_chk_guard
>
>
>



^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: arm: Only load TLS values when needed
@ 2013-08-15 17:29                 ` Jonathan Austin
  0 siblings, 0 replies; 28+ messages in thread
From: Jonathan Austin @ 2013-08-15 17:29 UTC (permalink / raw)
  To: André Hentschel
  Cc: linux-kernel, Will Deacon, linux-arch, linux-arm-kernel, Russell King

Hi André

(I've put RMK and LAKML back on Cc: they got dropped somewhere along the 
way)

On 14/08/13 22:21, André Hentschel wrote:
>>>>> [...]
>>>>>>> Now we've only got one instruction between the store and the load
>>>>>>> and risk stalling the pipeline...
>>>>>>>
>>>>>>> Dave M cautiously says "The ancient advice was that one instruction
>>>>>>> was enough" but this is very core dependent... I wonder if anyone
>>>>>>> has a good idea about whether this is an issue here...?
>>>>>>
>>>>>> We could use a ldrd at the top, that'd be nearly what we have right
>>>>>> now, don't we?
>>>>>
>>>>> Yea, that'd be good - as far as I can see from an 1136 TRM, the ldrd *may* be two cycles (depending on alignment of the words) but the ldr and ldrne will always be two cycles. Ahhh, the joys of modifying the fast path ;)
>>
>> Was expecting to see something that reflected this discussion,
>
> Ah ok, i misunderstood that, sry.
> Something like that?
>
> From: André Hentschel <nerv@dawncrow.de>
>
> This patch intends to reduce loading instructions when the resulting value is not used.
> It's a follow up on a4780adeefd042482f624f5e0d577bf9cdcbb760
>

You caught me just before running away for a week of holiday, but I've 
managed to squeeze in a bit of testing - hope this helps - I'll try to 
be clear about what I have/haven't tested so you can judge for yourself 
how much weight to give this.

What I've tested:

* Apply your code to 3.11-rc5
* Boot on Integrator CP (1176 r1p0) and run a full Debian Wheezy init

--> It works!

I also applied the following diff and rebuilt to check that we were 
really testing the V6 NOT V6K path here:

diff --git a/arch/arm/include/asm/tls.h b/arch/arm/include/asm/tls.h
index 31743f7..71dfe82 100644
--- a/arch/arm/include/asm/tls.h
+++ b/arch/arm/include/asm/tls.h
@@ -43,6 +43,7 @@
  #define switch_tls     switch_tls_v6
  #elif defined(CONFIG_CPU_32v6K)
  #define tls_emu                0
+#error
  #define has_tls_reg            1
  #define switch_tls     switch_tls_v6k
  #else

What I've *not* tested:
* As this CPU does not have the tls register, I haven't run any tests 
that try to read/write it, but we wouldn't expect that to work, right?
* Haven't tried a V7 platform, as I understand you've already done that.
* Haven't verified it still builds for v4 (which was the problem for 
ldrd in the past and now we've put it back, so that's important)
* Performance impact

I think it'd be good if you can verify the bottom two points - 
especially the ldrd one: perhaps Russell can tell you which platform it 
broke on before?

So, that's a
Tested-by: Jonathan Austin <jonathan.austin@arm.com>

If you want it :)
Hope that helps,

Jonny

> Signed-off-by: André Hentschel <nerv@dawncrow.de>
>
> ---
> This patch is against 28fbc8b6a29c849a3f03a6b05010d4b584055665
>
> diff --git a/arch/arm/include/asm/tls.h b/arch/arm/include/asm/tls.h
> index 83259b8..31743f7 100644
> --- a/arch/arm/include/asm/tls.h
> +++ b/arch/arm/include/asm/tls.h
> @@ -3,29 +3,31 @@
>
>   #ifdef __ASSEMBLY__
>   #include <asm/asm-offsets.h>
> -	.macro switch_tls_none, base, tp, tpuser, tmp1, tmp2
> +	.macro switch_tls_none, prev, next, tp, tpuser, tmp1, tmp2
>   	.endm
>
> -	.macro switch_tls_v6k, base, tp, tpuser, tmp1, tmp2
> +	.macro switch_tls_v6k, prev, next, tp, tpuser, tmp1, tmp2
> +	ldrd	\tp, \tpuser, [\next, #TI_TP_VALUE]	@ get the next TLS and user r/w register
>   	mrc	p15, 0, \tmp2, c13, c0, 2	@ get the user r/w register
>   	mcr	p15, 0, \tp, c13, c0, 3		@ set TLS register
>   	mcr	p15, 0, \tpuser, c13, c0, 2	@ and the user r/w register
> -	str	\tmp2, [\base, #TI_TP_VALUE + 4] @ save it
> +	str	\tmp2, [\prev, #TI_TP_VALUE + 4] @ save it
>   	.endm
>
> -	.macro switch_tls_v6, base, tp, tpuser, tmp1, tmp2
> +	.macro switch_tls_v6, prev, next, tp, tpuser, tmp1, tmp2
> +	ldrd	\tp, \tpuser, [\next, #TI_TP_VALUE]
>   	ldr	\tmp1, =elf_hwcap
>   	ldr	\tmp1, [\tmp1, #0]
>   	mov	\tmp2, #0xffff0fff
>   	tst	\tmp1, #HWCAP_TLS		@ hardware TLS available?
>   	streq	\tp, [\tmp2, #-15]		@ set TLS value at 0xffff0ff0
> -	mrcne	p15, 0, \tmp2, c13, c0, 2	@ get the user r/w register
> +	mrcne	p15, 0, \tmp2, c13, c0, 2	@ get the previous user r/w register
>   	mcrne	p15, 0, \tp, c13, c0, 3		@ yes, set TLS register
>   	mcrne	p15, 0, \tpuser, c13, c0, 2	@ set user r/w register
> -	strne	\tmp2, [\base, #TI_TP_VALUE + 4] @ save it
> +	strne	\tmp2, [\prev, #TI_TP_VALUE + 4] @ save it
>   	.endm
>
> -	.macro switch_tls_software, base, tp, tpuser, tmp1, tmp2
> +	.macro switch_tls_software, prev, next, tp, tpuser, tmp1, tmp2
>   	mov	\tmp1, #0xffff0fff
>   	str	\tp, [\tmp1, #-15]		@ set TLS value at 0xffff0ff0
>   	.endm
> diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
> index d40d0ef..11112de 100644
> --- a/arch/arm/kernel/entry-armv.S
> +++ b/arch/arm/kernel/entry-armv.S
> @@ -689,12 +689,10 @@ ENTRY(__switch_to)
>    THUMB(	stmia	ip!, {r4 - sl, fp}	   )	@ Store most regs on stack
>    THUMB(	str	sp, [ip], #4		   )
>    THUMB(	str	lr, [ip], #4		   )
> -	ldr	r4, [r2, #TI_TP_VALUE]
> -	ldr	r5, [r2, #TI_TP_VALUE + 4]
>   #ifdef CONFIG_CPU_USE_DOMAINS
>   	ldr	r6, [r2, #TI_CPU_DOMAIN]
>   #endif
> -	switch_tls r1, r4, r5, r3, r7
> +	switch_tls r1, r2, r4, r5, r3, r7
>   #if defined(CONFIG_CC_STACKPROTECTOR) && !defined(CONFIG_SMP)
>   	ldr	r7, [r2, #TI_TASK]
>   	ldr	r8, =__stack_chk_guard
>
>
>

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* arm: Only load TLS values when needed
@ 2013-08-15 17:29                 ` Jonathan Austin
  0 siblings, 0 replies; 28+ messages in thread
From: Jonathan Austin @ 2013-08-15 17:29 UTC (permalink / raw)
  To: linux-arm-kernel

Hi André

(I've put RMK and LAKML back on Cc: they got dropped somewhere along the 
way)

On 14/08/13 22:21, André Hentschel wrote:
>>>>> [...]
>>>>>>> Now we've only got one instruction between the store and the load
>>>>>>> and risk stalling the pipeline...
>>>>>>>
>>>>>>> Dave M cautiously says "The ancient advice was that one instruction
>>>>>>> was enough" but this is very core dependent... I wonder if anyone
>>>>>>> has a good idea about whether this is an issue here...?
>>>>>>
>>>>>> We could use a ldrd at the top, that'd be nearly what we have right
>>>>>> now, don't we?
>>>>>
>>>>> Yea, that'd be good - as far as I can see from an 1136 TRM, the ldrd *may* be two cycles (depending on alignment of the words) but the ldr and ldrne will always be two cycles. Ahhh, the joys of modifying the fast path ;)
>>
>> Was expecting to see something that reflected this discussion,
>
> Ah ok, i misunderstood that, sry.
> Something like that?
>
> From: André Hentschel <nerv@dawncrow.de>
>
> This patch intends to reduce loading instructions when the resulting value is not used.
> It's a follow up on a4780adeefd042482f624f5e0d577bf9cdcbb760
>

You caught me just before running away for a week of holiday, but I've 
managed to squeeze in a bit of testing - hope this helps - I'll try to 
be clear about what I have/haven't tested so you can judge for yourself 
how much weight to give this.

What I've tested:

* Apply your code to 3.11-rc5
* Boot on Integrator CP (1176 r1p0) and run a full Debian Wheezy init

--> It works!

I also applied the following diff and rebuilt to check that we were 
really testing the V6 NOT V6K path here:

diff --git a/arch/arm/include/asm/tls.h b/arch/arm/include/asm/tls.h
index 31743f7..71dfe82 100644
--- a/arch/arm/include/asm/tls.h
+++ b/arch/arm/include/asm/tls.h
@@ -43,6 +43,7 @@
  #define switch_tls     switch_tls_v6
  #elif defined(CONFIG_CPU_32v6K)
  #define tls_emu                0
+#error
  #define has_tls_reg            1
  #define switch_tls     switch_tls_v6k
  #else

What I've *not* tested:
* As this CPU does not have the tls register, I haven't run any tests 
that try to read/write it, but we wouldn't expect that to work, right?
* Haven't tried a V7 platform, as I understand you've already done that.
* Haven't verified it still builds for v4 (which was the problem for 
ldrd in the past and now we've put it back, so that's important)
* Performance impact

I think it'd be good if you can verify the bottom two points - 
especially the ldrd one: perhaps Russell can tell you which platform it 
broke on before?

So, that's a
Tested-by: Jonathan Austin <jonathan.austin@arm.com>

If you want it :)
Hope that helps,

Jonny

> Signed-off-by: André Hentschel <nerv@dawncrow.de>
>
> ---
> This patch is against 28fbc8b6a29c849a3f03a6b05010d4b584055665
>
> diff --git a/arch/arm/include/asm/tls.h b/arch/arm/include/asm/tls.h
> index 83259b8..31743f7 100644
> --- a/arch/arm/include/asm/tls.h
> +++ b/arch/arm/include/asm/tls.h
> @@ -3,29 +3,31 @@
>
>   #ifdef __ASSEMBLY__
>   #include <asm/asm-offsets.h>
> -	.macro switch_tls_none, base, tp, tpuser, tmp1, tmp2
> +	.macro switch_tls_none, prev, next, tp, tpuser, tmp1, tmp2
>   	.endm
>
> -	.macro switch_tls_v6k, base, tp, tpuser, tmp1, tmp2
> +	.macro switch_tls_v6k, prev, next, tp, tpuser, tmp1, tmp2
> +	ldrd	\tp, \tpuser, [\next, #TI_TP_VALUE]	@ get the next TLS and user r/w register
>   	mrc	p15, 0, \tmp2, c13, c0, 2	@ get the user r/w register
>   	mcr	p15, 0, \tp, c13, c0, 3		@ set TLS register
>   	mcr	p15, 0, \tpuser, c13, c0, 2	@ and the user r/w register
> -	str	\tmp2, [\base, #TI_TP_VALUE + 4] @ save it
> +	str	\tmp2, [\prev, #TI_TP_VALUE + 4] @ save it
>   	.endm
>
> -	.macro switch_tls_v6, base, tp, tpuser, tmp1, tmp2
> +	.macro switch_tls_v6, prev, next, tp, tpuser, tmp1, tmp2
> +	ldrd	\tp, \tpuser, [\next, #TI_TP_VALUE]
>   	ldr	\tmp1, =elf_hwcap
>   	ldr	\tmp1, [\tmp1, #0]
>   	mov	\tmp2, #0xffff0fff
>   	tst	\tmp1, #HWCAP_TLS		@ hardware TLS available?
>   	streq	\tp, [\tmp2, #-15]		@ set TLS value at 0xffff0ff0
> -	mrcne	p15, 0, \tmp2, c13, c0, 2	@ get the user r/w register
> +	mrcne	p15, 0, \tmp2, c13, c0, 2	@ get the previous user r/w register
>   	mcrne	p15, 0, \tp, c13, c0, 3		@ yes, set TLS register
>   	mcrne	p15, 0, \tpuser, c13, c0, 2	@ set user r/w register
> -	strne	\tmp2, [\base, #TI_TP_VALUE + 4] @ save it
> +	strne	\tmp2, [\prev, #TI_TP_VALUE + 4] @ save it
>   	.endm
>
> -	.macro switch_tls_software, base, tp, tpuser, tmp1, tmp2
> +	.macro switch_tls_software, prev, next, tp, tpuser, tmp1, tmp2
>   	mov	\tmp1, #0xffff0fff
>   	str	\tp, [\tmp1, #-15]		@ set TLS value at 0xffff0ff0
>   	.endm
> diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
> index d40d0ef..11112de 100644
> --- a/arch/arm/kernel/entry-armv.S
> +++ b/arch/arm/kernel/entry-armv.S
> @@ -689,12 +689,10 @@ ENTRY(__switch_to)
>    THUMB(	stmia	ip!, {r4 - sl, fp}	   )	@ Store most regs on stack
>    THUMB(	str	sp, [ip], #4		   )
>    THUMB(	str	lr, [ip], #4		   )
> -	ldr	r4, [r2, #TI_TP_VALUE]
> -	ldr	r5, [r2, #TI_TP_VALUE + 4]
>   #ifdef CONFIG_CPU_USE_DOMAINS
>   	ldr	r6, [r2, #TI_CPU_DOMAIN]
>   #endif
> -	switch_tls r1, r4, r5, r3, r7
> +	switch_tls r1, r2, r4, r5, r3, r7
>   #if defined(CONFIG_CC_STACKPROTECTOR) && !defined(CONFIG_SMP)
>   	ldr	r7, [r2, #TI_TASK]
>   	ldr	r8, =__stack_chk_guard
>
>
>

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: arm: Only load TLS values when needed
  2013-08-15 17:29                 ` Jonathan Austin
  (?)
  (?)
@ 2013-08-15 18:27                   ` André Hentschel
  -1 siblings, 0 replies; 28+ messages in thread
From: André Hentschel @ 2013-08-15 18:27 UTC (permalink / raw)
  To: Jonathan Austin
  Cc: linux-kernel, Will Deacon, linux-arch, linux-arm-kernel, Russell King

Hi Jonathan,

Am 15.08.2013 19:29, schrieb Jonathan Austin:
> Hi André
> 
> (I've put RMK and LAKML back on Cc: they got dropped somewhere along the way)
> 
> On 14/08/13 22:21, André Hentschel wrote:
>>>>>> [...]
>>>>>>>> Now we've only got one instruction between the store and the load
>>>>>>>> and risk stalling the pipeline...
>>>>>>>>
>>>>>>>> Dave M cautiously says "The ancient advice was that one instruction
>>>>>>>> was enough" but this is very core dependent... I wonder if anyone
>>>>>>>> has a good idea about whether this is an issue here...?
>>>>>>>
>>>>>>> We could use a ldrd at the top, that'd be nearly what we have right
>>>>>>> now, don't we?
>>>>>>
>>>>>> Yea, that'd be good - as far as I can see from an 1136 TRM, the ldrd *may* be two cycles (depending on alignment of the words) but the ldr and ldrne will always be two cycles. Ahhh, the joys of modifying the fast path ;)
>>>
>>> Was expecting to see something that reflected this discussion,
>>
>> Ah ok, i misunderstood that, sry.
>> Something like that?
>>
>> From: André Hentschel <nerv@dawncrow.de>
>>
>> This patch intends to reduce loading instructions when the resulting value is not used.
>> It's a follow up on a4780adeefd042482f624f5e0d577bf9cdcbb760
>>
> 
> You caught me just before running away for a week of holiday, but I've managed to squeeze in a bit of testing - hope this helps - I'll try to be clear about what I have/haven't tested so you can judge for yourself how much weight to give this.

Thanks for testing it anyway, and happy holiday!

> What I've tested:
> 
> * Apply your code to 3.11-rc5
> * Boot on Integrator CP (1176 r1p0) and run a full Debian Wheezy init
> 
> --> It works!
> 
> I also applied the following diff and rebuilt to check that we were really testing the V6 NOT V6K path here:
> 
> diff --git a/arch/arm/include/asm/tls.h b/arch/arm/include/asm/tls.h
> index 31743f7..71dfe82 100644
> --- a/arch/arm/include/asm/tls.h
> +++ b/arch/arm/include/asm/tls.h
> @@ -43,6 +43,7 @@
>  #define switch_tls     switch_tls_v6
>  #elif defined(CONFIG_CPU_32v6K)
>  #define tls_emu                0
> +#error
>  #define has_tls_reg            1
>  #define switch_tls     switch_tls_v6k
>  #else
> 
> What I've *not* tested:
> * As this CPU does not have the tls register, I haven't run any tests that try to read/write it, but we wouldn't expect that to work, right?
> * Haven't tried a V7 platform, as I understand you've already done that.
> * Haven't verified it still builds for v4 (which was the problem for ldrd in the past and now we've put it back, so that's important)

It shouldn't matter as v4 would not use the v6 codepath, would it?

> * Performance impact

I only could test it on v7, would be interesting to see some v6, v5, ... "benchmarks", though.

> 
> I think it'd be good if you can verify the bottom two points - especially the ldrd one: perhaps Russell can tell you which platform it broke on before?
> 
> So, that's a
> Tested-by: Jonathan Austin <jonathan.austin@arm.com>
> 
> If you want it :)

sure, thx.

> Hope that helps,
> 
> Jonny
> 
>> Signed-off-by: André Hentschel <nerv@dawncrow.de>
>>
>> ---
>> This patch is against 28fbc8b6a29c849a3f03a6b05010d4b584055665
>>
>> diff --git a/arch/arm/include/asm/tls.h b/arch/arm/include/asm/tls.h
>> index 83259b8..31743f7 100644
>> --- a/arch/arm/include/asm/tls.h
>> +++ b/arch/arm/include/asm/tls.h
>> @@ -3,29 +3,31 @@
>>
>>   #ifdef __ASSEMBLY__
>>   #include <asm/asm-offsets.h>
>> -    .macro switch_tls_none, base, tp, tpuser, tmp1, tmp2
>> +    .macro switch_tls_none, prev, next, tp, tpuser, tmp1, tmp2
>>       .endm
>>
>> -    .macro switch_tls_v6k, base, tp, tpuser, tmp1, tmp2
>> +    .macro switch_tls_v6k, prev, next, tp, tpuser, tmp1, tmp2
>> +    ldrd    \tp, \tpuser, [\next, #TI_TP_VALUE]    @ get the next TLS and user r/w register
>>       mrc    p15, 0, \tmp2, c13, c0, 2    @ get the user r/w register
>>       mcr    p15, 0, \tp, c13, c0, 3        @ set TLS register
>>       mcr    p15, 0, \tpuser, c13, c0, 2    @ and the user r/w register
>> -    str    \tmp2, [\base, #TI_TP_VALUE + 4] @ save it
>> +    str    \tmp2, [\prev, #TI_TP_VALUE + 4] @ save it
>>       .endm
>>
>> -    .macro switch_tls_v6, base, tp, tpuser, tmp1, tmp2
>> +    .macro switch_tls_v6, prev, next, tp, tpuser, tmp1, tmp2
>> +    ldrd    \tp, \tpuser, [\next, #TI_TP_VALUE]
>>       ldr    \tmp1, =elf_hwcap
>>       ldr    \tmp1, [\tmp1, #0]
>>       mov    \tmp2, #0xffff0fff
>>       tst    \tmp1, #HWCAP_TLS        @ hardware TLS available?
>>       streq    \tp, [\tmp2, #-15]        @ set TLS value at 0xffff0ff0
>> -    mrcne    p15, 0, \tmp2, c13, c0, 2    @ get the user r/w register
>> +    mrcne    p15, 0, \tmp2, c13, c0, 2    @ get the previous user r/w register
>>       mcrne    p15, 0, \tp, c13, c0, 3        @ yes, set TLS register
>>       mcrne    p15, 0, \tpuser, c13, c0, 2    @ set user r/w register
>> -    strne    \tmp2, [\base, #TI_TP_VALUE + 4] @ save it
>> +    strne    \tmp2, [\prev, #TI_TP_VALUE + 4] @ save it
>>       .endm
>>
>> -    .macro switch_tls_software, base, tp, tpuser, tmp1, tmp2
>> +    .macro switch_tls_software, prev, next, tp, tpuser, tmp1, tmp2
>>       mov    \tmp1, #0xffff0fff
>>       str    \tp, [\tmp1, #-15]        @ set TLS value at 0xffff0ff0
>>       .endm
>> diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
>> index d40d0ef..11112de 100644
>> --- a/arch/arm/kernel/entry-armv.S
>> +++ b/arch/arm/kernel/entry-armv.S
>> @@ -689,12 +689,10 @@ ENTRY(__switch_to)
>>    THUMB(    stmia    ip!, {r4 - sl, fp}       )    @ Store most regs on stack
>>    THUMB(    str    sp, [ip], #4           )
>>    THUMB(    str    lr, [ip], #4           )
>> -    ldr    r4, [r2, #TI_TP_VALUE]
>> -    ldr    r5, [r2, #TI_TP_VALUE + 4]
>>   #ifdef CONFIG_CPU_USE_DOMAINS
>>       ldr    r6, [r2, #TI_CPU_DOMAIN]
>>   #endif
>> -    switch_tls r1, r4, r5, r3, r7
>> +    switch_tls r1, r2, r4, r5, r3, r7
>>   #if defined(CONFIG_CC_STACKPROTECTOR) && !defined(CONFIG_SMP)
>>       ldr    r7, [r2, #TI_TASK]
>>       ldr    r8, =__stack_chk_guard
>>
>>
>>
> 
> 
> -- 
> To unsubscribe from this list: send the line "unsubscribe linux-arch" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: arm: Only load TLS values when needed
@ 2013-08-15 18:27                   ` André Hentschel
  0 siblings, 0 replies; 28+ messages in thread
From: André Hentschel @ 2013-08-15 18:27 UTC (permalink / raw)
  To: Jonathan Austin
  Cc: linux-arch, Russell King, Will Deacon, linux-kernel, linux-arm-kernel

Hi Jonathan,

Am 15.08.2013 19:29, schrieb Jonathan Austin:
> Hi André
> 
> (I've put RMK and LAKML back on Cc: they got dropped somewhere along the way)
> 
> On 14/08/13 22:21, André Hentschel wrote:
>>>>>> [...]
>>>>>>>> Now we've only got one instruction between the store and the load
>>>>>>>> and risk stalling the pipeline...
>>>>>>>>
>>>>>>>> Dave M cautiously says "The ancient advice was that one instruction
>>>>>>>> was enough" but this is very core dependent... I wonder if anyone
>>>>>>>> has a good idea about whether this is an issue here...?
>>>>>>>
>>>>>>> We could use a ldrd at the top, that'd be nearly what we have right
>>>>>>> now, don't we?
>>>>>>
>>>>>> Yea, that'd be good - as far as I can see from an 1136 TRM, the ldrd *may* be two cycles (depending on alignment of the words) but the ldr and ldrne will always be two cycles. Ahhh, the joys of modifying the fast path ;)
>>>
>>> Was expecting to see something that reflected this discussion,
>>
>> Ah ok, i misunderstood that, sry.
>> Something like that?
>>
>> From: André Hentschel <nerv@dawncrow.de>
>>
>> This patch intends to reduce loading instructions when the resulting value is not used.
>> It's a follow up on a4780adeefd042482f624f5e0d577bf9cdcbb760
>>
> 
> You caught me just before running away for a week of holiday, but I've managed to squeeze in a bit of testing - hope this helps - I'll try to be clear about what I have/haven't tested so you can judge for yourself how much weight to give this.

Thanks for testing it anyway, and happy holiday!

> What I've tested:
> 
> * Apply your code to 3.11-rc5
> * Boot on Integrator CP (1176 r1p0) and run a full Debian Wheezy init
> 
> --> It works!
> 
> I also applied the following diff and rebuilt to check that we were really testing the V6 NOT V6K path here:
> 
> diff --git a/arch/arm/include/asm/tls.h b/arch/arm/include/asm/tls.h
> index 31743f7..71dfe82 100644
> --- a/arch/arm/include/asm/tls.h
> +++ b/arch/arm/include/asm/tls.h
> @@ -43,6 +43,7 @@
>  #define switch_tls     switch_tls_v6
>  #elif defined(CONFIG_CPU_32v6K)
>  #define tls_emu                0
> +#error
>  #define has_tls_reg            1
>  #define switch_tls     switch_tls_v6k
>  #else
> 
> What I've *not* tested:
> * As this CPU does not have the tls register, I haven't run any tests that try to read/write it, but we wouldn't expect that to work, right?
> * Haven't tried a V7 platform, as I understand you've already done that.
> * Haven't verified it still builds for v4 (which was the problem for ldrd in the past and now we've put it back, so that's important)

It shouldn't matter as v4 would not use the v6 codepath, would it?

> * Performance impact

I only could test it on v7, would be interesting to see some v6, v5, ... "benchmarks", though.

> 
> I think it'd be good if you can verify the bottom two points - especially the ldrd one: perhaps Russell can tell you which platform it broke on before?
> 
> So, that's a
> Tested-by: Jonathan Austin <jonathan.austin@arm.com>
> 
> If you want it :)

sure, thx.

> Hope that helps,
> 
> Jonny
> 
>> Signed-off-by: André Hentschel <nerv@dawncrow.de>
>>
>> ---
>> This patch is against 28fbc8b6a29c849a3f03a6b05010d4b584055665
>>
>> diff --git a/arch/arm/include/asm/tls.h b/arch/arm/include/asm/tls.h
>> index 83259b8..31743f7 100644
>> --- a/arch/arm/include/asm/tls.h
>> +++ b/arch/arm/include/asm/tls.h
>> @@ -3,29 +3,31 @@
>>
>>   #ifdef __ASSEMBLY__
>>   #include <asm/asm-offsets.h>
>> -    .macro switch_tls_none, base, tp, tpuser, tmp1, tmp2
>> +    .macro switch_tls_none, prev, next, tp, tpuser, tmp1, tmp2
>>       .endm
>>
>> -    .macro switch_tls_v6k, base, tp, tpuser, tmp1, tmp2
>> +    .macro switch_tls_v6k, prev, next, tp, tpuser, tmp1, tmp2
>> +    ldrd    \tp, \tpuser, [\next, #TI_TP_VALUE]    @ get the next TLS and user r/w register
>>       mrc    p15, 0, \tmp2, c13, c0, 2    @ get the user r/w register
>>       mcr    p15, 0, \tp, c13, c0, 3        @ set TLS register
>>       mcr    p15, 0, \tpuser, c13, c0, 2    @ and the user r/w register
>> -    str    \tmp2, [\base, #TI_TP_VALUE + 4] @ save it
>> +    str    \tmp2, [\prev, #TI_TP_VALUE + 4] @ save it
>>       .endm
>>
>> -    .macro switch_tls_v6, base, tp, tpuser, tmp1, tmp2
>> +    .macro switch_tls_v6, prev, next, tp, tpuser, tmp1, tmp2
>> +    ldrd    \tp, \tpuser, [\next, #TI_TP_VALUE]
>>       ldr    \tmp1, =elf_hwcap
>>       ldr    \tmp1, [\tmp1, #0]
>>       mov    \tmp2, #0xffff0fff
>>       tst    \tmp1, #HWCAP_TLS        @ hardware TLS available?
>>       streq    \tp, [\tmp2, #-15]        @ set TLS value at 0xffff0ff0
>> -    mrcne    p15, 0, \tmp2, c13, c0, 2    @ get the user r/w register
>> +    mrcne    p15, 0, \tmp2, c13, c0, 2    @ get the previous user r/w register
>>       mcrne    p15, 0, \tp, c13, c0, 3        @ yes, set TLS register
>>       mcrne    p15, 0, \tpuser, c13, c0, 2    @ set user r/w register
>> -    strne    \tmp2, [\base, #TI_TP_VALUE + 4] @ save it
>> +    strne    \tmp2, [\prev, #TI_TP_VALUE + 4] @ save it
>>       .endm
>>
>> -    .macro switch_tls_software, base, tp, tpuser, tmp1, tmp2
>> +    .macro switch_tls_software, prev, next, tp, tpuser, tmp1, tmp2
>>       mov    \tmp1, #0xffff0fff
>>       str    \tp, [\tmp1, #-15]        @ set TLS value at 0xffff0ff0
>>       .endm
>> diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
>> index d40d0ef..11112de 100644
>> --- a/arch/arm/kernel/entry-armv.S
>> +++ b/arch/arm/kernel/entry-armv.S
>> @@ -689,12 +689,10 @@ ENTRY(__switch_to)
>>    THUMB(    stmia    ip!, {r4 - sl, fp}       )    @ Store most regs on stack
>>    THUMB(    str    sp, [ip], #4           )
>>    THUMB(    str    lr, [ip], #4           )
>> -    ldr    r4, [r2, #TI_TP_VALUE]
>> -    ldr    r5, [r2, #TI_TP_VALUE + 4]
>>   #ifdef CONFIG_CPU_USE_DOMAINS
>>       ldr    r6, [r2, #TI_CPU_DOMAIN]
>>   #endif
>> -    switch_tls r1, r4, r5, r3, r7
>> +    switch_tls r1, r2, r4, r5, r3, r7
>>   #if defined(CONFIG_CC_STACKPROTECTOR) && !defined(CONFIG_SMP)
>>       ldr    r7, [r2, #TI_TASK]
>>       ldr    r8, =__stack_chk_guard
>>
>>
>>
> 
> 
> -- 
> To unsubscribe from this list: send the line "unsubscribe linux-arch" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: arm: Only load TLS values when needed
@ 2013-08-15 18:27                   ` André Hentschel
  0 siblings, 0 replies; 28+ messages in thread
From: André Hentschel @ 2013-08-15 18:27 UTC (permalink / raw)
  To: Jonathan Austin
  Cc: linux-kernel, Will Deacon, linux-arch, linux-arm-kernel, Russell King

Hi Jonathan,

Am 15.08.2013 19:29, schrieb Jonathan Austin:
> Hi André
> 
> (I've put RMK and LAKML back on Cc: they got dropped somewhere along the way)
> 
> On 14/08/13 22:21, André Hentschel wrote:
>>>>>> [...]
>>>>>>>> Now we've only got one instruction between the store and the load
>>>>>>>> and risk stalling the pipeline...
>>>>>>>>
>>>>>>>> Dave M cautiously says "The ancient advice was that one instruction
>>>>>>>> was enough" but this is very core dependent... I wonder if anyone
>>>>>>>> has a good idea about whether this is an issue here...?
>>>>>>>
>>>>>>> We could use a ldrd at the top, that'd be nearly what we have right
>>>>>>> now, don't we?
>>>>>>
>>>>>> Yea, that'd be good - as far as I can see from an 1136 TRM, the ldrd *may* be two cycles (depending on alignment of the words) but the ldr and ldrne will always be two cycles. Ahhh, the joys of modifying the fast path ;)
>>>
>>> Was expecting to see something that reflected this discussion,
>>
>> Ah ok, i misunderstood that, sry.
>> Something like that?
>>
>> From: André Hentschel <nerv@dawncrow.de>
>>
>> This patch intends to reduce loading instructions when the resulting value is not used.
>> It's a follow up on a4780adeefd042482f624f5e0d577bf9cdcbb760
>>
> 
> You caught me just before running away for a week of holiday, but I've managed to squeeze in a bit of testing - hope this helps - I'll try to be clear about what I have/haven't tested so you can judge for yourself how much weight to give this.

Thanks for testing it anyway, and happy holiday!

> What I've tested:
> 
> * Apply your code to 3.11-rc5
> * Boot on Integrator CP (1176 r1p0) and run a full Debian Wheezy init
> 
> --> It works!
> 
> I also applied the following diff and rebuilt to check that we were really testing the V6 NOT V6K path here:
> 
> diff --git a/arch/arm/include/asm/tls.h b/arch/arm/include/asm/tls.h
> index 31743f7..71dfe82 100644
> --- a/arch/arm/include/asm/tls.h
> +++ b/arch/arm/include/asm/tls.h
> @@ -43,6 +43,7 @@
>  #define switch_tls     switch_tls_v6
>  #elif defined(CONFIG_CPU_32v6K)
>  #define tls_emu                0
> +#error
>  #define has_tls_reg            1
>  #define switch_tls     switch_tls_v6k
>  #else
> 
> What I've *not* tested:
> * As this CPU does not have the tls register, I haven't run any tests that try to read/write it, but we wouldn't expect that to work, right?
> * Haven't tried a V7 platform, as I understand you've already done that.
> * Haven't verified it still builds for v4 (which was the problem for ldrd in the past and now we've put it back, so that's important)

It shouldn't matter as v4 would not use the v6 codepath, would it?

> * Performance impact

I only could test it on v7, would be interesting to see some v6, v5, ... "benchmarks", though.

> 
> I think it'd be good if you can verify the bottom two points - especially the ldrd one: perhaps Russell can tell you which platform it broke on before?
> 
> So, that's a
> Tested-by: Jonathan Austin <jonathan.austin@arm.com>
> 
> If you want it :)

sure, thx.

> Hope that helps,
> 
> Jonny
> 
>> Signed-off-by: André Hentschel <nerv@dawncrow.de>
>>
>> ---
>> This patch is against 28fbc8b6a29c849a3f03a6b05010d4b584055665
>>
>> diff --git a/arch/arm/include/asm/tls.h b/arch/arm/include/asm/tls.h
>> index 83259b8..31743f7 100644
>> --- a/arch/arm/include/asm/tls.h
>> +++ b/arch/arm/include/asm/tls.h
>> @@ -3,29 +3,31 @@
>>
>>   #ifdef __ASSEMBLY__
>>   #include <asm/asm-offsets.h>
>> -    .macro switch_tls_none, base, tp, tpuser, tmp1, tmp2
>> +    .macro switch_tls_none, prev, next, tp, tpuser, tmp1, tmp2
>>       .endm
>>
>> -    .macro switch_tls_v6k, base, tp, tpuser, tmp1, tmp2
>> +    .macro switch_tls_v6k, prev, next, tp, tpuser, tmp1, tmp2
>> +    ldrd    \tp, \tpuser, [\next, #TI_TP_VALUE]    @ get the next TLS and user r/w register
>>       mrc    p15, 0, \tmp2, c13, c0, 2    @ get the user r/w register
>>       mcr    p15, 0, \tp, c13, c0, 3        @ set TLS register
>>       mcr    p15, 0, \tpuser, c13, c0, 2    @ and the user r/w register
>> -    str    \tmp2, [\base, #TI_TP_VALUE + 4] @ save it
>> +    str    \tmp2, [\prev, #TI_TP_VALUE + 4] @ save it
>>       .endm
>>
>> -    .macro switch_tls_v6, base, tp, tpuser, tmp1, tmp2
>> +    .macro switch_tls_v6, prev, next, tp, tpuser, tmp1, tmp2
>> +    ldrd    \tp, \tpuser, [\next, #TI_TP_VALUE]
>>       ldr    \tmp1, =elf_hwcap
>>       ldr    \tmp1, [\tmp1, #0]
>>       mov    \tmp2, #0xffff0fff
>>       tst    \tmp1, #HWCAP_TLS        @ hardware TLS available?
>>       streq    \tp, [\tmp2, #-15]        @ set TLS value at 0xffff0ff0
>> -    mrcne    p15, 0, \tmp2, c13, c0, 2    @ get the user r/w register
>> +    mrcne    p15, 0, \tmp2, c13, c0, 2    @ get the previous user r/w register
>>       mcrne    p15, 0, \tp, c13, c0, 3        @ yes, set TLS register
>>       mcrne    p15, 0, \tpuser, c13, c0, 2    @ set user r/w register
>> -    strne    \tmp2, [\base, #TI_TP_VALUE + 4] @ save it
>> +    strne    \tmp2, [\prev, #TI_TP_VALUE + 4] @ save it
>>       .endm
>>
>> -    .macro switch_tls_software, base, tp, tpuser, tmp1, tmp2
>> +    .macro switch_tls_software, prev, next, tp, tpuser, tmp1, tmp2
>>       mov    \tmp1, #0xffff0fff
>>       str    \tp, [\tmp1, #-15]        @ set TLS value at 0xffff0ff0
>>       .endm
>> diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
>> index d40d0ef..11112de 100644
>> --- a/arch/arm/kernel/entry-armv.S
>> +++ b/arch/arm/kernel/entry-armv.S
>> @@ -689,12 +689,10 @@ ENTRY(__switch_to)
>>    THUMB(    stmia    ip!, {r4 - sl, fp}       )    @ Store most regs on stack
>>    THUMB(    str    sp, [ip], #4           )
>>    THUMB(    str    lr, [ip], #4           )
>> -    ldr    r4, [r2, #TI_TP_VALUE]
>> -    ldr    r5, [r2, #TI_TP_VALUE + 4]
>>   #ifdef CONFIG_CPU_USE_DOMAINS
>>       ldr    r6, [r2, #TI_CPU_DOMAIN]
>>   #endif
>> -    switch_tls r1, r4, r5, r3, r7
>> +    switch_tls r1, r2, r4, r5, r3, r7
>>   #if defined(CONFIG_CC_STACKPROTECTOR) && !defined(CONFIG_SMP)
>>       ldr    r7, [r2, #TI_TASK]
>>       ldr    r8, =__stack_chk_guard
>>
>>
>>
> 
> 
> -- 
> To unsubscribe from this list: send the line "unsubscribe linux-arch" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


^ permalink raw reply	[flat|nested] 28+ messages in thread

* arm: Only load TLS values when needed
@ 2013-08-15 18:27                   ` André Hentschel
  0 siblings, 0 replies; 28+ messages in thread
From: André Hentschel @ 2013-08-15 18:27 UTC (permalink / raw)
  To: linux-arm-kernel

Hi Jonathan,

Am 15.08.2013 19:29, schrieb Jonathan Austin:
> Hi André
> 
> (I've put RMK and LAKML back on Cc: they got dropped somewhere along the way)
> 
> On 14/08/13 22:21, André Hentschel wrote:
>>>>>> [...]
>>>>>>>> Now we've only got one instruction between the store and the load
>>>>>>>> and risk stalling the pipeline...
>>>>>>>>
>>>>>>>> Dave M cautiously says "The ancient advice was that one instruction
>>>>>>>> was enough" but this is very core dependent... I wonder if anyone
>>>>>>>> has a good idea about whether this is an issue here...?
>>>>>>>
>>>>>>> We could use a ldrd at the top, that'd be nearly what we have right
>>>>>>> now, don't we?
>>>>>>
>>>>>> Yea, that'd be good - as far as I can see from an 1136 TRM, the ldrd *may* be two cycles (depending on alignment of the words) but the ldr and ldrne will always be two cycles. Ahhh, the joys of modifying the fast path ;)
>>>
>>> Was expecting to see something that reflected this discussion,
>>
>> Ah ok, i misunderstood that, sry.
>> Something like that?
>>
>> From: André Hentschel <nerv@dawncrow.de>
>>
>> This patch intends to reduce loading instructions when the resulting value is not used.
>> It's a follow up on a4780adeefd042482f624f5e0d577bf9cdcbb760
>>
> 
> You caught me just before running away for a week of holiday, but I've managed to squeeze in a bit of testing - hope this helps - I'll try to be clear about what I have/haven't tested so you can judge for yourself how much weight to give this.

Thanks for testing it anyway, and happy holiday!

> What I've tested:
> 
> * Apply your code to 3.11-rc5
> * Boot on Integrator CP (1176 r1p0) and run a full Debian Wheezy init
> 
> --> It works!
> 
> I also applied the following diff and rebuilt to check that we were really testing the V6 NOT V6K path here:
> 
> diff --git a/arch/arm/include/asm/tls.h b/arch/arm/include/asm/tls.h
> index 31743f7..71dfe82 100644
> --- a/arch/arm/include/asm/tls.h
> +++ b/arch/arm/include/asm/tls.h
> @@ -43,6 +43,7 @@
>  #define switch_tls     switch_tls_v6
>  #elif defined(CONFIG_CPU_32v6K)
>  #define tls_emu                0
> +#error
>  #define has_tls_reg            1
>  #define switch_tls     switch_tls_v6k
>  #else
> 
> What I've *not* tested:
> * As this CPU does not have the tls register, I haven't run any tests that try to read/write it, but we wouldn't expect that to work, right?
> * Haven't tried a V7 platform, as I understand you've already done that.
> * Haven't verified it still builds for v4 (which was the problem for ldrd in the past and now we've put it back, so that's important)

It shouldn't matter as v4 would not use the v6 codepath, would it?

> * Performance impact

I only could test it on v7, would be interesting to see some v6, v5, ... "benchmarks", though.

> 
> I think it'd be good if you can verify the bottom two points - especially the ldrd one: perhaps Russell can tell you which platform it broke on before?
> 
> So, that's a
> Tested-by: Jonathan Austin <jonathan.austin@arm.com>
> 
> If you want it :)

sure, thx.

> Hope that helps,
> 
> Jonny
> 
>> Signed-off-by: André Hentschel <nerv@dawncrow.de>
>>
>> ---
>> This patch is against 28fbc8b6a29c849a3f03a6b05010d4b584055665
>>
>> diff --git a/arch/arm/include/asm/tls.h b/arch/arm/include/asm/tls.h
>> index 83259b8..31743f7 100644
>> --- a/arch/arm/include/asm/tls.h
>> +++ b/arch/arm/include/asm/tls.h
>> @@ -3,29 +3,31 @@
>>
>>   #ifdef __ASSEMBLY__
>>   #include <asm/asm-offsets.h>
>> -    .macro switch_tls_none, base, tp, tpuser, tmp1, tmp2
>> +    .macro switch_tls_none, prev, next, tp, tpuser, tmp1, tmp2
>>       .endm
>>
>> -    .macro switch_tls_v6k, base, tp, tpuser, tmp1, tmp2
>> +    .macro switch_tls_v6k, prev, next, tp, tpuser, tmp1, tmp2
>> +    ldrd    \tp, \tpuser, [\next, #TI_TP_VALUE]    @ get the next TLS and user r/w register
>>       mrc    p15, 0, \tmp2, c13, c0, 2    @ get the user r/w register
>>       mcr    p15, 0, \tp, c13, c0, 3        @ set TLS register
>>       mcr    p15, 0, \tpuser, c13, c0, 2    @ and the user r/w register
>> -    str    \tmp2, [\base, #TI_TP_VALUE + 4] @ save it
>> +    str    \tmp2, [\prev, #TI_TP_VALUE + 4] @ save it
>>       .endm
>>
>> -    .macro switch_tls_v6, base, tp, tpuser, tmp1, tmp2
>> +    .macro switch_tls_v6, prev, next, tp, tpuser, tmp1, tmp2
>> +    ldrd    \tp, \tpuser, [\next, #TI_TP_VALUE]
>>       ldr    \tmp1, =elf_hwcap
>>       ldr    \tmp1, [\tmp1, #0]
>>       mov    \tmp2, #0xffff0fff
>>       tst    \tmp1, #HWCAP_TLS        @ hardware TLS available?
>>       streq    \tp, [\tmp2, #-15]        @ set TLS value at 0xffff0ff0
>> -    mrcne    p15, 0, \tmp2, c13, c0, 2    @ get the user r/w register
>> +    mrcne    p15, 0, \tmp2, c13, c0, 2    @ get the previous user r/w register
>>       mcrne    p15, 0, \tp, c13, c0, 3        @ yes, set TLS register
>>       mcrne    p15, 0, \tpuser, c13, c0, 2    @ set user r/w register
>> -    strne    \tmp2, [\base, #TI_TP_VALUE + 4] @ save it
>> +    strne    \tmp2, [\prev, #TI_TP_VALUE + 4] @ save it
>>       .endm
>>
>> -    .macro switch_tls_software, base, tp, tpuser, tmp1, tmp2
>> +    .macro switch_tls_software, prev, next, tp, tpuser, tmp1, tmp2
>>       mov    \tmp1, #0xffff0fff
>>       str    \tp, [\tmp1, #-15]        @ set TLS value at 0xffff0ff0
>>       .endm
>> diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
>> index d40d0ef..11112de 100644
>> --- a/arch/arm/kernel/entry-armv.S
>> +++ b/arch/arm/kernel/entry-armv.S
>> @@ -689,12 +689,10 @@ ENTRY(__switch_to)
>>    THUMB(    stmia    ip!, {r4 - sl, fp}       )    @ Store most regs on stack
>>    THUMB(    str    sp, [ip], #4           )
>>    THUMB(    str    lr, [ip], #4           )
>> -    ldr    r4, [r2, #TI_TP_VALUE]
>> -    ldr    r5, [r2, #TI_TP_VALUE + 4]
>>   #ifdef CONFIG_CPU_USE_DOMAINS
>>       ldr    r6, [r2, #TI_CPU_DOMAIN]
>>   #endif
>> -    switch_tls r1, r4, r5, r3, r7
>> +    switch_tls r1, r2, r4, r5, r3, r7
>>   #if defined(CONFIG_CC_STACKPROTECTOR) && !defined(CONFIG_SMP)
>>       ldr    r7, [r2, #TI_TASK]
>>       ldr    r8, =__stack_chk_guard
>>
>>
>>
> 
> 
> -- 
> To unsubscribe from this list: send the line "unsubscribe linux-arch" in
> the body of a message to majordomo at vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: arm: Only load TLS values when needed
  2013-08-15 18:27                   ` André Hentschel
  (?)
@ 2013-08-26 19:19                     ` André Hentschel
  -1 siblings, 0 replies; 28+ messages in thread
From: André Hentschel @ 2013-08-26 19:19 UTC (permalink / raw)
  To: Jonathan Austin
  Cc: linux-kernel, Will Deacon, linux-arch, linux-arm-kernel, Russell King

Am 15.08.2013 20:27, schrieb André Hentschel:
> Hi Jonathan,
> 
> Am 15.08.2013 19:29, schrieb Jonathan Austin:
>> Hi André
>> What I've *not* tested:
>> * As this CPU does not have the tls register, I haven't run any tests that try to read/write it, but we wouldn't expect that to work, right?
>> * Haven't tried a V7 platform, as I understand you've already done that.
>> * Haven't verified it still builds for v4 (which was the problem for ldrd in the past and now we've put it back, so that's important)
> 
> It shouldn't matter as v4 would not use the v6 codepath, would it?
> 
>> * Performance impact
> 
> I only could test it on v7, would be interesting to see some v6, v5, ... "benchmarks", though.
> 

I did some "benchmarks" on my pandaboard(armv7) and it turns out that ldrd is slightly slower than two ldr.
So with this in mind the best thing is most likely the easiest one, so what about this untested patch below?
(It intends to improve the situation for tls_none and tls_software)

diff --git a/arch/arm/include/asm/tls.h b/arch/arm/include/asm/tls.h
index 83259b8..1dd50f8 100644
--- a/arch/arm/include/asm/tls.h
+++ b/arch/arm/include/asm/tls.h
@@ -3,17 +3,21 @@
 
 #ifdef __ASSEMBLY__
 #include <asm/asm-offsets.h>
-	.macro switch_tls_none, base, tp, tpuser, tmp1, tmp2
+	.macro switch_tls_none, prev, next, tp, tpuser, tmp1, tmp2
 	.endm
 
-	.macro switch_tls_v6k, base, tp, tpuser, tmp1, tmp2
+	.macro switch_tls_v6k, prev, next, tp, tpuser, tmp1, tmp2
+	ldr	\tp, [\next, #TI_TP_VALUE]
+	ldr	\tpuser, [\next, #TI_TP_VALUE + 4]
 	mrc	p15, 0, \tmp2, c13, c0, 2	@ get the user r/w register
 	mcr	p15, 0, \tp, c13, c0, 3		@ set TLS register
 	mcr	p15, 0, \tpuser, c13, c0, 2	@ and the user r/w register
-	str	\tmp2, [\base, #TI_TP_VALUE + 4] @ save it
+	str	\tmp2, [\prev, #TI_TP_VALUE + 4] @ save it
 	.endm
 
-	.macro switch_tls_v6, base, tp, tpuser, tmp1, tmp2
+	.macro switch_tls_v6, prev, next, tp, tpuser, tmp1, tmp2
+	ldr	\tp, [\next, #TI_TP_VALUE]
+	ldr	\tpuser, [\next, #TI_TP_VALUE + 4]
 	ldr	\tmp1, =elf_hwcap
 	ldr	\tmp1, [\tmp1, #0]
 	mov	\tmp2, #0xffff0fff
@@ -22,10 +26,11 @@
 	mrcne	p15, 0, \tmp2, c13, c0, 2	@ get the user r/w register
 	mcrne	p15, 0, \tp, c13, c0, 3		@ yes, set TLS register
 	mcrne	p15, 0, \tpuser, c13, c0, 2	@ set user r/w register
-	strne	\tmp2, [\base, #TI_TP_VALUE + 4] @ save it
+	strne	\tmp2, [\prev, #TI_TP_VALUE + 4] @ save it
 	.endm
 
-	.macro switch_tls_software, base, tp, tpuser, tmp1, tmp2
+	.macro switch_tls_software, prev, next, tp, tpuser, tmp1, tmp2
+	ldr	\tp, [\next, #TI_TP_VALUE]
 	mov	\tmp1, #0xffff0fff
 	str	\tp, [\tmp1, #-15]		@ set TLS value at 0xffff0ff0
 	.endm
diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
index 9cbe70c..4df8976 100644
--- a/arch/arm/kernel/entry-armv.S
+++ b/arch/arm/kernel/entry-armv.S
@@ -690,12 +690,10 @@ ENTRY(__switch_to)
  THUMB(	stmia	ip!, {r4 - sl, fp}	   )	@ Store most regs on stack
  THUMB(	str	sp, [ip], #4		   )
  THUMB(	str	lr, [ip], #4		   )
-	ldr	r4, [r2, #TI_TP_VALUE]
-	ldr	r5, [r2, #TI_TP_VALUE + 4]
 #ifdef CONFIG_CPU_USE_DOMAINS
 	ldr	r6, [r2, #TI_CPU_DOMAIN]
 #endif
-	switch_tls r1, r4, r5, r3, r7
+	switch_tls r1, r2, r4, r5, r3, r7
 #if defined(CONFIG_CC_STACKPROTECTOR) && !defined(CONFIG_SMP)
 	ldr	r7, [r2, #TI_TASK]
 	ldr	r8, =__stack_chk_guard


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: arm: Only load TLS values when needed
@ 2013-08-26 19:19                     ` André Hentschel
  0 siblings, 0 replies; 28+ messages in thread
From: André Hentschel @ 2013-08-26 19:19 UTC (permalink / raw)
  To: Jonathan Austin
  Cc: linux-kernel, Will Deacon, linux-arch, linux-arm-kernel, Russell King

Am 15.08.2013 20:27, schrieb André Hentschel:
> Hi Jonathan,
> 
> Am 15.08.2013 19:29, schrieb Jonathan Austin:
>> Hi André
>> What I've *not* tested:
>> * As this CPU does not have the tls register, I haven't run any tests that try to read/write it, but we wouldn't expect that to work, right?
>> * Haven't tried a V7 platform, as I understand you've already done that.
>> * Haven't verified it still builds for v4 (which was the problem for ldrd in the past and now we've put it back, so that's important)
> 
> It shouldn't matter as v4 would not use the v6 codepath, would it?
> 
>> * Performance impact
> 
> I only could test it on v7, would be interesting to see some v6, v5, ... "benchmarks", though.
> 

I did some "benchmarks" on my pandaboard(armv7) and it turns out that ldrd is slightly slower than two ldr.
So with this in mind the best thing is most likely the easiest one, so what about this untested patch below?
(It intends to improve the situation for tls_none and tls_software)

diff --git a/arch/arm/include/asm/tls.h b/arch/arm/include/asm/tls.h
index 83259b8..1dd50f8 100644
--- a/arch/arm/include/asm/tls.h
+++ b/arch/arm/include/asm/tls.h
@@ -3,17 +3,21 @@
 
 #ifdef __ASSEMBLY__
 #include <asm/asm-offsets.h>
-	.macro switch_tls_none, base, tp, tpuser, tmp1, tmp2
+	.macro switch_tls_none, prev, next, tp, tpuser, tmp1, tmp2
 	.endm
 
-	.macro switch_tls_v6k, base, tp, tpuser, tmp1, tmp2
+	.macro switch_tls_v6k, prev, next, tp, tpuser, tmp1, tmp2
+	ldr	\tp, [\next, #TI_TP_VALUE]
+	ldr	\tpuser, [\next, #TI_TP_VALUE + 4]
 	mrc	p15, 0, \tmp2, c13, c0, 2	@ get the user r/w register
 	mcr	p15, 0, \tp, c13, c0, 3		@ set TLS register
 	mcr	p15, 0, \tpuser, c13, c0, 2	@ and the user r/w register
-	str	\tmp2, [\base, #TI_TP_VALUE + 4] @ save it
+	str	\tmp2, [\prev, #TI_TP_VALUE + 4] @ save it
 	.endm
 
-	.macro switch_tls_v6, base, tp, tpuser, tmp1, tmp2
+	.macro switch_tls_v6, prev, next, tp, tpuser, tmp1, tmp2
+	ldr	\tp, [\next, #TI_TP_VALUE]
+	ldr	\tpuser, [\next, #TI_TP_VALUE + 4]
 	ldr	\tmp1, =elf_hwcap
 	ldr	\tmp1, [\tmp1, #0]
 	mov	\tmp2, #0xffff0fff
@@ -22,10 +26,11 @@
 	mrcne	p15, 0, \tmp2, c13, c0, 2	@ get the user r/w register
 	mcrne	p15, 0, \tp, c13, c0, 3		@ yes, set TLS register
 	mcrne	p15, 0, \tpuser, c13, c0, 2	@ set user r/w register
-	strne	\tmp2, [\base, #TI_TP_VALUE + 4] @ save it
+	strne	\tmp2, [\prev, #TI_TP_VALUE + 4] @ save it
 	.endm
 
-	.macro switch_tls_software, base, tp, tpuser, tmp1, tmp2
+	.macro switch_tls_software, prev, next, tp, tpuser, tmp1, tmp2
+	ldr	\tp, [\next, #TI_TP_VALUE]
 	mov	\tmp1, #0xffff0fff
 	str	\tp, [\tmp1, #-15]		@ set TLS value at 0xffff0ff0
 	.endm
diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
index 9cbe70c..4df8976 100644
--- a/arch/arm/kernel/entry-armv.S
+++ b/arch/arm/kernel/entry-armv.S
@@ -690,12 +690,10 @@ ENTRY(__switch_to)
  THUMB(	stmia	ip!, {r4 - sl, fp}	   )	@ Store most regs on stack
  THUMB(	str	sp, [ip], #4		   )
  THUMB(	str	lr, [ip], #4		   )
-	ldr	r4, [r2, #TI_TP_VALUE]
-	ldr	r5, [r2, #TI_TP_VALUE + 4]
 #ifdef CONFIG_CPU_USE_DOMAINS
 	ldr	r6, [r2, #TI_CPU_DOMAIN]
 #endif
-	switch_tls r1, r4, r5, r3, r7
+	switch_tls r1, r2, r4, r5, r3, r7
 #if defined(CONFIG_CC_STACKPROTECTOR) && !defined(CONFIG_SMP)
 	ldr	r7, [r2, #TI_TASK]
 	ldr	r8, =__stack_chk_guard

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* arm: Only load TLS values when needed
@ 2013-08-26 19:19                     ` André Hentschel
  0 siblings, 0 replies; 28+ messages in thread
From: André Hentschel @ 2013-08-26 19:19 UTC (permalink / raw)
  To: linux-arm-kernel

Am 15.08.2013 20:27, schrieb André Hentschel:
> Hi Jonathan,
> 
> Am 15.08.2013 19:29, schrieb Jonathan Austin:
>> Hi André
>> What I've *not* tested:
>> * As this CPU does not have the tls register, I haven't run any tests that try to read/write it, but we wouldn't expect that to work, right?
>> * Haven't tried a V7 platform, as I understand you've already done that.
>> * Haven't verified it still builds for v4 (which was the problem for ldrd in the past and now we've put it back, so that's important)
> 
> It shouldn't matter as v4 would not use the v6 codepath, would it?
> 
>> * Performance impact
> 
> I only could test it on v7, would be interesting to see some v6, v5, ... "benchmarks", though.
> 

I did some "benchmarks" on my pandaboard(armv7) and it turns out that ldrd is slightly slower than two ldr.
So with this in mind the best thing is most likely the easiest one, so what about this untested patch below?
(It intends to improve the situation for tls_none and tls_software)

diff --git a/arch/arm/include/asm/tls.h b/arch/arm/include/asm/tls.h
index 83259b8..1dd50f8 100644
--- a/arch/arm/include/asm/tls.h
+++ b/arch/arm/include/asm/tls.h
@@ -3,17 +3,21 @@
 
 #ifdef __ASSEMBLY__
 #include <asm/asm-offsets.h>
-	.macro switch_tls_none, base, tp, tpuser, tmp1, tmp2
+	.macro switch_tls_none, prev, next, tp, tpuser, tmp1, tmp2
 	.endm
 
-	.macro switch_tls_v6k, base, tp, tpuser, tmp1, tmp2
+	.macro switch_tls_v6k, prev, next, tp, tpuser, tmp1, tmp2
+	ldr	\tp, [\next, #TI_TP_VALUE]
+	ldr	\tpuser, [\next, #TI_TP_VALUE + 4]
 	mrc	p15, 0, \tmp2, c13, c0, 2	@ get the user r/w register
 	mcr	p15, 0, \tp, c13, c0, 3		@ set TLS register
 	mcr	p15, 0, \tpuser, c13, c0, 2	@ and the user r/w register
-	str	\tmp2, [\base, #TI_TP_VALUE + 4] @ save it
+	str	\tmp2, [\prev, #TI_TP_VALUE + 4] @ save it
 	.endm
 
-	.macro switch_tls_v6, base, tp, tpuser, tmp1, tmp2
+	.macro switch_tls_v6, prev, next, tp, tpuser, tmp1, tmp2
+	ldr	\tp, [\next, #TI_TP_VALUE]
+	ldr	\tpuser, [\next, #TI_TP_VALUE + 4]
 	ldr	\tmp1, =elf_hwcap
 	ldr	\tmp1, [\tmp1, #0]
 	mov	\tmp2, #0xffff0fff
@@ -22,10 +26,11 @@
 	mrcne	p15, 0, \tmp2, c13, c0, 2	@ get the user r/w register
 	mcrne	p15, 0, \tp, c13, c0, 3		@ yes, set TLS register
 	mcrne	p15, 0, \tpuser, c13, c0, 2	@ set user r/w register
-	strne	\tmp2, [\base, #TI_TP_VALUE + 4] @ save it
+	strne	\tmp2, [\prev, #TI_TP_VALUE + 4] @ save it
 	.endm
 
-	.macro switch_tls_software, base, tp, tpuser, tmp1, tmp2
+	.macro switch_tls_software, prev, next, tp, tpuser, tmp1, tmp2
+	ldr	\tp, [\next, #TI_TP_VALUE]
 	mov	\tmp1, #0xffff0fff
 	str	\tp, [\tmp1, #-15]		@ set TLS value at 0xffff0ff0
 	.endm
diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
index 9cbe70c..4df8976 100644
--- a/arch/arm/kernel/entry-armv.S
+++ b/arch/arm/kernel/entry-armv.S
@@ -690,12 +690,10 @@ ENTRY(__switch_to)
  THUMB(	stmia	ip!, {r4 - sl, fp}	   )	@ Store most regs on stack
  THUMB(	str	sp, [ip], #4		   )
  THUMB(	str	lr, [ip], #4		   )
-	ldr	r4, [r2, #TI_TP_VALUE]
-	ldr	r5, [r2, #TI_TP_VALUE + 4]
 #ifdef CONFIG_CPU_USE_DOMAINS
 	ldr	r6, [r2, #TI_CPU_DOMAIN]
 #endif
-	switch_tls r1, r4, r5, r3, r7
+	switch_tls r1, r2, r4, r5, r3, r7
 #if defined(CONFIG_CC_STACKPROTECTOR) && !defined(CONFIG_SMP)
 	ldr	r7, [r2, #TI_TASK]
 	ldr	r8, =__stack_chk_guard

^ permalink raw reply related	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2013-08-26 19:20 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-07-15 17:14 arm: Only load TLS values when needed André Hentschel
2013-07-15 17:14 ` André Hentschel
2013-07-15 17:14 ` André Hentschel
2013-07-16 17:31 ` Jonathan Austin
2013-07-16 17:31   ` Jonathan Austin
2013-07-16 17:31   ` Jonathan Austin
2013-07-16 19:27   ` André Hentschel
2013-07-16 19:27     ` André Hentschel
2013-07-16 19:27     ` André Hentschel
2013-07-17 11:10     ` Jonathan Austin
2013-07-17 11:10       ` Jonathan Austin
2013-07-17 11:10       ` Jonathan Austin
2013-07-17 19:49       ` André Hentschel
2013-07-17 19:49         ` André Hentschel
2013-07-17 19:49         ` André Hentschel
2013-08-14 14:07         ` André Hentschel
2013-08-14 16:20           ` Jonathan Austin
2013-08-14 21:21             ` André Hentschel
2013-08-15 17:29               ` Jonathan Austin
2013-08-15 17:29                 ` Jonathan Austin
2013-08-15 17:29                 ` Jonathan Austin
2013-08-15 18:27                 ` André Hentschel
2013-08-15 18:27                   ` André Hentschel
2013-08-15 18:27                   ` André Hentschel
2013-08-15 18:27                   ` André Hentschel
2013-08-26 19:19                   ` André Hentschel
2013-08-26 19:19                     ` André Hentschel
2013-08-26 19:19                     ` André Hentschel

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.