All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] powerpc/vdso32: inline __get_datapage()
@ 2019-08-16 14:48 ` Christophe Leroy
  0 siblings, 0 replies; 13+ messages in thread
From: Christophe Leroy @ 2019-08-16 14:48 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linux-kernel, linuxppc-dev

__get_datapage() is only a few instructions to retrieve the
address of the page where the kernel stores data to the VDSO.

By inlining this function into its users, a bl/blr pair and
a mflr/mtlr pair is avoided, plus a few reg moves.

The improvement is noticeable (about 55 nsec/call on an 8xx)

vdsotest before the patch:
gettimeofday:    vdso: 731 nsec/call
clock-gettime-realtime-coarse:    vdso: 668 nsec/call
clock-gettime-monotonic-coarse:    vdso: 745 nsec/call

vdsotest after the patch:
gettimeofday:    vdso: 677 nsec/call
clock-gettime-realtime-coarse:    vdso: 613 nsec/call
clock-gettime-monotonic-coarse:    vdso: 690 nsec/call

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
 arch/powerpc/kernel/vdso32/cacheflush.S   | 10 +++++-----
 arch/powerpc/kernel/vdso32/datapage.S     | 29 ++++-------------------------
 arch/powerpc/kernel/vdso32/datapage.h     | 12 ++++++++++++
 arch/powerpc/kernel/vdso32/gettimeofday.S | 11 +++++------
 4 files changed, 26 insertions(+), 36 deletions(-)
 create mode 100644 arch/powerpc/kernel/vdso32/datapage.h

diff --git a/arch/powerpc/kernel/vdso32/cacheflush.S b/arch/powerpc/kernel/vdso32/cacheflush.S
index 7f882e7b9f43..e9453837e4ee 100644
--- a/arch/powerpc/kernel/vdso32/cacheflush.S
+++ b/arch/powerpc/kernel/vdso32/cacheflush.S
@@ -10,6 +10,8 @@
 #include <asm/vdso.h>
 #include <asm/asm-offsets.h>
 
+#include "datapage.h"
+
 	.text
 
 /*
@@ -24,14 +26,12 @@ V_FUNCTION_BEGIN(__kernel_sync_dicache)
   .cfi_startproc
 	mflr	r12
   .cfi_register lr,r12
-	mr	r11,r3
-	bl	__get_datapage@local
+	get_datapage	r10, r0
 	mtlr	r12
-	mr	r10,r3
 
 	lwz	r7,CFG_DCACHE_BLOCKSZ(r10)
 	addi	r5,r7,-1
-	andc	r6,r11,r5		/* round low to line bdy */
+	andc	r6,r3,r5		/* round low to line bdy */
 	subf	r8,r6,r4		/* compute length */
 	add	r8,r8,r5		/* ensure we get enough */
 	lwz	r9,CFG_DCACHE_LOGBLOCKSZ(r10)
@@ -48,7 +48,7 @@ V_FUNCTION_BEGIN(__kernel_sync_dicache)
 
 	lwz	r7,CFG_ICACHE_BLOCKSZ(r10)
 	addi	r5,r7,-1
-	andc	r6,r11,r5		/* round low to line bdy */
+	andc	r6,r3,r5		/* round low to line bdy */
 	subf	r8,r6,r4		/* compute length */
 	add	r8,r8,r5
 	lwz	r9,CFG_ICACHE_LOGBLOCKSZ(r10)
diff --git a/arch/powerpc/kernel/vdso32/datapage.S b/arch/powerpc/kernel/vdso32/datapage.S
index 6984125b9fc0..d480d2d4a3fe 100644
--- a/arch/powerpc/kernel/vdso32/datapage.S
+++ b/arch/powerpc/kernel/vdso32/datapage.S
@@ -11,34 +11,13 @@
 #include <asm/unistd.h>
 #include <asm/vdso.h>
 
+#include "datapage.h"
+
 	.text
 	.global	__kernel_datapage_offset;
 __kernel_datapage_offset:
 	.long	0
 
-V_FUNCTION_BEGIN(__get_datapage)
-  .cfi_startproc
-	/* We don't want that exposed or overridable as we want other objects
-	 * to be able to bl directly to here
-	 */
-	.protected __get_datapage
-	.hidden __get_datapage
-
-	mflr	r0
-  .cfi_register lr,r0
-
-	bcl	20,31,data_page_branch
-data_page_branch:
-	mflr	r3
-	mtlr	r0
-	addi	r3, r3, __kernel_datapage_offset-data_page_branch
-	lwz	r0,0(r3)
-  .cfi_restore lr
-	add	r3,r0,r3
-	blr
-  .cfi_endproc
-V_FUNCTION_END(__get_datapage)
-
 /*
  * void *__kernel_get_syscall_map(unsigned int *syscall_count) ;
  *
@@ -53,7 +32,7 @@ V_FUNCTION_BEGIN(__kernel_get_syscall_map)
 	mflr	r12
   .cfi_register lr,r12
 	mr	r4,r3
-	bl	__get_datapage@local
+	get_datapage	r3, r0
 	mtlr	r12
 	addi	r3,r3,CFG_SYSCALL_MAP32
 	cmpli	cr0,r4,0
@@ -74,7 +53,7 @@ V_FUNCTION_BEGIN(__kernel_get_tbfreq)
   .cfi_startproc
 	mflr	r12
   .cfi_register lr,r12
-	bl	__get_datapage@local
+	get_datapage	r3, r0
 	lwz	r4,(CFG_TB_TICKS_PER_SEC + 4)(r3)
 	lwz	r3,CFG_TB_TICKS_PER_SEC(r3)
 	mtlr	r12
diff --git a/arch/powerpc/kernel/vdso32/datapage.h b/arch/powerpc/kernel/vdso32/datapage.h
new file mode 100644
index 000000000000..ad96256be090
--- /dev/null
+++ b/arch/powerpc/kernel/vdso32/datapage.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+.macro get_datapage ptr, tmp
+	bcl	20,31,888f
+888:
+	mflr	\ptr
+	addi	\ptr, \ptr, __kernel_datapage_offset - 888b
+	lwz	\tmp, 0(\ptr)
+	add	\ptr, \tmp, \ptr
+.endm
+
+
diff --git a/arch/powerpc/kernel/vdso32/gettimeofday.S b/arch/powerpc/kernel/vdso32/gettimeofday.S
index e10098cde89c..91a58f01dcd5 100644
--- a/arch/powerpc/kernel/vdso32/gettimeofday.S
+++ b/arch/powerpc/kernel/vdso32/gettimeofday.S
@@ -12,6 +12,8 @@
 #include <asm/asm-offsets.h>
 #include <asm/unistd.h>
 
+#include "datapage.h"
+
 /* Offset for the low 32-bit part of a field of long type */
 #ifdef CONFIG_PPC64
 #define LOPART	4
@@ -35,8 +37,7 @@ V_FUNCTION_BEGIN(__kernel_gettimeofday)
 
 	mr	r10,r3			/* r10 saves tv */
 	mr	r11,r4			/* r11 saves tz */
-	bl	__get_datapage@local	/* get data page */
-	mr	r9, r3			/* datapage ptr in r9 */
+	get_datapage	r9, r0
 	cmplwi	r10,0			/* check if tv is NULL */
 	beq	3f
 	lis	r7,1000000@ha		/* load up USEC_PER_SEC */
@@ -82,8 +83,7 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime)
 	mflr	r12			/* r12 saves lr */
   .cfi_register lr,r12
 	mr	r11,r4			/* r11 saves tp */
-	bl	__get_datapage@local	/* get data page */
-	mr	r9,r3			/* datapage ptr in r9 */
+	get_datapage	r9, r0
 	lis	r7,NSEC_PER_SEC@h	/* want nanoseconds */
 	ori	r7,r7,NSEC_PER_SEC@l
 	beq	cr5,70f
@@ -235,8 +235,7 @@ V_FUNCTION_BEGIN(__kernel_time)
   .cfi_register lr,r12
 
 	mr	r11,r3			/* r11 holds t */
-	bl	__get_datapage@local
-	mr	r9, r3			/* datapage ptr in r9 */
+	get_datapage	r9, r0
 
 	lwz	r3,STAMP_XTIME+TSPEC_TV_SEC(r9)
 
-- 
2.13.3


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH] powerpc/vdso32: inline __get_datapage()
@ 2019-08-16 14:48 ` Christophe Leroy
  0 siblings, 0 replies; 13+ messages in thread
From: Christophe Leroy @ 2019-08-16 14:48 UTC (permalink / raw)
  To: Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel

__get_datapage() is only a few instructions to retrieve the
address of the page where the kernel stores data to the VDSO.

By inlining this function into its users, a bl/blr pair and
a mflr/mtlr pair is avoided, plus a few reg moves.

The improvement is noticeable (about 55 nsec/call on an 8xx)

vdsotest before the patch:
gettimeofday:    vdso: 731 nsec/call
clock-gettime-realtime-coarse:    vdso: 668 nsec/call
clock-gettime-monotonic-coarse:    vdso: 745 nsec/call

vdsotest after the patch:
gettimeofday:    vdso: 677 nsec/call
clock-gettime-realtime-coarse:    vdso: 613 nsec/call
clock-gettime-monotonic-coarse:    vdso: 690 nsec/call

Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
---
 arch/powerpc/kernel/vdso32/cacheflush.S   | 10 +++++-----
 arch/powerpc/kernel/vdso32/datapage.S     | 29 ++++-------------------------
 arch/powerpc/kernel/vdso32/datapage.h     | 12 ++++++++++++
 arch/powerpc/kernel/vdso32/gettimeofday.S | 11 +++++------
 4 files changed, 26 insertions(+), 36 deletions(-)
 create mode 100644 arch/powerpc/kernel/vdso32/datapage.h

diff --git a/arch/powerpc/kernel/vdso32/cacheflush.S b/arch/powerpc/kernel/vdso32/cacheflush.S
index 7f882e7b9f43..e9453837e4ee 100644
--- a/arch/powerpc/kernel/vdso32/cacheflush.S
+++ b/arch/powerpc/kernel/vdso32/cacheflush.S
@@ -10,6 +10,8 @@
 #include <asm/vdso.h>
 #include <asm/asm-offsets.h>
 
+#include "datapage.h"
+
 	.text
 
 /*
@@ -24,14 +26,12 @@ V_FUNCTION_BEGIN(__kernel_sync_dicache)
   .cfi_startproc
 	mflr	r12
   .cfi_register lr,r12
-	mr	r11,r3
-	bl	__get_datapage@local
+	get_datapage	r10, r0
 	mtlr	r12
-	mr	r10,r3
 
 	lwz	r7,CFG_DCACHE_BLOCKSZ(r10)
 	addi	r5,r7,-1
-	andc	r6,r11,r5		/* round low to line bdy */
+	andc	r6,r3,r5		/* round low to line bdy */
 	subf	r8,r6,r4		/* compute length */
 	add	r8,r8,r5		/* ensure we get enough */
 	lwz	r9,CFG_DCACHE_LOGBLOCKSZ(r10)
@@ -48,7 +48,7 @@ V_FUNCTION_BEGIN(__kernel_sync_dicache)
 
 	lwz	r7,CFG_ICACHE_BLOCKSZ(r10)
 	addi	r5,r7,-1
-	andc	r6,r11,r5		/* round low to line bdy */
+	andc	r6,r3,r5		/* round low to line bdy */
 	subf	r8,r6,r4		/* compute length */
 	add	r8,r8,r5
 	lwz	r9,CFG_ICACHE_LOGBLOCKSZ(r10)
diff --git a/arch/powerpc/kernel/vdso32/datapage.S b/arch/powerpc/kernel/vdso32/datapage.S
index 6984125b9fc0..d480d2d4a3fe 100644
--- a/arch/powerpc/kernel/vdso32/datapage.S
+++ b/arch/powerpc/kernel/vdso32/datapage.S
@@ -11,34 +11,13 @@
 #include <asm/unistd.h>
 #include <asm/vdso.h>
 
+#include "datapage.h"
+
 	.text
 	.global	__kernel_datapage_offset;
 __kernel_datapage_offset:
 	.long	0
 
-V_FUNCTION_BEGIN(__get_datapage)
-  .cfi_startproc
-	/* We don't want that exposed or overridable as we want other objects
-	 * to be able to bl directly to here
-	 */
-	.protected __get_datapage
-	.hidden __get_datapage
-
-	mflr	r0
-  .cfi_register lr,r0
-
-	bcl	20,31,data_page_branch
-data_page_branch:
-	mflr	r3
-	mtlr	r0
-	addi	r3, r3, __kernel_datapage_offset-data_page_branch
-	lwz	r0,0(r3)
-  .cfi_restore lr
-	add	r3,r0,r3
-	blr
-  .cfi_endproc
-V_FUNCTION_END(__get_datapage)
-
 /*
  * void *__kernel_get_syscall_map(unsigned int *syscall_count) ;
  *
@@ -53,7 +32,7 @@ V_FUNCTION_BEGIN(__kernel_get_syscall_map)
 	mflr	r12
   .cfi_register lr,r12
 	mr	r4,r3
-	bl	__get_datapage@local
+	get_datapage	r3, r0
 	mtlr	r12
 	addi	r3,r3,CFG_SYSCALL_MAP32
 	cmpli	cr0,r4,0
@@ -74,7 +53,7 @@ V_FUNCTION_BEGIN(__kernel_get_tbfreq)
   .cfi_startproc
 	mflr	r12
   .cfi_register lr,r12
-	bl	__get_datapage@local
+	get_datapage	r3, r0
 	lwz	r4,(CFG_TB_TICKS_PER_SEC + 4)(r3)
 	lwz	r3,CFG_TB_TICKS_PER_SEC(r3)
 	mtlr	r12
diff --git a/arch/powerpc/kernel/vdso32/datapage.h b/arch/powerpc/kernel/vdso32/datapage.h
new file mode 100644
index 000000000000..ad96256be090
--- /dev/null
+++ b/arch/powerpc/kernel/vdso32/datapage.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+.macro get_datapage ptr, tmp
+	bcl	20,31,888f
+888:
+	mflr	\ptr
+	addi	\ptr, \ptr, __kernel_datapage_offset - 888b
+	lwz	\tmp, 0(\ptr)
+	add	\ptr, \tmp, \ptr
+.endm
+
+
diff --git a/arch/powerpc/kernel/vdso32/gettimeofday.S b/arch/powerpc/kernel/vdso32/gettimeofday.S
index e10098cde89c..91a58f01dcd5 100644
--- a/arch/powerpc/kernel/vdso32/gettimeofday.S
+++ b/arch/powerpc/kernel/vdso32/gettimeofday.S
@@ -12,6 +12,8 @@
 #include <asm/asm-offsets.h>
 #include <asm/unistd.h>
 
+#include "datapage.h"
+
 /* Offset for the low 32-bit part of a field of long type */
 #ifdef CONFIG_PPC64
 #define LOPART	4
@@ -35,8 +37,7 @@ V_FUNCTION_BEGIN(__kernel_gettimeofday)
 
 	mr	r10,r3			/* r10 saves tv */
 	mr	r11,r4			/* r11 saves tz */
-	bl	__get_datapage@local	/* get data page */
-	mr	r9, r3			/* datapage ptr in r9 */
+	get_datapage	r9, r0
 	cmplwi	r10,0			/* check if tv is NULL */
 	beq	3f
 	lis	r7,1000000@ha		/* load up USEC_PER_SEC */
@@ -82,8 +83,7 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime)
 	mflr	r12			/* r12 saves lr */
   .cfi_register lr,r12
 	mr	r11,r4			/* r11 saves tp */
-	bl	__get_datapage@local	/* get data page */
-	mr	r9,r3			/* datapage ptr in r9 */
+	get_datapage	r9, r0
 	lis	r7,NSEC_PER_SEC@h	/* want nanoseconds */
 	ori	r7,r7,NSEC_PER_SEC@l
 	beq	cr5,70f
@@ -235,8 +235,7 @@ V_FUNCTION_BEGIN(__kernel_time)
   .cfi_register lr,r12
 
 	mr	r11,r3			/* r11 holds t */
-	bl	__get_datapage@local
-	mr	r9, r3			/* datapage ptr in r9 */
+	get_datapage	r9, r0
 
 	lwz	r3,STAMP_XTIME+TSPEC_TV_SEC(r9)
 
-- 
2.13.3


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH] powerpc/vdso32: inline __get_datapage()
  2019-08-16 14:48 ` Christophe Leroy
@ 2019-08-21  4:37   ` Benjamin Herrenschmidt
  -1 siblings, 0 replies; 13+ messages in thread
From: Benjamin Herrenschmidt @ 2019-08-21  4:37 UTC (permalink / raw)
  To: Christophe Leroy, Paul Mackerras, Michael Ellerman
  Cc: linux-kernel, linuxppc-dev

On Fri, 2019-08-16 at 14:48 +0000, Christophe Leroy wrote:
> __get_datapage() is only a few instructions to retrieve the
> address of the page where the kernel stores data to the VDSO.
> 
> By inlining this function into its users, a bl/blr pair and
> a mflr/mtlr pair is avoided, plus a few reg moves.
> 
> The improvement is noticeable (about 55 nsec/call on an 8xx)

Interesting... would be worth doing the same on vdso64 no ?

> vdsotest before the patch:
> gettimeofday:    vdso: 731 nsec/call
> clock-gettime-realtime-coarse:    vdso: 668 nsec/call
> clock-gettime-monotonic-coarse:    vdso: 745 nsec/call
> 
> vdsotest after the patch:
> gettimeofday:    vdso: 677 nsec/call
> clock-gettime-realtime-coarse:    vdso: 613 nsec/call
> clock-gettime-monotonic-coarse:    vdso: 690 nsec/call
> 
> Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
> ---
>  arch/powerpc/kernel/vdso32/cacheflush.S   | 10 +++++-----
>  arch/powerpc/kernel/vdso32/datapage.S     | 29 ++++-----------------
> --------
>  arch/powerpc/kernel/vdso32/datapage.h     | 12 ++++++++++++
>  arch/powerpc/kernel/vdso32/gettimeofday.S | 11 +++++------
>  4 files changed, 26 insertions(+), 36 deletions(-)
>  create mode 100644 arch/powerpc/kernel/vdso32/datapage.h
> 
> diff --git a/arch/powerpc/kernel/vdso32/cacheflush.S
> b/arch/powerpc/kernel/vdso32/cacheflush.S
> index 7f882e7b9f43..e9453837e4ee 100644
> --- a/arch/powerpc/kernel/vdso32/cacheflush.S
> +++ b/arch/powerpc/kernel/vdso32/cacheflush.S
> @@ -10,6 +10,8 @@
>  #include <asm/vdso.h>
>  #include <asm/asm-offsets.h>
>  
> +#include "datapage.h"
> +
>  	.text
>  
>  /*
> @@ -24,14 +26,12 @@ V_FUNCTION_BEGIN(__kernel_sync_dicache)
>    .cfi_startproc
>  	mflr	r12
>    .cfi_register lr,r12
> -	mr	r11,r3
> -	bl	__get_datapage@local
> +	get_datapage	r10, r0
>  	mtlr	r12
> -	mr	r10,r3
>  
>  	lwz	r7,CFG_DCACHE_BLOCKSZ(r10)
>  	addi	r5,r7,-1
> -	andc	r6,r11,r5		/* round low to line bdy */
> +	andc	r6,r3,r5		/* round low to line bdy */
>  	subf	r8,r6,r4		/* compute length */
>  	add	r8,r8,r5		/* ensure we get enough */
>  	lwz	r9,CFG_DCACHE_LOGBLOCKSZ(r10)
> @@ -48,7 +48,7 @@ V_FUNCTION_BEGIN(__kernel_sync_dicache)
>  
>  	lwz	r7,CFG_ICACHE_BLOCKSZ(r10)
>  	addi	r5,r7,-1
> -	andc	r6,r11,r5		/* round low to line bdy */
> +	andc	r6,r3,r5		/* round low to line bdy */
>  	subf	r8,r6,r4		/* compute length */
>  	add	r8,r8,r5
>  	lwz	r9,CFG_ICACHE_LOGBLOCKSZ(r10)
> diff --git a/arch/powerpc/kernel/vdso32/datapage.S
> b/arch/powerpc/kernel/vdso32/datapage.S
> index 6984125b9fc0..d480d2d4a3fe 100644
> --- a/arch/powerpc/kernel/vdso32/datapage.S
> +++ b/arch/powerpc/kernel/vdso32/datapage.S
> @@ -11,34 +11,13 @@
>  #include <asm/unistd.h>
>  #include <asm/vdso.h>
>  
> +#include "datapage.h"
> +
>  	.text
>  	.global	__kernel_datapage_offset;
>  __kernel_datapage_offset:
>  	.long	0
>  
> -V_FUNCTION_BEGIN(__get_datapage)
> -  .cfi_startproc
> -	/* We don't want that exposed or overridable as we want other
> objects
> -	 * to be able to bl directly to here
> -	 */
> -	.protected __get_datapage
> -	.hidden __get_datapage
> -
> -	mflr	r0
> -  .cfi_register lr,r0
> -
> -	bcl	20,31,data_page_branch
> -data_page_branch:
> -	mflr	r3
> -	mtlr	r0
> -	addi	r3, r3, __kernel_datapage_offset-data_page_branch
> -	lwz	r0,0(r3)
> -  .cfi_restore lr
> -	add	r3,r0,r3
> -	blr
> -  .cfi_endproc
> -V_FUNCTION_END(__get_datapage)
> -
>  /*
>   * void *__kernel_get_syscall_map(unsigned int *syscall_count) ;
>   *
> @@ -53,7 +32,7 @@ V_FUNCTION_BEGIN(__kernel_get_syscall_map)
>  	mflr	r12
>    .cfi_register lr,r12
>  	mr	r4,r3
> -	bl	__get_datapage@local
> +	get_datapage	r3, r0
>  	mtlr	r12
>  	addi	r3,r3,CFG_SYSCALL_MAP32
>  	cmpli	cr0,r4,0
> @@ -74,7 +53,7 @@ V_FUNCTION_BEGIN(__kernel_get_tbfreq)
>    .cfi_startproc
>  	mflr	r12
>    .cfi_register lr,r12
> -	bl	__get_datapage@local
> +	get_datapage	r3, r0
>  	lwz	r4,(CFG_TB_TICKS_PER_SEC + 4)(r3)
>  	lwz	r3,CFG_TB_TICKS_PER_SEC(r3)
>  	mtlr	r12
> diff --git a/arch/powerpc/kernel/vdso32/datapage.h
> b/arch/powerpc/kernel/vdso32/datapage.h
> new file mode 100644
> index 000000000000..ad96256be090
> --- /dev/null
> +++ b/arch/powerpc/kernel/vdso32/datapage.h
> @@ -0,0 +1,12 @@
> +/* SPDX-License-Identifier: GPL-2.0-or-later */
> +
> +.macro get_datapage ptr, tmp
> +	bcl	20,31,888f
> +888:
> +	mflr	\ptr
> +	addi	\ptr, \ptr, __kernel_datapage_offset - 888b
> +	lwz	\tmp, 0(\ptr)
> +	add	\ptr, \tmp, \ptr
> +.endm
> +
> +
> diff --git a/arch/powerpc/kernel/vdso32/gettimeofday.S
> b/arch/powerpc/kernel/vdso32/gettimeofday.S
> index e10098cde89c..91a58f01dcd5 100644
> --- a/arch/powerpc/kernel/vdso32/gettimeofday.S
> +++ b/arch/powerpc/kernel/vdso32/gettimeofday.S
> @@ -12,6 +12,8 @@
>  #include <asm/asm-offsets.h>
>  #include <asm/unistd.h>
>  
> +#include "datapage.h"
> +
>  /* Offset for the low 32-bit part of a field of long type */
>  #ifdef CONFIG_PPC64
>  #define LOPART	4
> @@ -35,8 +37,7 @@ V_FUNCTION_BEGIN(__kernel_gettimeofday)
>  
>  	mr	r10,r3			/* r10 saves tv */
>  	mr	r11,r4			/* r11 saves tz */
> -	bl	__get_datapage@local	/* get data page */
> -	mr	r9, r3			/* datapage ptr in r9 */
> +	get_datapage	r9, r0
>  	cmplwi	r10,0			/* check if tv is NULL */
>  	beq	3f
>  	lis	r7,1000000@ha		/* load up USEC_PER_SEC */
> @@ -82,8 +83,7 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime)
>  	mflr	r12			/* r12 saves lr */
>    .cfi_register lr,r12
>  	mr	r11,r4			/* r11 saves tp */
> -	bl	__get_datapage@local	/* get data page */
> -	mr	r9,r3			/* datapage ptr in r9 */
> +	get_datapage	r9, r0
>  	lis	r7,NSEC_PER_SEC@h	/* want nanoseconds */
>  	ori	r7,r7,NSEC_PER_SEC@l
>  	beq	cr5,70f
> @@ -235,8 +235,7 @@ V_FUNCTION_BEGIN(__kernel_time)
>    .cfi_register lr,r12
>  
>  	mr	r11,r3			/* r11 holds t */
> -	bl	__get_datapage@local
> -	mr	r9, r3			/* datapage ptr in r9 */
> +	get_datapage	r9, r0
>  
>  	lwz	r3,STAMP_XTIME+TSPEC_TV_SEC(r9)
>  


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] powerpc/vdso32: inline __get_datapage()
@ 2019-08-21  4:37   ` Benjamin Herrenschmidt
  0 siblings, 0 replies; 13+ messages in thread
From: Benjamin Herrenschmidt @ 2019-08-21  4:37 UTC (permalink / raw)
  To: Christophe Leroy, Paul Mackerras, Michael Ellerman
  Cc: linuxppc-dev, linux-kernel

On Fri, 2019-08-16 at 14:48 +0000, Christophe Leroy wrote:
> __get_datapage() is only a few instructions to retrieve the
> address of the page where the kernel stores data to the VDSO.
> 
> By inlining this function into its users, a bl/blr pair and
> a mflr/mtlr pair is avoided, plus a few reg moves.
> 
> The improvement is noticeable (about 55 nsec/call on an 8xx)

Interesting... would be worth doing the same on vdso64 no ?

> vdsotest before the patch:
> gettimeofday:    vdso: 731 nsec/call
> clock-gettime-realtime-coarse:    vdso: 668 nsec/call
> clock-gettime-monotonic-coarse:    vdso: 745 nsec/call
> 
> vdsotest after the patch:
> gettimeofday:    vdso: 677 nsec/call
> clock-gettime-realtime-coarse:    vdso: 613 nsec/call
> clock-gettime-monotonic-coarse:    vdso: 690 nsec/call
> 
> Signed-off-by: Christophe Leroy <christophe.leroy@c-s.fr>
> ---
>  arch/powerpc/kernel/vdso32/cacheflush.S   | 10 +++++-----
>  arch/powerpc/kernel/vdso32/datapage.S     | 29 ++++-----------------
> --------
>  arch/powerpc/kernel/vdso32/datapage.h     | 12 ++++++++++++
>  arch/powerpc/kernel/vdso32/gettimeofday.S | 11 +++++------
>  4 files changed, 26 insertions(+), 36 deletions(-)
>  create mode 100644 arch/powerpc/kernel/vdso32/datapage.h
> 
> diff --git a/arch/powerpc/kernel/vdso32/cacheflush.S
> b/arch/powerpc/kernel/vdso32/cacheflush.S
> index 7f882e7b9f43..e9453837e4ee 100644
> --- a/arch/powerpc/kernel/vdso32/cacheflush.S
> +++ b/arch/powerpc/kernel/vdso32/cacheflush.S
> @@ -10,6 +10,8 @@
>  #include <asm/vdso.h>
>  #include <asm/asm-offsets.h>
>  
> +#include "datapage.h"
> +
>  	.text
>  
>  /*
> @@ -24,14 +26,12 @@ V_FUNCTION_BEGIN(__kernel_sync_dicache)
>    .cfi_startproc
>  	mflr	r12
>    .cfi_register lr,r12
> -	mr	r11,r3
> -	bl	__get_datapage@local
> +	get_datapage	r10, r0
>  	mtlr	r12
> -	mr	r10,r3
>  
>  	lwz	r7,CFG_DCACHE_BLOCKSZ(r10)
>  	addi	r5,r7,-1
> -	andc	r6,r11,r5		/* round low to line bdy */
> +	andc	r6,r3,r5		/* round low to line bdy */
>  	subf	r8,r6,r4		/* compute length */
>  	add	r8,r8,r5		/* ensure we get enough */
>  	lwz	r9,CFG_DCACHE_LOGBLOCKSZ(r10)
> @@ -48,7 +48,7 @@ V_FUNCTION_BEGIN(__kernel_sync_dicache)
>  
>  	lwz	r7,CFG_ICACHE_BLOCKSZ(r10)
>  	addi	r5,r7,-1
> -	andc	r6,r11,r5		/* round low to line bdy */
> +	andc	r6,r3,r5		/* round low to line bdy */
>  	subf	r8,r6,r4		/* compute length */
>  	add	r8,r8,r5
>  	lwz	r9,CFG_ICACHE_LOGBLOCKSZ(r10)
> diff --git a/arch/powerpc/kernel/vdso32/datapage.S
> b/arch/powerpc/kernel/vdso32/datapage.S
> index 6984125b9fc0..d480d2d4a3fe 100644
> --- a/arch/powerpc/kernel/vdso32/datapage.S
> +++ b/arch/powerpc/kernel/vdso32/datapage.S
> @@ -11,34 +11,13 @@
>  #include <asm/unistd.h>
>  #include <asm/vdso.h>
>  
> +#include "datapage.h"
> +
>  	.text
>  	.global	__kernel_datapage_offset;
>  __kernel_datapage_offset:
>  	.long	0
>  
> -V_FUNCTION_BEGIN(__get_datapage)
> -  .cfi_startproc
> -	/* We don't want that exposed or overridable as we want other
> objects
> -	 * to be able to bl directly to here
> -	 */
> -	.protected __get_datapage
> -	.hidden __get_datapage
> -
> -	mflr	r0
> -  .cfi_register lr,r0
> -
> -	bcl	20,31,data_page_branch
> -data_page_branch:
> -	mflr	r3
> -	mtlr	r0
> -	addi	r3, r3, __kernel_datapage_offset-data_page_branch
> -	lwz	r0,0(r3)
> -  .cfi_restore lr
> -	add	r3,r0,r3
> -	blr
> -  .cfi_endproc
> -V_FUNCTION_END(__get_datapage)
> -
>  /*
>   * void *__kernel_get_syscall_map(unsigned int *syscall_count) ;
>   *
> @@ -53,7 +32,7 @@ V_FUNCTION_BEGIN(__kernel_get_syscall_map)
>  	mflr	r12
>    .cfi_register lr,r12
>  	mr	r4,r3
> -	bl	__get_datapage@local
> +	get_datapage	r3, r0
>  	mtlr	r12
>  	addi	r3,r3,CFG_SYSCALL_MAP32
>  	cmpli	cr0,r4,0
> @@ -74,7 +53,7 @@ V_FUNCTION_BEGIN(__kernel_get_tbfreq)
>    .cfi_startproc
>  	mflr	r12
>    .cfi_register lr,r12
> -	bl	__get_datapage@local
> +	get_datapage	r3, r0
>  	lwz	r4,(CFG_TB_TICKS_PER_SEC + 4)(r3)
>  	lwz	r3,CFG_TB_TICKS_PER_SEC(r3)
>  	mtlr	r12
> diff --git a/arch/powerpc/kernel/vdso32/datapage.h
> b/arch/powerpc/kernel/vdso32/datapage.h
> new file mode 100644
> index 000000000000..ad96256be090
> --- /dev/null
> +++ b/arch/powerpc/kernel/vdso32/datapage.h
> @@ -0,0 +1,12 @@
> +/* SPDX-License-Identifier: GPL-2.0-or-later */
> +
> +.macro get_datapage ptr, tmp
> +	bcl	20,31,888f
> +888:
> +	mflr	\ptr
> +	addi	\ptr, \ptr, __kernel_datapage_offset - 888b
> +	lwz	\tmp, 0(\ptr)
> +	add	\ptr, \tmp, \ptr
> +.endm
> +
> +
> diff --git a/arch/powerpc/kernel/vdso32/gettimeofday.S
> b/arch/powerpc/kernel/vdso32/gettimeofday.S
> index e10098cde89c..91a58f01dcd5 100644
> --- a/arch/powerpc/kernel/vdso32/gettimeofday.S
> +++ b/arch/powerpc/kernel/vdso32/gettimeofday.S
> @@ -12,6 +12,8 @@
>  #include <asm/asm-offsets.h>
>  #include <asm/unistd.h>
>  
> +#include "datapage.h"
> +
>  /* Offset for the low 32-bit part of a field of long type */
>  #ifdef CONFIG_PPC64
>  #define LOPART	4
> @@ -35,8 +37,7 @@ V_FUNCTION_BEGIN(__kernel_gettimeofday)
>  
>  	mr	r10,r3			/* r10 saves tv */
>  	mr	r11,r4			/* r11 saves tz */
> -	bl	__get_datapage@local	/* get data page */
> -	mr	r9, r3			/* datapage ptr in r9 */
> +	get_datapage	r9, r0
>  	cmplwi	r10,0			/* check if tv is NULL */
>  	beq	3f
>  	lis	r7,1000000@ha		/* load up USEC_PER_SEC */
> @@ -82,8 +83,7 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime)
>  	mflr	r12			/* r12 saves lr */
>    .cfi_register lr,r12
>  	mr	r11,r4			/* r11 saves tp */
> -	bl	__get_datapage@local	/* get data page */
> -	mr	r9,r3			/* datapage ptr in r9 */
> +	get_datapage	r9, r0
>  	lis	r7,NSEC_PER_SEC@h	/* want nanoseconds */
>  	ori	r7,r7,NSEC_PER_SEC@l
>  	beq	cr5,70f
> @@ -235,8 +235,7 @@ V_FUNCTION_BEGIN(__kernel_time)
>    .cfi_register lr,r12
>  
>  	mr	r11,r3			/* r11 holds t */
> -	bl	__get_datapage@local
> -	mr	r9, r3			/* datapage ptr in r9 */
> +	get_datapage	r9, r0
>  
>  	lwz	r3,STAMP_XTIME+TSPEC_TV_SEC(r9)
>  


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH] powerpc/vdso64: inline __get_datapage()
  2019-08-21  4:37   ` Benjamin Herrenschmidt
  (?)
@ 2019-08-21  9:29   ` Santosh Sivaraj
  2019-08-21  9:46     ` Christophe Leroy
  2019-08-21 11:44     ` Segher Boessenkool
  -1 siblings, 2 replies; 13+ messages in thread
From: Santosh Sivaraj @ 2019-08-21  9:29 UTC (permalink / raw)
  To: Benjamin Herrenschmidt; +Cc: linuxppc-dev

__get_datapage() is only a few instructions to retrieve the
address of the page where the kernel stores data to the VDSO.

By inlining this function into its users, a bl/blr pair and
a mflr/mtlr pair is avoided, plus a few reg moves.

clock-gettime-monotonic: syscall: 514 nsec/call  396 nsec/call
clock-gettime-monotonic:    libc: 25 nsec/call   24 nsec/call
clock-gettime-monotonic:    vdso: 20 nsec/call   20 nsec/call
clock-getres-monotonic: syscall: 347 nsec/call   372 nsec/call
clock-getres-monotonic:    libc: 19 nsec/call    19 nsec/call
clock-getres-monotonic:    vdso: 10 nsec/call    10 nsec/call
clock-gettime-monotonic-coarse: syscall: 511 nsec/call   396 nsec/call
clock-gettime-monotonic-coarse:    libc: 23 nsec/call    21 nsec/call
clock-gettime-monotonic-coarse:    vdso: 15 nsec/call    13 nsec/call
clock-gettime-realtime: syscall: 526 nsec/call   405 nsec/call
clock-gettime-realtime:    libc: 24 nsec/call    23 nsec/call
clock-gettime-realtime:    vdso: 18 nsec/call    18 nsec/call
clock-getres-realtime: syscall: 342 nsec/call    372 nsec/call
clock-getres-realtime:    libc: 19 nsec/call     19 nsec/call
clock-getres-realtime:    vdso: 10 nsec/call     10 nsec/call
clock-gettime-realtime-coarse: syscall: 515 nsec/call    373 nsec/call
clock-gettime-realtime-coarse:    libc: 23 nsec/call     22 nsec/call
clock-gettime-realtime-coarse:    vdso: 14 nsec/call     13 nsec/call

Based on the patch by Christophe Leroy <christophe.leroy@c-s.fr> for vdso32.

Signed-off-by: Santosh Sivaraj <santosh@fossix.org>
---

except for a couple of calls (1 or 2 nsec reduction), there are no
improvements in the call times. Or is 10 nsec the minimum granularity??

So I don't know if its even worth updating vdso64 except to keep vdso32 and
vdso64 equal.


 arch/powerpc/kernel/vdso64/cacheflush.S   | 10 ++++----
 arch/powerpc/kernel/vdso64/datapage.S     | 29 ++++-------------------
 arch/powerpc/kernel/vdso64/datapage.h     | 10 ++++++++
 arch/powerpc/kernel/vdso64/gettimeofday.S |  8 ++++---
 4 files changed, 24 insertions(+), 33 deletions(-)
 create mode 100644 arch/powerpc/kernel/vdso64/datapage.h

diff --git a/arch/powerpc/kernel/vdso64/cacheflush.S b/arch/powerpc/kernel/vdso64/cacheflush.S
index 3f92561a64c4..30e8b0d29bea 100644
--- a/arch/powerpc/kernel/vdso64/cacheflush.S
+++ b/arch/powerpc/kernel/vdso64/cacheflush.S
@@ -10,6 +10,8 @@
 #include <asm/vdso.h>
 #include <asm/asm-offsets.h>
 
+#include "datapage.h"
+
 	.text
 
 /*
@@ -24,14 +26,12 @@ V_FUNCTION_BEGIN(__kernel_sync_dicache)
   .cfi_startproc
 	mflr	r12
   .cfi_register lr,r12
-	mr	r11,r3
-	bl	V_LOCAL_FUNC(__get_datapage)
+	get_datapage	r11, r0
 	mtlr	r12
-	mr	r10,r3
 
 	lwz	r7,CFG_DCACHE_BLOCKSZ(r10)
 	addi	r5,r7,-1
-	andc	r6,r11,r5		/* round low to line bdy */
+	andc	r6,r3,r5		/* round low to line bdy */
 	subf	r8,r6,r4		/* compute length */
 	add	r8,r8,r5		/* ensure we get enough */
 	lwz	r9,CFG_DCACHE_LOGBLOCKSZ(r10)
@@ -48,7 +48,7 @@ V_FUNCTION_BEGIN(__kernel_sync_dicache)
 
 	lwz	r7,CFG_ICACHE_BLOCKSZ(r10)
 	addi	r5,r7,-1
-	andc	r6,r11,r5		/* round low to line bdy */
+	andc	r6,r3,r5		/* round low to line bdy */
 	subf	r8,r6,r4		/* compute length */
 	add	r8,r8,r5
 	lwz	r9,CFG_ICACHE_LOGBLOCKSZ(r10)
diff --git a/arch/powerpc/kernel/vdso64/datapage.S b/arch/powerpc/kernel/vdso64/datapage.S
index dc84f5ae3802..8712f57c931c 100644
--- a/arch/powerpc/kernel/vdso64/datapage.S
+++ b/arch/powerpc/kernel/vdso64/datapage.S
@@ -11,34 +11,13 @@
 #include <asm/unistd.h>
 #include <asm/vdso.h>
 
+#include "datapage.h"
+
 	.text
 .global	__kernel_datapage_offset;
 __kernel_datapage_offset:
 	.long	0
 
-V_FUNCTION_BEGIN(__get_datapage)
-  .cfi_startproc
-	/* We don't want that exposed or overridable as we want other objects
-	 * to be able to bl directly to here
-	 */
-	.protected __get_datapage
-	.hidden __get_datapage
-
-	mflr	r0
-  .cfi_register lr,r0
-
-	bcl	20,31,data_page_branch
-data_page_branch:
-	mflr	r3
-	mtlr	r0
-	addi	r3, r3, __kernel_datapage_offset-data_page_branch
-	lwz	r0,0(r3)
-  .cfi_restore lr
-	add	r3,r0,r3
-	blr
-  .cfi_endproc
-V_FUNCTION_END(__get_datapage)
-
 /*
  * void *__kernel_get_syscall_map(unsigned int *syscall_count) ;
  *
@@ -53,7 +32,7 @@ V_FUNCTION_BEGIN(__kernel_get_syscall_map)
 	mflr	r12
   .cfi_register lr,r12
 	mr	r4,r3
-	bl	V_LOCAL_FUNC(__get_datapage)
+	get_datapage	r3, r0
 	mtlr	r12
 	addi	r3,r3,CFG_SYSCALL_MAP64
 	cmpldi	cr0,r4,0
@@ -75,7 +54,7 @@ V_FUNCTION_BEGIN(__kernel_get_tbfreq)
   .cfi_startproc
 	mflr	r12
   .cfi_register lr,r12
-	bl	V_LOCAL_FUNC(__get_datapage)
+	get_datapage	r3, r0
 	ld	r3,CFG_TB_TICKS_PER_SEC(r3)
 	mtlr	r12
 	crclr	cr0*4+so
diff --git a/arch/powerpc/kernel/vdso64/datapage.h b/arch/powerpc/kernel/vdso64/datapage.h
new file mode 100644
index 000000000000..f2f0da0f65f3
--- /dev/null
+++ b/arch/powerpc/kernel/vdso64/datapage.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+.macro get_datapage ptr, tmp
+	bcl	20,31,888f
+888:
+	mflr	\ptr
+	addi	\ptr, \ptr, __kernel_datapage_offset - 888b
+	lwz	\tmp, 0(\ptr)
+	add	\ptr, \tmp, \ptr
+.endm
diff --git a/arch/powerpc/kernel/vdso64/gettimeofday.S b/arch/powerpc/kernel/vdso64/gettimeofday.S
index 07bfe33fe874..7bcc879392cc 100644
--- a/arch/powerpc/kernel/vdso64/gettimeofday.S
+++ b/arch/powerpc/kernel/vdso64/gettimeofday.S
@@ -12,6 +12,8 @@
 #include <asm/asm-offsets.h>
 #include <asm/unistd.h>
 
+#include "datapage.h"
+
 	.text
 /*
  * Exact prototype of gettimeofday
@@ -26,7 +28,7 @@ V_FUNCTION_BEGIN(__kernel_gettimeofday)
 
 	mr	r11,r3			/* r11 holds tv */
 	mr	r10,r4			/* r10 holds tz */
-	bl	V_LOCAL_FUNC(__get_datapage)	/* get data page */
+	get_datapage	r3, r0
 	cmpldi	r11,0			/* check if tv is NULL */
 	beq	2f
 	lis	r7,1000000@ha		/* load up USEC_PER_SEC */
@@ -71,7 +73,7 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime)
 	mflr	r12			/* r12 saves lr */
   .cfi_register lr,r12
 	mr	r11,r4			/* r11 saves tp */
-	bl	V_LOCAL_FUNC(__get_datapage)	/* get data page */
+	get_datapage	r3, r0		/* get data page */
 	lis	r7,NSEC_PER_SEC@h	/* want nanoseconds */
 	ori	r7,r7,NSEC_PER_SEC@l
 	beq	cr5,70f
@@ -218,7 +220,7 @@ V_FUNCTION_BEGIN(__kernel_time)
   .cfi_register lr,r12
 
 	mr	r11,r3			/* r11 holds t */
-	bl	V_LOCAL_FUNC(__get_datapage)
+	get_datapage	r3, r0
 
 	ld	r4,STAMP_XTIME+TSPC64_TV_SEC(r3)
 
-- 
2.21.0


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH] powerpc/vdso64: inline __get_datapage()
  2019-08-21  9:29   ` [PATCH] powerpc/vdso64: " Santosh Sivaraj
@ 2019-08-21  9:46     ` Christophe Leroy
  2019-08-21 11:20       ` Santosh Sivaraj
  2019-08-21 15:58       ` Nathan Lynch
  2019-08-21 11:44     ` Segher Boessenkool
  1 sibling, 2 replies; 13+ messages in thread
From: Christophe Leroy @ 2019-08-21  9:46 UTC (permalink / raw)
  To: Santosh Sivaraj, Benjamin Herrenschmidt; +Cc: linuxppc-dev



Le 21/08/2019 à 11:29, Santosh Sivaraj a écrit :
> __get_datapage() is only a few instructions to retrieve the
> address of the page where the kernel stores data to the VDSO.
> 
> By inlining this function into its users, a bl/blr pair and
> a mflr/mtlr pair is avoided, plus a few reg moves.
> 
> clock-gettime-monotonic: syscall: 514 nsec/call  396 nsec/call
> clock-gettime-monotonic:    libc: 25 nsec/call   24 nsec/call
> clock-gettime-monotonic:    vdso: 20 nsec/call   20 nsec/call
> clock-getres-monotonic: syscall: 347 nsec/call   372 nsec/call
> clock-getres-monotonic:    libc: 19 nsec/call    19 nsec/call
> clock-getres-monotonic:    vdso: 10 nsec/call    10 nsec/call
> clock-gettime-monotonic-coarse: syscall: 511 nsec/call   396 nsec/call
> clock-gettime-monotonic-coarse:    libc: 23 nsec/call    21 nsec/call
> clock-gettime-monotonic-coarse:    vdso: 15 nsec/call    13 nsec/call
> clock-gettime-realtime: syscall: 526 nsec/call   405 nsec/call
> clock-gettime-realtime:    libc: 24 nsec/call    23 nsec/call
> clock-gettime-realtime:    vdso: 18 nsec/call    18 nsec/call
> clock-getres-realtime: syscall: 342 nsec/call    372 nsec/call
> clock-getres-realtime:    libc: 19 nsec/call     19 nsec/call
> clock-getres-realtime:    vdso: 10 nsec/call     10 nsec/call
> clock-gettime-realtime-coarse: syscall: 515 nsec/call    373 nsec/call
> clock-gettime-realtime-coarse:    libc: 23 nsec/call     22 nsec/call
> clock-gettime-realtime-coarse:    vdso: 14 nsec/call     13 nsec/call

I think you should only put the measurements on vdso calls, and only the 
ones that are impacted by the change. For exemple, getres function 
doesn't use __get_datapage so showing it here is pointless.

gettimeofday should be shown there as it uses __get_datapage()


> 
> Based on the patch by Christophe Leroy <christophe.leroy@c-s.fr> for vdso32.
> 
> Signed-off-by: Santosh Sivaraj <santosh@fossix.org>
> ---
> 
> except for a couple of calls (1 or 2 nsec reduction), there are no
> improvements in the call times. Or is 10 nsec the minimum granularity??

Maybe the ones that show no improvements are the ones that don't use 
__get_datapage() at all ...

> 
> So I don't know if its even worth updating vdso64 except to keep vdso32 and
> vdso64 equal.

2ns on a 15ns call is 13% so it is worth it I think.

Christophe


> 
> 
>   arch/powerpc/kernel/vdso64/cacheflush.S   | 10 ++++----
>   arch/powerpc/kernel/vdso64/datapage.S     | 29 ++++-------------------
>   arch/powerpc/kernel/vdso64/datapage.h     | 10 ++++++++
>   arch/powerpc/kernel/vdso64/gettimeofday.S |  8 ++++---
>   4 files changed, 24 insertions(+), 33 deletions(-)
>   create mode 100644 arch/powerpc/kernel/vdso64/datapage.h
> 
> diff --git a/arch/powerpc/kernel/vdso64/cacheflush.S b/arch/powerpc/kernel/vdso64/cacheflush.S
> index 3f92561a64c4..30e8b0d29bea 100644
> --- a/arch/powerpc/kernel/vdso64/cacheflush.S
> +++ b/arch/powerpc/kernel/vdso64/cacheflush.S
> @@ -10,6 +10,8 @@
>   #include <asm/vdso.h>
>   #include <asm/asm-offsets.h>
>   
> +#include "datapage.h"
> +
>   	.text
>   
>   /*
> @@ -24,14 +26,12 @@ V_FUNCTION_BEGIN(__kernel_sync_dicache)
>     .cfi_startproc
>   	mflr	r12
>     .cfi_register lr,r12
> -	mr	r11,r3
> -	bl	V_LOCAL_FUNC(__get_datapage)
> +	get_datapage	r11, r0
>   	mtlr	r12
> -	mr	r10,r3
>   
>   	lwz	r7,CFG_DCACHE_BLOCKSZ(r10)
>   	addi	r5,r7,-1
> -	andc	r6,r11,r5		/* round low to line bdy */
> +	andc	r6,r3,r5		/* round low to line bdy */
>   	subf	r8,r6,r4		/* compute length */
>   	add	r8,r8,r5		/* ensure we get enough */
>   	lwz	r9,CFG_DCACHE_LOGBLOCKSZ(r10)
> @@ -48,7 +48,7 @@ V_FUNCTION_BEGIN(__kernel_sync_dicache)
>   
>   	lwz	r7,CFG_ICACHE_BLOCKSZ(r10)
>   	addi	r5,r7,-1
> -	andc	r6,r11,r5		/* round low to line bdy */
> +	andc	r6,r3,r5		/* round low to line bdy */
>   	subf	r8,r6,r4		/* compute length */
>   	add	r8,r8,r5
>   	lwz	r9,CFG_ICACHE_LOGBLOCKSZ(r10)
> diff --git a/arch/powerpc/kernel/vdso64/datapage.S b/arch/powerpc/kernel/vdso64/datapage.S
> index dc84f5ae3802..8712f57c931c 100644
> --- a/arch/powerpc/kernel/vdso64/datapage.S
> +++ b/arch/powerpc/kernel/vdso64/datapage.S
> @@ -11,34 +11,13 @@
>   #include <asm/unistd.h>
>   #include <asm/vdso.h>
>   
> +#include "datapage.h"
> +
>   	.text
>   .global	__kernel_datapage_offset;
>   __kernel_datapage_offset:
>   	.long	0
>   
> -V_FUNCTION_BEGIN(__get_datapage)
> -  .cfi_startproc
> -	/* We don't want that exposed or overridable as we want other objects
> -	 * to be able to bl directly to here
> -	 */
> -	.protected __get_datapage
> -	.hidden __get_datapage
> -
> -	mflr	r0
> -  .cfi_register lr,r0
> -
> -	bcl	20,31,data_page_branch
> -data_page_branch:
> -	mflr	r3
> -	mtlr	r0
> -	addi	r3, r3, __kernel_datapage_offset-data_page_branch
> -	lwz	r0,0(r3)
> -  .cfi_restore lr
> -	add	r3,r0,r3
> -	blr
> -  .cfi_endproc
> -V_FUNCTION_END(__get_datapage)
> -
>   /*
>    * void *__kernel_get_syscall_map(unsigned int *syscall_count) ;
>    *
> @@ -53,7 +32,7 @@ V_FUNCTION_BEGIN(__kernel_get_syscall_map)
>   	mflr	r12
>     .cfi_register lr,r12
>   	mr	r4,r3
> -	bl	V_LOCAL_FUNC(__get_datapage)
> +	get_datapage	r3, r0
>   	mtlr	r12
>   	addi	r3,r3,CFG_SYSCALL_MAP64
>   	cmpldi	cr0,r4,0
> @@ -75,7 +54,7 @@ V_FUNCTION_BEGIN(__kernel_get_tbfreq)
>     .cfi_startproc
>   	mflr	r12
>     .cfi_register lr,r12
> -	bl	V_LOCAL_FUNC(__get_datapage)
> +	get_datapage	r3, r0
>   	ld	r3,CFG_TB_TICKS_PER_SEC(r3)
>   	mtlr	r12
>   	crclr	cr0*4+so
> diff --git a/arch/powerpc/kernel/vdso64/datapage.h b/arch/powerpc/kernel/vdso64/datapage.h
> new file mode 100644
> index 000000000000..f2f0da0f65f3
> --- /dev/null
> +++ b/arch/powerpc/kernel/vdso64/datapage.h
> @@ -0,0 +1,10 @@
> +/* SPDX-License-Identifier: GPL-2.0-or-later */
> +
> +.macro get_datapage ptr, tmp
> +	bcl	20,31,888f
> +888:
> +	mflr	\ptr
> +	addi	\ptr, \ptr, __kernel_datapage_offset - 888b
> +	lwz	\tmp, 0(\ptr)
> +	add	\ptr, \tmp, \ptr
> +.endm
> diff --git a/arch/powerpc/kernel/vdso64/gettimeofday.S b/arch/powerpc/kernel/vdso64/gettimeofday.S
> index 07bfe33fe874..7bcc879392cc 100644
> --- a/arch/powerpc/kernel/vdso64/gettimeofday.S
> +++ b/arch/powerpc/kernel/vdso64/gettimeofday.S
> @@ -12,6 +12,8 @@
>   #include <asm/asm-offsets.h>
>   #include <asm/unistd.h>
>   
> +#include "datapage.h"
> +
>   	.text
>   /*
>    * Exact prototype of gettimeofday
> @@ -26,7 +28,7 @@ V_FUNCTION_BEGIN(__kernel_gettimeofday)
>   
>   	mr	r11,r3			/* r11 holds tv */
>   	mr	r10,r4			/* r10 holds tz */
> -	bl	V_LOCAL_FUNC(__get_datapage)	/* get data page */
> +	get_datapage	r3, r0
>   	cmpldi	r11,0			/* check if tv is NULL */
>   	beq	2f
>   	lis	r7,1000000@ha		/* load up USEC_PER_SEC */
> @@ -71,7 +73,7 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime)
>   	mflr	r12			/* r12 saves lr */
>     .cfi_register lr,r12
>   	mr	r11,r4			/* r11 saves tp */
> -	bl	V_LOCAL_FUNC(__get_datapage)	/* get data page */
> +	get_datapage	r3, r0		/* get data page */
>   	lis	r7,NSEC_PER_SEC@h	/* want nanoseconds */
>   	ori	r7,r7,NSEC_PER_SEC@l
>   	beq	cr5,70f
> @@ -218,7 +220,7 @@ V_FUNCTION_BEGIN(__kernel_time)
>     .cfi_register lr,r12
>   
>   	mr	r11,r3			/* r11 holds t */
> -	bl	V_LOCAL_FUNC(__get_datapage)
> +	get_datapage	r3, r0
>   
>   	ld	r4,STAMP_XTIME+TSPC64_TV_SEC(r3)
>   
> 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] powerpc/vdso64: inline __get_datapage()
  2019-08-21  9:46     ` Christophe Leroy
@ 2019-08-21 11:20       ` Santosh Sivaraj
  2019-08-21 15:58       ` Nathan Lynch
  1 sibling, 0 replies; 13+ messages in thread
From: Santosh Sivaraj @ 2019-08-21 11:20 UTC (permalink / raw)
  To: Christophe Leroy, Benjamin Herrenschmidt; +Cc: linuxppc-dev

Christophe Leroy <christophe.leroy@c-s.fr> writes:

> Le 21/08/2019 à 11:29, Santosh Sivaraj a écrit :
>> __get_datapage() is only a few instructions to retrieve the
>> address of the page where the kernel stores data to the VDSO.
>> 
>> By inlining this function into its users, a bl/blr pair and
>> a mflr/mtlr pair is avoided, plus a few reg moves.
>> 
>> clock-gettime-monotonic: syscall: 514 nsec/call  396 nsec/call
>> clock-gettime-monotonic:    libc: 25 nsec/call   24 nsec/call
>> clock-gettime-monotonic:    vdso: 20 nsec/call   20 nsec/call
>> clock-getres-monotonic: syscall: 347 nsec/call   372 nsec/call
>> clock-getres-monotonic:    libc: 19 nsec/call    19 nsec/call
>> clock-getres-monotonic:    vdso: 10 nsec/call    10 nsec/call
>> clock-gettime-monotonic-coarse: syscall: 511 nsec/call   396 nsec/call
>> clock-gettime-monotonic-coarse:    libc: 23 nsec/call    21 nsec/call
>> clock-gettime-monotonic-coarse:    vdso: 15 nsec/call    13 nsec/call
>> clock-gettime-realtime: syscall: 526 nsec/call   405 nsec/call
>> clock-gettime-realtime:    libc: 24 nsec/call    23 nsec/call
>> clock-gettime-realtime:    vdso: 18 nsec/call    18 nsec/call
>> clock-getres-realtime: syscall: 342 nsec/call    372 nsec/call
>> clock-getres-realtime:    libc: 19 nsec/call     19 nsec/call
>> clock-getres-realtime:    vdso: 10 nsec/call     10 nsec/call
>> clock-gettime-realtime-coarse: syscall: 515 nsec/call    373 nsec/call
>> clock-gettime-realtime-coarse:    libc: 23 nsec/call     22 nsec/call
>> clock-gettime-realtime-coarse:    vdso: 14 nsec/call     13 nsec/call
>
> I think you should only put the measurements on vdso calls, and only the 
> ones that are impacted by the change. For exemple, getres function 
> doesn't use __get_datapage so showing it here is pointless.
>
> gettimeofday should be shown there as it uses __get_datapage()
>
>
>> 
>> Based on the patch by Christophe Leroy <christophe.leroy@c-s.fr> for vdso32.
>> 
>> Signed-off-by: Santosh Sivaraj <santosh@fossix.org>
>> ---
>> 
>> except for a couple of calls (1 or 2 nsec reduction), there are no
>> improvements in the call times. Or is 10 nsec the minimum granularity??
>
> Maybe the ones that show no improvements are the ones that don't use 
> __get_datapage() at all ...

Yes makes sense.

>
>> 
>> So I don't know if its even worth updating vdso64 except to keep vdso32 and
>> vdso64 equal.
>
> 2ns on a 15ns call is 13% so it is worth it I think.

true. Since datapage.h is the same for both 32 and 64, may be we should put
it in include/asm.

Thanks,
Santosh
>
> Christophe
>
>
>> 
>> 
>>   arch/powerpc/kernel/vdso64/cacheflush.S   | 10 ++++----
>>   arch/powerpc/kernel/vdso64/datapage.S     | 29 ++++-------------------
>>   arch/powerpc/kernel/vdso64/datapage.h     | 10 ++++++++
>>   arch/powerpc/kernel/vdso64/gettimeofday.S |  8 ++++---
>>   4 files changed, 24 insertions(+), 33 deletions(-)
>>   create mode 100644 arch/powerpc/kernel/vdso64/datapage.h
>> 
>> diff --git a/arch/powerpc/kernel/vdso64/cacheflush.S b/arch/powerpc/kernel/vdso64/cacheflush.S
>> index 3f92561a64c4..30e8b0d29bea 100644
>> --- a/arch/powerpc/kernel/vdso64/cacheflush.S
>> +++ b/arch/powerpc/kernel/vdso64/cacheflush.S
>> @@ -10,6 +10,8 @@
>>   #include <asm/vdso.h>
>>   #include <asm/asm-offsets.h>
>>   
>> +#include "datapage.h"
>> +
>>   	.text
>>   
>>   /*
>> @@ -24,14 +26,12 @@ V_FUNCTION_BEGIN(__kernel_sync_dicache)
>>     .cfi_startproc
>>   	mflr	r12
>>     .cfi_register lr,r12
>> -	mr	r11,r3
>> -	bl	V_LOCAL_FUNC(__get_datapage)
>> +	get_datapage	r11, r0
>>   	mtlr	r12
>> -	mr	r10,r3
>>   
>>   	lwz	r7,CFG_DCACHE_BLOCKSZ(r10)
>>   	addi	r5,r7,-1
>> -	andc	r6,r11,r5		/* round low to line bdy */
>> +	andc	r6,r3,r5		/* round low to line bdy */
>>   	subf	r8,r6,r4		/* compute length */
>>   	add	r8,r8,r5		/* ensure we get enough */
>>   	lwz	r9,CFG_DCACHE_LOGBLOCKSZ(r10)
>> @@ -48,7 +48,7 @@ V_FUNCTION_BEGIN(__kernel_sync_dicache)
>>   
>>   	lwz	r7,CFG_ICACHE_BLOCKSZ(r10)
>>   	addi	r5,r7,-1
>> -	andc	r6,r11,r5		/* round low to line bdy */
>> +	andc	r6,r3,r5		/* round low to line bdy */
>>   	subf	r8,r6,r4		/* compute length */
>>   	add	r8,r8,r5
>>   	lwz	r9,CFG_ICACHE_LOGBLOCKSZ(r10)
>> diff --git a/arch/powerpc/kernel/vdso64/datapage.S b/arch/powerpc/kernel/vdso64/datapage.S
>> index dc84f5ae3802..8712f57c931c 100644
>> --- a/arch/powerpc/kernel/vdso64/datapage.S
>> +++ b/arch/powerpc/kernel/vdso64/datapage.S
>> @@ -11,34 +11,13 @@
>>   #include <asm/unistd.h>
>>   #include <asm/vdso.h>
>>   
>> +#include "datapage.h"
>> +
>>   	.text
>>   .global	__kernel_datapage_offset;
>>   __kernel_datapage_offset:
>>   	.long	0
>>   
>> -V_FUNCTION_BEGIN(__get_datapage)
>> -  .cfi_startproc
>> -	/* We don't want that exposed or overridable as we want other objects
>> -	 * to be able to bl directly to here
>> -	 */
>> -	.protected __get_datapage
>> -	.hidden __get_datapage
>> -
>> -	mflr	r0
>> -  .cfi_register lr,r0
>> -
>> -	bcl	20,31,data_page_branch
>> -data_page_branch:
>> -	mflr	r3
>> -	mtlr	r0
>> -	addi	r3, r3, __kernel_datapage_offset-data_page_branch
>> -	lwz	r0,0(r3)
>> -  .cfi_restore lr
>> -	add	r3,r0,r3
>> -	blr
>> -  .cfi_endproc
>> -V_FUNCTION_END(__get_datapage)
>> -
>>   /*
>>    * void *__kernel_get_syscall_map(unsigned int *syscall_count) ;
>>    *
>> @@ -53,7 +32,7 @@ V_FUNCTION_BEGIN(__kernel_get_syscall_map)
>>   	mflr	r12
>>     .cfi_register lr,r12
>>   	mr	r4,r3
>> -	bl	V_LOCAL_FUNC(__get_datapage)
>> +	get_datapage	r3, r0
>>   	mtlr	r12
>>   	addi	r3,r3,CFG_SYSCALL_MAP64
>>   	cmpldi	cr0,r4,0
>> @@ -75,7 +54,7 @@ V_FUNCTION_BEGIN(__kernel_get_tbfreq)
>>     .cfi_startproc
>>   	mflr	r12
>>     .cfi_register lr,r12
>> -	bl	V_LOCAL_FUNC(__get_datapage)
>> +	get_datapage	r3, r0
>>   	ld	r3,CFG_TB_TICKS_PER_SEC(r3)
>>   	mtlr	r12
>>   	crclr	cr0*4+so
>> diff --git a/arch/powerpc/kernel/vdso64/datapage.h b/arch/powerpc/kernel/vdso64/datapage.h
>> new file mode 100644
>> index 000000000000..f2f0da0f65f3
>> --- /dev/null
>> +++ b/arch/powerpc/kernel/vdso64/datapage.h
>> @@ -0,0 +1,10 @@
>> +/* SPDX-License-Identifier: GPL-2.0-or-later */
>> +
>> +.macro get_datapage ptr, tmp
>> +	bcl	20,31,888f
>> +888:
>> +	mflr	\ptr
>> +	addi	\ptr, \ptr, __kernel_datapage_offset - 888b
>> +	lwz	\tmp, 0(\ptr)
>> +	add	\ptr, \tmp, \ptr
>> +.endm
>> diff --git a/arch/powerpc/kernel/vdso64/gettimeofday.S b/arch/powerpc/kernel/vdso64/gettimeofday.S
>> index 07bfe33fe874..7bcc879392cc 100644
>> --- a/arch/powerpc/kernel/vdso64/gettimeofday.S
>> +++ b/arch/powerpc/kernel/vdso64/gettimeofday.S
>> @@ -12,6 +12,8 @@
>>   #include <asm/asm-offsets.h>
>>   #include <asm/unistd.h>
>>   
>> +#include "datapage.h"
>> +
>>   	.text
>>   /*
>>    * Exact prototype of gettimeofday
>> @@ -26,7 +28,7 @@ V_FUNCTION_BEGIN(__kernel_gettimeofday)
>>   
>>   	mr	r11,r3			/* r11 holds tv */
>>   	mr	r10,r4			/* r10 holds tz */
>> -	bl	V_LOCAL_FUNC(__get_datapage)	/* get data page */
>> +	get_datapage	r3, r0
>>   	cmpldi	r11,0			/* check if tv is NULL */
>>   	beq	2f
>>   	lis	r7,1000000@ha		/* load up USEC_PER_SEC */
>> @@ -71,7 +73,7 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime)
>>   	mflr	r12			/* r12 saves lr */
>>     .cfi_register lr,r12
>>   	mr	r11,r4			/* r11 saves tp */
>> -	bl	V_LOCAL_FUNC(__get_datapage)	/* get data page */
>> +	get_datapage	r3, r0		/* get data page */
>>   	lis	r7,NSEC_PER_SEC@h	/* want nanoseconds */
>>   	ori	r7,r7,NSEC_PER_SEC@l
>>   	beq	cr5,70f
>> @@ -218,7 +220,7 @@ V_FUNCTION_BEGIN(__kernel_time)
>>     .cfi_register lr,r12
>>   
>>   	mr	r11,r3			/* r11 holds t */
>> -	bl	V_LOCAL_FUNC(__get_datapage)
>> +	get_datapage	r3, r0
>>   
>>   	ld	r4,STAMP_XTIME+TSPC64_TV_SEC(r3)
>>   
>> 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] powerpc/vdso64: inline __get_datapage()
  2019-08-21  9:29   ` [PATCH] powerpc/vdso64: " Santosh Sivaraj
  2019-08-21  9:46     ` Christophe Leroy
@ 2019-08-21 11:44     ` Segher Boessenkool
  2019-08-21 11:50       ` Christophe Leroy
  1 sibling, 1 reply; 13+ messages in thread
From: Segher Boessenkool @ 2019-08-21 11:44 UTC (permalink / raw)
  To: Santosh Sivaraj; +Cc: linuxppc-dev

Hi!

On Wed, Aug 21, 2019 at 02:59:59PM +0530, Santosh Sivaraj wrote:
> except for a couple of calls (1 or 2 nsec reduction), there are no
> improvements in the call times. Or is 10 nsec the minimum granularity??
> 
> So I don't know if its even worth updating vdso64 except to keep vdso32 and
> vdso64 equal.

Calls are cheap, in principle...  It is the LR stuff that can make it
slower on some cores, and a lot of calling sequence stuff may have
considerable overhead of course.

> +.macro get_datapage ptr, tmp
> +	bcl	20,31,888f
> +888:
> +	mflr	\ptr
> +	addi	\ptr, \ptr, __kernel_datapage_offset - 888b
> +	lwz	\tmp, 0(\ptr)
> +	add	\ptr, \tmp, \ptr
> +.endm

(You can just write that as
	bcl 20,31,$+4
	mflr \ptr
etc.  Useless labels are useless :-) )

One thing you might want to do to improve performance is to do this without
the bcl etc., because you cannot really hide the LR latency of that.  But
that isn't very many ns either...  Superscalar helps, OoO helps, but it is
mostly just that >100MHz helps ;-)


Segher

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] powerpc/vdso64: inline __get_datapage()
  2019-08-21 11:44     ` Segher Boessenkool
@ 2019-08-21 11:50       ` Christophe Leroy
  2019-08-21 12:15         ` Segher Boessenkool
  0 siblings, 1 reply; 13+ messages in thread
From: Christophe Leroy @ 2019-08-21 11:50 UTC (permalink / raw)
  To: Segher Boessenkool, Santosh Sivaraj; +Cc: linuxppc-dev



Le 21/08/2019 à 13:44, Segher Boessenkool a écrit :
> Hi!
> 
> On Wed, Aug 21, 2019 at 02:59:59PM +0530, Santosh Sivaraj wrote:
>> except for a couple of calls (1 or 2 nsec reduction), there are no
>> improvements in the call times. Or is 10 nsec the minimum granularity??
>>
>> So I don't know if its even worth updating vdso64 except to keep vdso32 and
>> vdso64 equal.
> 
> Calls are cheap, in principle...  It is the LR stuff that can make it
> slower on some cores, and a lot of calling sequence stuff may have
> considerable overhead of course.

On an 8xx, a taken branch is 2 cycles and a non taken branch in 1 cycle 
(+ the refetch if that was not the anticipate branch).

> 
>> +.macro get_datapage ptr, tmp
>> +	bcl	20,31,888f
>> +888:
>> +	mflr	\ptr
>> +	addi	\ptr, \ptr, __kernel_datapage_offset - 888b
>> +	lwz	\tmp, 0(\ptr)
>> +	add	\ptr, \tmp, \ptr
>> +.endm
> 
> (You can just write that as
> 	bcl 20,31,$+4
> 	mflr \ptr
> etc.  Useless labels are useless :-) )

Nice trick. Will use that.

> 
> One thing you might want to do to improve performance is to do this without
> the bcl etc., because you cannot really hide the LR latency of that.  But
> that isn't very many ns either...  Superscalar helps, OoO helps, but it is
> mostly just that >100MHz helps ;-)

Good idea. Did you have a look at my vdso32 similar patch ? 
https://patchwork.ozlabs.org/patch/1148274/

Do you have any idea on how to avoid that bcl/mflr stuff ?

Christophe

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] powerpc/vdso64: inline __get_datapage()
  2019-08-21 11:50       ` Christophe Leroy
@ 2019-08-21 12:15         ` Segher Boessenkool
  2019-08-21 16:23           ` Christophe Leroy
  0 siblings, 1 reply; 13+ messages in thread
From: Segher Boessenkool @ 2019-08-21 12:15 UTC (permalink / raw)
  To: Christophe Leroy; +Cc: Santosh Sivaraj, linuxppc-dev

On Wed, Aug 21, 2019 at 01:50:52PM +0200, Christophe Leroy wrote:
> Le 21/08/2019 à 13:44, Segher Boessenkool a écrit :
> >Calls are cheap, in principle...  It is the LR stuff that can make it
> >slower on some cores, and a lot of calling sequence stuff may have
> >considerable overhead of course.
> 
> On an 8xx, a taken branch is 2 cycles and a non taken branch in 1 cycle 
> (+ the refetch if that was not the anticipate branch).

Yup.  And on the big cores they are all 0 cycles, if correctly predicted.
(Taken branches end your fetch group, of course, there are small
inefficiencies everywhere, but that's about the gist of it).

> >>+.macro get_datapage ptr, tmp
> >>+	bcl	20,31,888f
> >>+888:
> >>+	mflr	\ptr
> >>+	addi	\ptr, \ptr, __kernel_datapage_offset - 888b
> >>+	lwz	\tmp, 0(\ptr)
> >>+	add	\ptr, \tmp, \ptr
> >>+.endm
> >
> >(You can just write that as
> >	bcl 20,31,$+4
> >	mflr \ptr
> >etc.  Useless labels are useless :-) )
> 
> Nice trick. Will use that.

Or .+4 if you like that syntax better...  It's all the same thing.

> >One thing you might want to do to improve performance is to do this without
> >the bcl etc., because you cannot really hide the LR latency of that.  But
> >that isn't very many ns either...  Superscalar helps, OoO helps, but it is
> >mostly just that >100MHz helps ;-)
> 
> Good idea. Did you have a look at my vdso32 similar patch ? 
> https://patchwork.ozlabs.org/patch/1148274/

Yes, I saw it.

> Do you have any idea on how to avoid that bcl/mflr stuff ?

Do a load from some fixed address?  Maybe an absolute address, even?
lwz r3,-12344(0)  or similar (that address is in kernel space...)

There aren't many options, and certainly not many *good* options!


Segher

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] powerpc/vdso64: inline __get_datapage()
  2019-08-21  9:46     ` Christophe Leroy
  2019-08-21 11:20       ` Santosh Sivaraj
@ 2019-08-21 15:58       ` Nathan Lynch
  1 sibling, 0 replies; 13+ messages in thread
From: Nathan Lynch @ 2019-08-21 15:58 UTC (permalink / raw)
  To: Christophe Leroy; +Cc: Santosh Sivaraj, linuxppc-dev

Christophe Leroy <christophe.leroy@c-s.fr> writes:
> Le 21/08/2019 à 11:29, Santosh Sivaraj a écrit :
>> __get_datapage() is only a few instructions to retrieve the
>> address of the page where the kernel stores data to the VDSO.
>> 
>> By inlining this function into its users, a bl/blr pair and
>> a mflr/mtlr pair is avoided, plus a few reg moves.
>> 
>> clock-gettime-monotonic: syscall: 514 nsec/call  396 nsec/call
>> clock-gettime-monotonic:    libc: 25 nsec/call   24 nsec/call
>> clock-gettime-monotonic:    vdso: 20 nsec/call   20 nsec/call
>> clock-getres-monotonic: syscall: 347 nsec/call   372 nsec/call
>> clock-getres-monotonic:    libc: 19 nsec/call    19 nsec/call
>> clock-getres-monotonic:    vdso: 10 nsec/call    10 nsec/call
>> clock-gettime-monotonic-coarse: syscall: 511 nsec/call   396 nsec/call
>> clock-gettime-monotonic-coarse:    libc: 23 nsec/call    21 nsec/call
>> clock-gettime-monotonic-coarse:    vdso: 15 nsec/call    13 nsec/call
>> clock-gettime-realtime: syscall: 526 nsec/call   405 nsec/call
>> clock-gettime-realtime:    libc: 24 nsec/call    23 nsec/call
>> clock-gettime-realtime:    vdso: 18 nsec/call    18 nsec/call
>> clock-getres-realtime: syscall: 342 nsec/call    372 nsec/call
>> clock-getres-realtime:    libc: 19 nsec/call     19 nsec/call
>> clock-getres-realtime:    vdso: 10 nsec/call     10 nsec/call
>> clock-gettime-realtime-coarse: syscall: 515 nsec/call    373 nsec/call
>> clock-gettime-realtime-coarse:    libc: 23 nsec/call     22 nsec/call
>> clock-gettime-realtime-coarse:    vdso: 14 nsec/call     13 nsec/call
>
> I think you should only put the measurements on vdso calls, and only the 
> ones that are impacted by the change. For exemple, getres function 
> doesn't use __get_datapage so showing it here is pointless.

I agree with this point, but also, I would caution against using
vdsotest's benchmark function for anything like rigorous performance
analysis. The intention was to roughly confirm the VDSO's relative
performance vs the in-kernel implementations. Not to compare one VDSO
implementation of (say) clock_gettime to another.

I suggest using perf to confirm the expected effects of the change, if
possible.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] powerpc/vdso64: inline __get_datapage()
  2019-08-21 12:15         ` Segher Boessenkool
@ 2019-08-21 16:23           ` Christophe Leroy
  2019-08-22 16:18             ` Santosh Sivaraj
  0 siblings, 1 reply; 13+ messages in thread
From: Christophe Leroy @ 2019-08-21 16:23 UTC (permalink / raw)
  To: Segher Boessenkool; +Cc: Santosh Sivaraj, linuxppc-dev



Le 21/08/2019 à 14:15, Segher Boessenkool a écrit :
> On Wed, Aug 21, 2019 at 01:50:52PM +0200, Christophe Leroy wrote:
>> Do you have any idea on how to avoid that bcl/mflr stuff ?
> 
> Do a load from some fixed address?  Maybe an absolute address, even?
> lwz r3,-12344(0)  or similar (that address is in kernel space...)
> 
> There aren't many options, and certainly not many *good* options!
> 

IIUC, the VDSO is seen by apps the same way as a dynamic lib. Couldn't 
the relocation be done only once when the app loads the VDSO as for a 
regular .so lib ?

It looks like it is what others do, at least x86 and arm64, unless I 
misunderstood their code.

Christophe

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] powerpc/vdso64: inline __get_datapage()
  2019-08-21 16:23           ` Christophe Leroy
@ 2019-08-22 16:18             ` Santosh Sivaraj
  0 siblings, 0 replies; 13+ messages in thread
From: Santosh Sivaraj @ 2019-08-22 16:18 UTC (permalink / raw)
  To: Christophe Leroy, Segher Boessenkool; +Cc: linuxppc-dev

Christophe Leroy <christophe.leroy@c-s.fr> writes:

> Le 21/08/2019 à 14:15, Segher Boessenkool a écrit :
>> On Wed, Aug 21, 2019 at 01:50:52PM +0200, Christophe Leroy wrote:
>>> Do you have any idea on how to avoid that bcl/mflr stuff ?
>> 
>> Do a load from some fixed address?  Maybe an absolute address, even?
>> lwz r3,-12344(0)  or similar (that address is in kernel space...)
>> 
>> There aren't many options, and certainly not many *good* options!
>> 
>
> IIUC, the VDSO is seen by apps the same way as a dynamic lib. Couldn't 
> the relocation be done only once when the app loads the VDSO as for a 
> regular .so lib ?

How does address space randomization work for .so libs?

>
> It looks like it is what others do, at least x86 and arm64, unless I 
> misunderstood their code.
>
> Christophe

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2019-08-22 16:21 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-08-16 14:48 [PATCH] powerpc/vdso32: inline __get_datapage() Christophe Leroy
2019-08-16 14:48 ` Christophe Leroy
2019-08-21  4:37 ` Benjamin Herrenschmidt
2019-08-21  4:37   ` Benjamin Herrenschmidt
2019-08-21  9:29   ` [PATCH] powerpc/vdso64: " Santosh Sivaraj
2019-08-21  9:46     ` Christophe Leroy
2019-08-21 11:20       ` Santosh Sivaraj
2019-08-21 15:58       ` Nathan Lynch
2019-08-21 11:44     ` Segher Boessenkool
2019-08-21 11:50       ` Christophe Leroy
2019-08-21 12:15         ` Segher Boessenkool
2019-08-21 16:23           ` Christophe Leroy
2019-08-22 16:18             ` Santosh Sivaraj

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.