All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] arm: Added support for getcpu() vDSO using TPIDRURW
@ 2016-10-04 13:49 Fredrik Markstrom
  2016-10-04 15:35   ` Fredrik Markstrom
  0 siblings, 1 reply; 27+ messages in thread
From: Fredrik Markstrom @ 2016-10-04 13:49 UTC (permalink / raw)
  To: linux-arm-kernel

This makes getcpu() ~1000 times faster, this is very useful when
implementing per-cpu buffers in userspace (to avoid cache line
bouncing). As an example lttng ust becomes ~30% faster.

The patch will break applications using TPIDRURW (which is context switched
since commit 4780adeefd042482f624f5e0d577bf9cdcbb760 ("ARM: 7735/2:
Preserve the user r/w register TPIDRURW on context switch and fork")) and
is therefore made configurable.

Signed-off-by: Fredrik Markstrom <fredrik.markstrom@gmail.com>
---
 arch/arm/include/asm/tls.h   |  8 +++++++-
 arch/arm/kernel/entry-armv.S |  1 -
 arch/arm/mm/Kconfig          | 10 ++++++++++
 arch/arm/vdso/Makefile       |  3 +++
 arch/arm/vdso/vdso.lds.S     |  3 +++
 5 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/arch/arm/include/asm/tls.h b/arch/arm/include/asm/tls.h
index 5f833f7..170fd76 100644
--- a/arch/arm/include/asm/tls.h
+++ b/arch/arm/include/asm/tls.h
@@ -10,10 +10,15 @@
 	.endm
 
 	.macro switch_tls_v6k, base, tp, tpuser, tmp1, tmp2
+#ifdef CONFIG_VDSO_GETCPU
+	ldr	\tpuser, [r2, #TI_CPU]
+#else
 	mrc	p15, 0, \tmp2, c13, c0, 2	@ get the user r/w register
+	ldr	\tpuser, [r2, #TI_TP_VALUE + 4]
+	str	\tmp2, [\base, #TI_TP_VALUE + 4] @ save it
+#endif
 	mcr	p15, 0, \tp, c13, c0, 3		@ set TLS register
 	mcr	p15, 0, \tpuser, c13, c0, 2	@ and the user r/w register
-	str	\tmp2, [\base, #TI_TP_VALUE + 4] @ save it
 	.endm
 
 	.macro switch_tls_v6, base, tp, tpuser, tmp1, tmp2
@@ -22,6 +27,7 @@
 	mov	\tmp2, #0xffff0fff
 	tst	\tmp1, #HWCAP_TLS		@ hardware TLS available?
 	streq	\tp, [\tmp2, #-15]		@ set TLS value at 0xffff0ff0
+	ldrne	\tpuser, [r2, #TI_TP_VALUE + 4] @ load the saved user r/w reg
 	mrcne	p15, 0, \tmp2, c13, c0, 2	@ get the user r/w register
 	mcrne	p15, 0, \tp, c13, c0, 3		@ yes, set TLS register
 	mcrne	p15, 0, \tpuser, c13, c0, 2	@ set user r/w register
diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
index 9f157e7..4e1369a 100644
--- a/arch/arm/kernel/entry-armv.S
+++ b/arch/arm/kernel/entry-armv.S
@@ -787,7 +787,6 @@ ENTRY(__switch_to)
  THUMB(	str	sp, [ip], #4		   )
  THUMB(	str	lr, [ip], #4		   )
 	ldr	r4, [r2, #TI_TP_VALUE]
-	ldr	r5, [r2, #TI_TP_VALUE + 4]
 #ifdef CONFIG_CPU_USE_DOMAINS
 	mrc	p15, 0, r6, c3, c0, 0		@ Get domain register
 	str	r6, [r1, #TI_CPU_DOMAIN]	@ Save old domain register
diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
index c1799dd..f18334a 100644
--- a/arch/arm/mm/Kconfig
+++ b/arch/arm/mm/Kconfig
@@ -854,6 +854,16 @@ config VDSO
 	  You must have glibc 2.22 or later for programs to seamlessly
 	  take advantage of this.
 
+config VDSO_GETCPU
+	bool "Enable VDSO for getcpu"
+	depends on VDSO && (CPU_V6K || CPU_V7 || CPU_V7M)
+	help
+	  Say Y to make getcpu a VDSO (fast) call. This is useful if you
+	  want to implement per cpu buffers to avoid cache line bouncing
+	  in user mode.
+	  This mechanism uses the TPIDRURW register so enabling it will break
 +	  applications using this register for their own purposes.
+
 config DMA_CACHE_RWFO
 	bool "Enable read/write for ownership DMA cache maintenance"
 	depends on CPU_V6K && SMP
diff --git a/arch/arm/vdso/Makefile b/arch/arm/vdso/Makefile
index 59a8fa7..9f1ec51 100644
--- a/arch/arm/vdso/Makefile
+++ b/arch/arm/vdso/Makefile
@@ -1,6 +1,9 @@
 hostprogs-y := vdsomunge
 
 obj-vdso := vgettimeofday.o datapage.o
+#ifeq ($(CONFIG_VDSO_GETCPU),y)
+obj-vdso += vgetcpu.o
+#endif
 
 # Build rules
 targets := $(obj-vdso) vdso.so vdso.so.dbg vdso.so.raw vdso.lds
diff --git a/arch/arm/vdso/vdso.lds.S b/arch/arm/vdso/vdso.lds.S
index 89ca89f..1af39fb 100644
--- a/arch/arm/vdso/vdso.lds.S
+++ b/arch/arm/vdso/vdso.lds.S
@@ -82,6 +82,9 @@ VERSION
 	global:
 		__vdso_clock_gettime;
 		__vdso_gettimeofday;
+#ifdef CONFIG_VDSO_GETCPU
+		__vdso_getcpu;
+#endif
 	local: *;
 	};
 }
-- 
2.7.2

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW
  2016-10-04 13:49 [PATCH] arm: Added support for getcpu() vDSO using TPIDRURW Fredrik Markstrom
@ 2016-10-04 15:35   ` Fredrik Markstrom
  0 siblings, 0 replies; 27+ messages in thread
From: Fredrik Markstrom @ 2016-10-04 15:35 UTC (permalink / raw)
  To: linux-arm-kernel
  Cc: Fredrik Markstrom, Russell King, Will Deacon, Chris Brandt,
	Nicolas Pitre, Ard Biesheuvel, Arnd Bergmann, Linus Walleij,
	Masahiro Yamada, Kees Cook, Jonathan Austin, Zhaoxiu Zeng,
	Mark Rutland, Michal Marek, linux-kernel

This makes getcpu() ~1000 times faster, this is very useful when
implementing per-cpu buffers in userspace (to avoid cache line
bouncing). As an example lttng ust becomes ~30% faster.

The patch will break applications using TPIDRURW (which is context switched
since commit 4780adeefd042482f624f5e0d577bf9cdcbb760 ("ARM: 7735/2:
Preserve the user r/w register TPIDRURW on context switch and fork")) and
is therefore made configurable.

Signed-off-by: Fredrik Markstrom <fredrik.markstrom@gmail.com>
---
 arch/arm/include/asm/tls.h   |  8 +++++++-
 arch/arm/kernel/entry-armv.S |  1 -
 arch/arm/mm/Kconfig          | 10 ++++++++++
 arch/arm/vdso/Makefile       |  3 +++
 arch/arm/vdso/vdso.lds.S     |  3 +++
 arch/arm/vdso/vgetcpu.c      | 34 ++++++++++++++++++++++++++++++++++
 6 files changed, 57 insertions(+), 2 deletions(-)
 create mode 100644 arch/arm/vdso/vgetcpu.c

diff --git a/arch/arm/include/asm/tls.h b/arch/arm/include/asm/tls.h
index 5f833f7..170fd76 100644
--- a/arch/arm/include/asm/tls.h
+++ b/arch/arm/include/asm/tls.h
@@ -10,10 +10,15 @@
 	.endm
 
 	.macro switch_tls_v6k, base, tp, tpuser, tmp1, tmp2
+#ifdef CONFIG_VDSO_GETCPU
+	ldr	\tpuser, [r2, #TI_CPU]
+#else
 	mrc	p15, 0, \tmp2, c13, c0, 2	@ get the user r/w register
+	ldr	\tpuser, [r2, #TI_TP_VALUE + 4]
+	str	\tmp2, [\base, #TI_TP_VALUE + 4] @ save it
+#endif
 	mcr	p15, 0, \tp, c13, c0, 3		@ set TLS register
 	mcr	p15, 0, \tpuser, c13, c0, 2	@ and the user r/w register
-	str	\tmp2, [\base, #TI_TP_VALUE + 4] @ save it
 	.endm
 
 	.macro switch_tls_v6, base, tp, tpuser, tmp1, tmp2
@@ -22,6 +27,7 @@
 	mov	\tmp2, #0xffff0fff
 	tst	\tmp1, #HWCAP_TLS		@ hardware TLS available?
 	streq	\tp, [\tmp2, #-15]		@ set TLS value at 0xffff0ff0
+	ldrne	\tpuser, [r2, #TI_TP_VALUE + 4] @ load the saved user r/w reg
 	mrcne	p15, 0, \tmp2, c13, c0, 2	@ get the user r/w register
 	mcrne	p15, 0, \tp, c13, c0, 3		@ yes, set TLS register
 	mcrne	p15, 0, \tpuser, c13, c0, 2	@ set user r/w register
diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
index 9f157e7..4e1369a 100644
--- a/arch/arm/kernel/entry-armv.S
+++ b/arch/arm/kernel/entry-armv.S
@@ -787,7 +787,6 @@ ENTRY(__switch_to)
  THUMB(	str	sp, [ip], #4		   )
  THUMB(	str	lr, [ip], #4		   )
 	ldr	r4, [r2, #TI_TP_VALUE]
-	ldr	r5, [r2, #TI_TP_VALUE + 4]
 #ifdef CONFIG_CPU_USE_DOMAINS
 	mrc	p15, 0, r6, c3, c0, 0		@ Get domain register
 	str	r6, [r1, #TI_CPU_DOMAIN]	@ Save old domain register
diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
index c1799dd..f18334a 100644
--- a/arch/arm/mm/Kconfig
+++ b/arch/arm/mm/Kconfig
@@ -854,6 +854,16 @@ config VDSO
 	  You must have glibc 2.22 or later for programs to seamlessly
 	  take advantage of this.
 
+config VDSO_GETCPU
+	bool "Enable VDSO for getcpu"
+	depends on VDSO && (CPU_V6K || CPU_V7 || CPU_V7M)
+	help
+	  Say Y to make getcpu a VDSO (fast) call. This is useful if you
+	  want to implement per cpu buffers to avoid cache line bouncing
+	  in user mode.
+	  This mechanism uses the TPIDRURW register so enabling it will break
 +	  applications using this register for their own purposes.
+
 config DMA_CACHE_RWFO
 	bool "Enable read/write for ownership DMA cache maintenance"
 	depends on CPU_V6K && SMP
diff --git a/arch/arm/vdso/Makefile b/arch/arm/vdso/Makefile
index 59a8fa7..9f1ec51 100644
--- a/arch/arm/vdso/Makefile
+++ b/arch/arm/vdso/Makefile
@@ -1,6 +1,9 @@
 hostprogs-y := vdsomunge
 
 obj-vdso := vgettimeofday.o datapage.o
+#ifeq ($(CONFIG_VDSO_GETCPU),y)
+obj-vdso += vgetcpu.o
+#endif
 
 # Build rules
 targets := $(obj-vdso) vdso.so vdso.so.dbg vdso.so.raw vdso.lds
diff --git a/arch/arm/vdso/vdso.lds.S b/arch/arm/vdso/vdso.lds.S
index 89ca89f..1af39fb 100644
--- a/arch/arm/vdso/vdso.lds.S
+++ b/arch/arm/vdso/vdso.lds.S
@@ -82,6 +82,9 @@ VERSION
 	global:
 		__vdso_clock_gettime;
 		__vdso_gettimeofday;
+#ifdef CONFIG_VDSO_GETCPU
+		__vdso_getcpu;
+#endif
 	local: *;
 	};
 }
diff --git a/arch/arm/vdso/vgetcpu.c b/arch/arm/vdso/vgetcpu.c
new file mode 100644
index 0000000..1b710af
--- /dev/null
+++ b/arch/arm/vdso/vgetcpu.c
@@ -0,0 +1,34 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <linux/compiler.h>
+#include <asm/topology.h>
+
+struct getcpu_cache;
+
+notrace int __vdso_getcpu(unsigned int *cpup, unsigned int *nodep,
+			  struct getcpu_cache *tcache)
+{
+	unsigned long node_and_cpu;
+
+	asm("mrc p15, 0, %0, c13, c0, 2\n" : "=r"(node_and_cpu));
+
+	if (nodep)
+		*nodep = cpu_to_node(node_and_cpu >> 16);
+	if (cpup)
+		*cpup  = node_and_cpu & 0xffffUL;
+
+	return 0;
+}
+
-- 
2.7.2

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW
@ 2016-10-04 15:35   ` Fredrik Markstrom
  0 siblings, 0 replies; 27+ messages in thread
From: Fredrik Markstrom @ 2016-10-04 15:35 UTC (permalink / raw)
  To: linux-arm-kernel

This makes getcpu() ~1000 times faster, this is very useful when
implementing per-cpu buffers in userspace (to avoid cache line
bouncing). As an example lttng ust becomes ~30% faster.

The patch will break applications using TPIDRURW (which is context switched
since commit 4780adeefd042482f624f5e0d577bf9cdcbb760 ("ARM: 7735/2:
Preserve the user r/w register TPIDRURW on context switch and fork")) and
is therefore made configurable.

Signed-off-by: Fredrik Markstrom <fredrik.markstrom@gmail.com>
---
 arch/arm/include/asm/tls.h   |  8 +++++++-
 arch/arm/kernel/entry-armv.S |  1 -
 arch/arm/mm/Kconfig          | 10 ++++++++++
 arch/arm/vdso/Makefile       |  3 +++
 arch/arm/vdso/vdso.lds.S     |  3 +++
 arch/arm/vdso/vgetcpu.c      | 34 ++++++++++++++++++++++++++++++++++
 6 files changed, 57 insertions(+), 2 deletions(-)
 create mode 100644 arch/arm/vdso/vgetcpu.c

diff --git a/arch/arm/include/asm/tls.h b/arch/arm/include/asm/tls.h
index 5f833f7..170fd76 100644
--- a/arch/arm/include/asm/tls.h
+++ b/arch/arm/include/asm/tls.h
@@ -10,10 +10,15 @@
 	.endm
 
 	.macro switch_tls_v6k, base, tp, tpuser, tmp1, tmp2
+#ifdef CONFIG_VDSO_GETCPU
+	ldr	\tpuser, [r2, #TI_CPU]
+#else
 	mrc	p15, 0, \tmp2, c13, c0, 2	@ get the user r/w register
+	ldr	\tpuser, [r2, #TI_TP_VALUE + 4]
+	str	\tmp2, [\base, #TI_TP_VALUE + 4] @ save it
+#endif
 	mcr	p15, 0, \tp, c13, c0, 3		@ set TLS register
 	mcr	p15, 0, \tpuser, c13, c0, 2	@ and the user r/w register
-	str	\tmp2, [\base, #TI_TP_VALUE + 4] @ save it
 	.endm
 
 	.macro switch_tls_v6, base, tp, tpuser, tmp1, tmp2
@@ -22,6 +27,7 @@
 	mov	\tmp2, #0xffff0fff
 	tst	\tmp1, #HWCAP_TLS		@ hardware TLS available?
 	streq	\tp, [\tmp2, #-15]		@ set TLS value at 0xffff0ff0
+	ldrne	\tpuser, [r2, #TI_TP_VALUE + 4] @ load the saved user r/w reg
 	mrcne	p15, 0, \tmp2, c13, c0, 2	@ get the user r/w register
 	mcrne	p15, 0, \tp, c13, c0, 3		@ yes, set TLS register
 	mcrne	p15, 0, \tpuser, c13, c0, 2	@ set user r/w register
diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
index 9f157e7..4e1369a 100644
--- a/arch/arm/kernel/entry-armv.S
+++ b/arch/arm/kernel/entry-armv.S
@@ -787,7 +787,6 @@ ENTRY(__switch_to)
  THUMB(	str	sp, [ip], #4		   )
  THUMB(	str	lr, [ip], #4		   )
 	ldr	r4, [r2, #TI_TP_VALUE]
-	ldr	r5, [r2, #TI_TP_VALUE + 4]
 #ifdef CONFIG_CPU_USE_DOMAINS
 	mrc	p15, 0, r6, c3, c0, 0		@ Get domain register
 	str	r6, [r1, #TI_CPU_DOMAIN]	@ Save old domain register
diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
index c1799dd..f18334a 100644
--- a/arch/arm/mm/Kconfig
+++ b/arch/arm/mm/Kconfig
@@ -854,6 +854,16 @@ config VDSO
 	  You must have glibc 2.22 or later for programs to seamlessly
 	  take advantage of this.
 
+config VDSO_GETCPU
+	bool "Enable VDSO for getcpu"
+	depends on VDSO && (CPU_V6K || CPU_V7 || CPU_V7M)
+	help
+	  Say Y to make getcpu a VDSO (fast) call. This is useful if you
+	  want to implement per cpu buffers to avoid cache line bouncing
+	  in user mode.
+	  This mechanism uses the TPIDRURW register so enabling it will break
 +	  applications using this register for their own purposes.
+
 config DMA_CACHE_RWFO
 	bool "Enable read/write for ownership DMA cache maintenance"
 	depends on CPU_V6K && SMP
diff --git a/arch/arm/vdso/Makefile b/arch/arm/vdso/Makefile
index 59a8fa7..9f1ec51 100644
--- a/arch/arm/vdso/Makefile
+++ b/arch/arm/vdso/Makefile
@@ -1,6 +1,9 @@
 hostprogs-y := vdsomunge
 
 obj-vdso := vgettimeofday.o datapage.o
+#ifeq ($(CONFIG_VDSO_GETCPU),y)
+obj-vdso += vgetcpu.o
+#endif
 
 # Build rules
 targets := $(obj-vdso) vdso.so vdso.so.dbg vdso.so.raw vdso.lds
diff --git a/arch/arm/vdso/vdso.lds.S b/arch/arm/vdso/vdso.lds.S
index 89ca89f..1af39fb 100644
--- a/arch/arm/vdso/vdso.lds.S
+++ b/arch/arm/vdso/vdso.lds.S
@@ -82,6 +82,9 @@ VERSION
 	global:
 		__vdso_clock_gettime;
 		__vdso_gettimeofday;
+#ifdef CONFIG_VDSO_GETCPU
+		__vdso_getcpu;
+#endif
 	local: *;
 	};
 }
diff --git a/arch/arm/vdso/vgetcpu.c b/arch/arm/vdso/vgetcpu.c
new file mode 100644
index 0000000..1b710af
--- /dev/null
+++ b/arch/arm/vdso/vgetcpu.c
@@ -0,0 +1,34 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <linux/compiler.h>
+#include <asm/topology.h>
+
+struct getcpu_cache;
+
+notrace int __vdso_getcpu(unsigned int *cpup, unsigned int *nodep,
+			  struct getcpu_cache *tcache)
+{
+	unsigned long node_and_cpu;
+
+	asm("mrc p15, 0, %0, c13, c0, 2\n" : "=r"(node_and_cpu));
+
+	if (nodep)
+		*nodep = cpu_to_node(node_and_cpu >> 16);
+	if (cpup)
+		*cpup  = node_and_cpu & 0xffffUL;
+
+	return 0;
+}
+
-- 
2.7.2

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW
  2016-10-04 15:35   ` Fredrik Markstrom
@ 2016-10-04 17:07     ` Mark Rutland
  -1 siblings, 0 replies; 27+ messages in thread
From: Mark Rutland @ 2016-10-04 17:07 UTC (permalink / raw)
  To: Fredrik Markstrom
  Cc: linux-arm-kernel, Russell King, Will Deacon, Chris Brandt,
	Nicolas Pitre, Ard Biesheuvel, Arnd Bergmann, Linus Walleij,
	Masahiro Yamada, Kees Cook, Jonathan Austin, Zhaoxiu Zeng,
	Michal Marek, linux-kernel, kristina.martsenko

On Tue, Oct 04, 2016 at 05:35:33PM +0200, Fredrik Markstrom wrote:
> This makes getcpu() ~1000 times faster, this is very useful when
> implementing per-cpu buffers in userspace (to avoid cache line
> bouncing). As an example lttng ust becomes ~30% faster.
> 
> The patch will break applications using TPIDRURW (which is context switched
> since commit 4780adeefd042482f624f5e0d577bf9cdcbb760 ("ARM: 7735/2:

It looks like you dropped the leading 'a' from the commit ID. For
everyone else's benefit, the full ID is:

  a4780adeefd042482f624f5e0d577bf9cdcbb760

Please note that arm64 has done similar for compat tasks since commit:

  d00a3810c16207d2 ("arm64: context-switch user tls register tpidr_el0 for
  compat tasks")

> Preserve the user r/w register TPIDRURW on context switch and fork")) and
> is therefore made configurable.

As you note above, this is an ABI break and *will* break some existing
applications. That's generally a no-go.

This also leaves arm64's compat with the existing behaviour, differing
from arm.

I was under the impression that other mechanisms were being considered
for fast userspace access to per-cpu data structures, e.g. restartable
sequences. What is the state of those? Why is this better?

If getcpu() specifically is necessary, is there no other way to
implement it?

> +notrace int __vdso_getcpu(unsigned int *cpup, unsigned int *nodep,
> +			  struct getcpu_cache *tcache)
> +{
> +	unsigned long node_and_cpu;
> +
> +	asm("mrc p15, 0, %0, c13, c0, 2\n" : "=r"(node_and_cpu));
> +
> +	if (nodep)
> +		*nodep = cpu_to_node(node_and_cpu >> 16);
> +	if (cpup)
> +		*cpup  = node_and_cpu & 0xffffUL;

Given this is directly user-accessible, this format is a de-facto ABI,
even if it's not documented as such. Is this definitely the format you
want long-term?

Thanks,
Mark.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW
@ 2016-10-04 17:07     ` Mark Rutland
  0 siblings, 0 replies; 27+ messages in thread
From: Mark Rutland @ 2016-10-04 17:07 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, Oct 04, 2016 at 05:35:33PM +0200, Fredrik Markstrom wrote:
> This makes getcpu() ~1000 times faster, this is very useful when
> implementing per-cpu buffers in userspace (to avoid cache line
> bouncing). As an example lttng ust becomes ~30% faster.
> 
> The patch will break applications using TPIDRURW (which is context switched
> since commit 4780adeefd042482f624f5e0d577bf9cdcbb760 ("ARM: 7735/2:

It looks like you dropped the leading 'a' from the commit ID. For
everyone else's benefit, the full ID is:

  a4780adeefd042482f624f5e0d577bf9cdcbb760

Please note that arm64 has done similar for compat tasks since commit:

  d00a3810c16207d2 ("arm64: context-switch user tls register tpidr_el0 for
  compat tasks")

> Preserve the user r/w register TPIDRURW on context switch and fork")) and
> is therefore made configurable.

As you note above, this is an ABI break and *will* break some existing
applications. That's generally a no-go.

This also leaves arm64's compat with the existing behaviour, differing
from arm.

I was under the impression that other mechanisms were being considered
for fast userspace access to per-cpu data structures, e.g. restartable
sequences. What is the state of those? Why is this better?

If getcpu() specifically is necessary, is there no other way to
implement it?

> +notrace int __vdso_getcpu(unsigned int *cpup, unsigned int *nodep,
> +			  struct getcpu_cache *tcache)
> +{
> +	unsigned long node_and_cpu;
> +
> +	asm("mrc p15, 0, %0, c13, c0, 2\n" : "=r"(node_and_cpu));
> +
> +	if (nodep)
> +		*nodep = cpu_to_node(node_and_cpu >> 16);
> +	if (cpup)
> +		*cpup  = node_and_cpu & 0xffffUL;

Given this is directly user-accessible, this format is a de-facto ABI,
even if it's not documented as such. Is this definitely the format you
want long-term?

Thanks,
Mark.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW
  2016-10-04 17:07     ` Mark Rutland
@ 2016-10-05 12:25       ` Fredrik Markström
  -1 siblings, 0 replies; 27+ messages in thread
From: Fredrik Markström @ 2016-10-05 12:25 UTC (permalink / raw)
  To: Mark Rutland
  Cc: linux-arm-kernel, Russell King, Will Deacon, Chris Brandt,
	Nicolas Pitre, Ard Biesheuvel, Arnd Bergmann, Linus Walleij,
	Masahiro Yamada, Kees Cook, Jonathan Austin, Zhaoxiu Zeng,
	Michal Marek, linux-kernel, kristina.martsenko

On Tue, Oct 4, 2016 at 7:08 PM Mark Rutland <mark.rutland@arm.com> wrote:
>
> On Tue, Oct 04, 2016 at 05:35:33PM +0200, Fredrik Markstrom wrote:
> > This makes getcpu() ~1000 times faster, this is very useful when
> > implementing per-cpu buffers in userspace (to avoid cache line
> > bouncing). As an example lttng ust becomes ~30% faster.
> >
> > The patch will break applications using TPIDRURW (which is context switched
> > since commit 4780adeefd042482f624f5e0d577bf9cdcbb760 ("ARM: 7735/2:
>
> It looks like you dropped the leading 'a' from the commit ID. For
> everyone else's benefit, the full ID is:
>
>   a4780adeefd042482f624f5e0d577bf9cdcbb760


Sorry for that and thanks for fixing it.

>
>
> Please note that arm64 has done similar for compat tasks since commit:
>
>   d00a3810c16207d2 ("arm64: context-switch user tls register tpidr_el0 for
>   compat tasks")
>
> > Preserve the user r/w register TPIDRURW on context switch and fork")) and
> > is therefore made configurable.
>
> As you note above, this is an ABI break and *will* break some existing
> applications. That's generally a no-go.


Ok, I wasn't sure this was considered an ABI (but I'm not entirely
surprised ;) ). The way I was
trying to defend the breakage was by reasoning that if it was an
ABI we broke it both with a4780ad
and with 6a1c531, and since we don't break ABI:s, it can't be one.

But hey, I'm humble here and ready to back off.

>
> This also leaves arm64's compat with the existing behaviour, differing
> from arm.
>
> I was under the impression that other mechanisms were being considered
> for fast userspace access to per-cpu data structures, e.g. restartable
> sequences. What is the state of those? Why is this better?
>
> If getcpu() specifically is necessary, is there no other way to
> implement it?

If you are referring to the user space stuff, it can probably be
implemented in other ways,
it's just convenient since the interface is there and it will speed up
stuff like lttng without
modifications (well, except glibc). It's also already implemented as a
vDSO on other
major architectures (like x86, x86_64, ppc32 and ppc64).

If you are referring to the implementation of the vdso call, there are
other possibilities, but
I haven't found any that doesn't introduce overhead in context switching.

But if TPIDRURW is definitely a no go, I can work on a patch that does
this with a thread notifier
and the vdso data page. Would that be a viable option ?

>
> > +notrace int __vdso_getcpu(unsigned int *cpup, unsigned int *nodep,
> > +                       struct getcpu_cache *tcache)
> > +{
> > +     unsigned long node_and_cpu;
> > +
> > +     asm("mrc p15, 0, %0, c13, c0, 2\n" : "=r"(node_and_cpu));
> > +
> > +     if (nodep)
> > +             *nodep = cpu_to_node(node_and_cpu >> 16);
> > +     if (cpup)
> > +             *cpup  = node_and_cpu & 0xffffUL;
>
> Given this is directly user-accessible, this format is a de-facto ABI,
> even if it's not documented as such. Is this definitely the format you
> want long-term?

Yes, this (the interface) is indeed the important part and therefore I
tried not to invent anything
on my own.
This is the interface used by ppc32, ppc64, x86, x86_64. It's also
how the getcpu(2) system call is documented.

/Fredrik


>
>
> Thanks,
> Mark.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW
@ 2016-10-05 12:25       ` Fredrik Markström
  0 siblings, 0 replies; 27+ messages in thread
From: Fredrik Markström @ 2016-10-05 12:25 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, Oct 4, 2016 at 7:08 PM Mark Rutland <mark.rutland@arm.com> wrote:
>
> On Tue, Oct 04, 2016 at 05:35:33PM +0200, Fredrik Markstrom wrote:
> > This makes getcpu() ~1000 times faster, this is very useful when
> > implementing per-cpu buffers in userspace (to avoid cache line
> > bouncing). As an example lttng ust becomes ~30% faster.
> >
> > The patch will break applications using TPIDRURW (which is context switched
> > since commit 4780adeefd042482f624f5e0d577bf9cdcbb760 ("ARM: 7735/2:
>
> It looks like you dropped the leading 'a' from the commit ID. For
> everyone else's benefit, the full ID is:
>
>   a4780adeefd042482f624f5e0d577bf9cdcbb760


Sorry for that and thanks for fixing it.

>
>
> Please note that arm64 has done similar for compat tasks since commit:
>
>   d00a3810c16207d2 ("arm64: context-switch user tls register tpidr_el0 for
>   compat tasks")
>
> > Preserve the user r/w register TPIDRURW on context switch and fork")) and
> > is therefore made configurable.
>
> As you note above, this is an ABI break and *will* break some existing
> applications. That's generally a no-go.


Ok, I wasn't sure this was considered an ABI (but I'm not entirely
surprised ;) ). The way I was
trying to defend the breakage was by reasoning that if it was an
ABI we broke it both with a4780ad
and with 6a1c531, and since we don't break ABI:s, it can't be one.

But hey, I'm humble here and ready to back off.

>
> This also leaves arm64's compat with the existing behaviour, differing
> from arm.
>
> I was under the impression that other mechanisms were being considered
> for fast userspace access to per-cpu data structures, e.g. restartable
> sequences. What is the state of those? Why is this better?
>
> If getcpu() specifically is necessary, is there no other way to
> implement it?

If you are referring to the user space stuff, it can probably be
implemented in other ways,
it's just convenient since the interface is there and it will speed up
stuff like lttng without
modifications (well, except glibc). It's also already implemented as a
vDSO on other
major architectures (like x86, x86_64, ppc32 and ppc64).

If you are referring to the implementation of the vdso call, there are
other possibilities, but
I haven't found any that doesn't introduce overhead in context switching.

But if TPIDRURW is definitely a no go, I can work on a patch that does
this with a thread notifier
and the vdso data page. Would that be a viable option ?

>
> > +notrace int __vdso_getcpu(unsigned int *cpup, unsigned int *nodep,
> > +                       struct getcpu_cache *tcache)
> > +{
> > +     unsigned long node_and_cpu;
> > +
> > +     asm("mrc p15, 0, %0, c13, c0, 2\n" : "=r"(node_and_cpu));
> > +
> > +     if (nodep)
> > +             *nodep = cpu_to_node(node_and_cpu >> 16);
> > +     if (cpup)
> > +             *cpup  = node_and_cpu & 0xffffUL;
>
> Given this is directly user-accessible, this format is a de-facto ABI,
> even if it's not documented as such. Is this definitely the format you
> want long-term?

Yes, this (the interface) is indeed the important part and therefore I
tried not to invent anything
on my own.
This is the interface used by ppc32, ppc64, x86, x86_64. It's also
how the getcpu(2) system call is documented.

/Fredrik


>
>
> Thanks,
> Mark.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW
  2016-10-05 12:25       ` Fredrik Markström
@ 2016-10-05 16:39         ` Fredrik Markström
  -1 siblings, 0 replies; 27+ messages in thread
From: Fredrik Markström @ 2016-10-05 16:39 UTC (permalink / raw)
  To: Mark Rutland
  Cc: linux-arm-kernel, Russell King, Will Deacon, Chris Brandt,
	Nicolas Pitre, Ard Biesheuvel, Arnd Bergmann, Linus Walleij,
	Masahiro Yamada, Kees Cook, Jonathan Austin, Zhaoxiu Zeng,
	Michal Marek, linux-kernel, kristina.martsenko

The approach I suggested below with the vDSO data page will obviously
not work on smp, so suggestions are welcome.

/Fredrik


On Wed, Oct 5, 2016 at 2:25 PM, Fredrik Markström
<fredrik.markstrom@gmail.com> wrote:
> On Tue, Oct 4, 2016 at 7:08 PM Mark Rutland <mark.rutland@arm.com> wrote:
>>
>> On Tue, Oct 04, 2016 at 05:35:33PM +0200, Fredrik Markstrom wrote:
>> > This makes getcpu() ~1000 times faster, this is very useful when
>> > implementing per-cpu buffers in userspace (to avoid cache line
>> > bouncing). As an example lttng ust becomes ~30% faster.
>> >
>> > The patch will break applications using TPIDRURW (which is context switched
>> > since commit 4780adeefd042482f624f5e0d577bf9cdcbb760 ("ARM: 7735/2:
>>
>> It looks like you dropped the leading 'a' from the commit ID. For
>> everyone else's benefit, the full ID is:
>>
>>   a4780adeefd042482f624f5e0d577bf9cdcbb760
>
>
> Sorry for that and thanks for fixing it.
>
>>
>>
>> Please note that arm64 has done similar for compat tasks since commit:
>>
>>   d00a3810c16207d2 ("arm64: context-switch user tls register tpidr_el0 for
>>   compat tasks")
>>
>> > Preserve the user r/w register TPIDRURW on context switch and fork")) and
>> > is therefore made configurable.
>>
>> As you note above, this is an ABI break and *will* break some existing
>> applications. That's generally a no-go.
>
>
> Ok, I wasn't sure this was considered an ABI (but I'm not entirely
> surprised ;) ). The way I was
> trying to defend the breakage was by reasoning that if it was an
> ABI we broke it both with a4780ad
> and with 6a1c531, and since we don't break ABI:s, it can't be one.
>
> But hey, I'm humble here and ready to back off.
>
>>
>> This also leaves arm64's compat with the existing behaviour, differing
>> from arm.
>>
>> I was under the impression that other mechanisms were being considered
>> for fast userspace access to per-cpu data structures, e.g. restartable
>> sequences. What is the state of those? Why is this better?
>>
>> If getcpu() specifically is necessary, is there no other way to
>> implement it?
>
> If you are referring to the user space stuff, it can probably be
> implemented in other ways,
> it's just convenient since the interface is there and it will speed up
> stuff like lttng without
> modifications (well, except glibc). It's also already implemented as a
> vDSO on other
> major architectures (like x86, x86_64, ppc32 and ppc64).
>
> If you are referring to the implementation of the vdso call, there are
> other possibilities, but
> I haven't found any that doesn't introduce overhead in context switching.
>
> But if TPIDRURW is definitely a no go, I can work on a patch that does
> this with a thread notifier
> and the vdso data page. Would that be a viable option ?
>
>>
>> > +notrace int __vdso_getcpu(unsigned int *cpup, unsigned int *nodep,
>> > +                       struct getcpu_cache *tcache)
>> > +{
>> > +     unsigned long node_and_cpu;
>> > +
>> > +     asm("mrc p15, 0, %0, c13, c0, 2\n" : "=r"(node_and_cpu));
>> > +
>> > +     if (nodep)
>> > +             *nodep = cpu_to_node(node_and_cpu >> 16);
>> > +     if (cpup)
>> > +             *cpup  = node_and_cpu & 0xffffUL;
>>
>> Given this is directly user-accessible, this format is a de-facto ABI,
>> even if it's not documented as such. Is this definitely the format you
>> want long-term?
>
> Yes, this (the interface) is indeed the important part and therefore I
> tried not to invent anything
> on my own.
> This is the interface used by ppc32, ppc64, x86, x86_64. It's also
> how the getcpu(2) system call is documented.
>
> /Fredrik
>
>
>>
>>
>> Thanks,
>> Mark.



-- 
/Fredrik

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW
@ 2016-10-05 16:39         ` Fredrik Markström
  0 siblings, 0 replies; 27+ messages in thread
From: Fredrik Markström @ 2016-10-05 16:39 UTC (permalink / raw)
  To: linux-arm-kernel

The approach I suggested below with the vDSO data page will obviously
not work on smp, so suggestions are welcome.

/Fredrik


On Wed, Oct 5, 2016 at 2:25 PM, Fredrik Markström
<fredrik.markstrom@gmail.com> wrote:
> On Tue, Oct 4, 2016 at 7:08 PM Mark Rutland <mark.rutland@arm.com> wrote:
>>
>> On Tue, Oct 04, 2016 at 05:35:33PM +0200, Fredrik Markstrom wrote:
>> > This makes getcpu() ~1000 times faster, this is very useful when
>> > implementing per-cpu buffers in userspace (to avoid cache line
>> > bouncing). As an example lttng ust becomes ~30% faster.
>> >
>> > The patch will break applications using TPIDRURW (which is context switched
>> > since commit 4780adeefd042482f624f5e0d577bf9cdcbb760 ("ARM: 7735/2:
>>
>> It looks like you dropped the leading 'a' from the commit ID. For
>> everyone else's benefit, the full ID is:
>>
>>   a4780adeefd042482f624f5e0d577bf9cdcbb760
>
>
> Sorry for that and thanks for fixing it.
>
>>
>>
>> Please note that arm64 has done similar for compat tasks since commit:
>>
>>   d00a3810c16207d2 ("arm64: context-switch user tls register tpidr_el0 for
>>   compat tasks")
>>
>> > Preserve the user r/w register TPIDRURW on context switch and fork")) and
>> > is therefore made configurable.
>>
>> As you note above, this is an ABI break and *will* break some existing
>> applications. That's generally a no-go.
>
>
> Ok, I wasn't sure this was considered an ABI (but I'm not entirely
> surprised ;) ). The way I was
> trying to defend the breakage was by reasoning that if it was an
> ABI we broke it both with a4780ad
> and with 6a1c531, and since we don't break ABI:s, it can't be one.
>
> But hey, I'm humble here and ready to back off.
>
>>
>> This also leaves arm64's compat with the existing behaviour, differing
>> from arm.
>>
>> I was under the impression that other mechanisms were being considered
>> for fast userspace access to per-cpu data structures, e.g. restartable
>> sequences. What is the state of those? Why is this better?
>>
>> If getcpu() specifically is necessary, is there no other way to
>> implement it?
>
> If you are referring to the user space stuff, it can probably be
> implemented other ways,
> it's just convenient since the interface is there and it will speed up
> stuff like lttng without
> modifications (well, except glibc). It's also already implemented as a
> vDSO on other
> major architectures (like x86, x86_64, ppc32 and ppc64).
>
> If you are referring to the implementation of the vdso call, there are
> other possibilities, but
> I haven't found any that doesn't introduce overhead in context switching.
>
> But if TPIDRURW is definitely a no go, I can work on a patch that does
> this with a thread notifier
> and the vdso data page. Would that be a viable option ?
>
>>
>> > +notrace int __vdso_getcpu(unsigned int *cpup, unsigned int *nodep,
>> > +                       struct getcpu_cache *tcache)
>> > +{
>> > +     unsigned long node_and_cpu;
>> > +
>> > +     asm("mrc p15, 0, %0, c13, c0, 2\n" : "=r"(node_and_cpu));
>> > +
>> > +     if (nodep)
>> > +             *nodep = cpu_to_node(node_and_cpu >> 16);
>> > +     if (cpup)
>> > +             *cpup  = node_and_cpu & 0xffffUL;
>>
>> Given this is directly user-accessible, this format is a de-facto ABI,
>> even if it's not documented as such. Is this definitely the format you
>> want long-term?
>
> Yes, this (the interface) is indeed the important part and therefore I
> tried not to invent anything
> on my own.
>> This is the interface used by ppc32, ppc64, x86, x86_64. It's also how
>> the getcpu(2) system call is documented.
>
> /Fredrik
>
>
>>
>>
>> Thanks,
>> Mark.



-- 
/Fredrik

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW
  2016-10-05 16:39         ` Fredrik Markström
@ 2016-10-05 17:48           ` Robin Murphy
  -1 siblings, 0 replies; 27+ messages in thread
From: Robin Murphy @ 2016-10-05 17:48 UTC (permalink / raw)
  To: Fredrik Markström, Mark Rutland
  Cc: Kees Cook, Arnd Bergmann, Ard Biesheuvel, Linus Walleij,
	Nicolas Pitre, Will Deacon, Russell King, kristina.martsenko,
	linux-kernel, Masahiro Yamada, Chris Brandt, Michal Marek,
	Zhaoxiu Zeng, linux-arm-kernel, Jonathan Austin

On 05/10/16 17:39, Fredrik Markström wrote:
> The approach I suggested below with the vDSO data page will obviously
> not work on smp, so suggestions are welcome.

Well, given that it's user-writeable, is there any reason an application
which cares couldn't simply run some per-cpu threads to call getcpu()
once and cache the result in TPIDRURW themselves? That would appear to
both raise no compatibility issues and work with existing kernels.

Robin.

> /Fredrik
> 
> 
> On Wed, Oct 5, 2016 at 2:25 PM, Fredrik Markström
> <fredrik.markstrom@gmail.com> wrote:
>> On Tue, Oct 4, 2016 at 7:08 PM Mark Rutland <mark.rutland@arm.com> wrote:
>>>
>>> On Tue, Oct 04, 2016 at 05:35:33PM +0200, Fredrik Markstrom wrote:
>>>> This makes getcpu() ~1000 times faster, this is very useful when
>>>> implementing per-cpu buffers in userspace (to avoid cache line
>>>> bouncing). As an example lttng ust becomes ~30% faster.
>>>>
>>>> The patch will break applications using TPIDRURW (which is context switched
>>>> since commit 4780adeefd042482f624f5e0d577bf9cdcbb760 ("ARM: 7735/2:
>>>
>>> It looks like you dropped the leading 'a' from the commit ID. For
>>> everyone else's benefit, the full ID is:
>>>
>>>   a4780adeefd042482f624f5e0d577bf9cdcbb760
>>
>>
>> Sorry for that and thanks for fixing it.
>>
>>>
>>>
>>> Please note that arm64 has done similar for compat tasks since commit:
>>>
>>>   d00a3810c16207d2 ("arm64: context-switch user tls register tpidr_el0 for
>>>   compat tasks")
>>>
>>>> Preserve the user r/w register TPIDRURW on context switch and fork")) and
>>>> is therefore made configurable.
>>>
>>> As you note above, this is an ABI break and *will* break some existing
>>> applications. That's generally a no-go.
>>
>>
>> Ok, I wasn't sure this was considered an ABI (but I'm not entirely
>> surprised ;) ). The way I was
>> trying to defend the breakage was by reasoning that if it was an
>> ABI we broke it both with a4780ad
>> and with 6a1c531, and since we don't break ABI:s, it can't be one.
>>
>> But hey, I'm humble here and ready to back off.
>>
>>>
>>> This also leaves arm64's compat with the existing behaviour, differing
>>> from arm.
>>>
>>> I was under the impression that other mechanisms were being considered
>>> for fast userspace access to per-cpu data structures, e.g. restartable
>>> sequences. What is the state of those? Why is this better?
>>>
>>> If getcpu() specifically is necessary, is there no other way to
>>> implement it?
>>
>> If you are referring to the user space stuff, it can probably be
>> implemented other ways,
>> it's just convenient since the interface is there and it will speed up
>> stuff like lttng without
>> modifications (well, except glibc). It's also already implemented as a
>> vDSO on other
>> major architectures (like x86, x86_64, ppc32 and ppc64).
>>
>> If you are referring to the implementation of the vdso call, there are
>> other possibilities, but
>> I haven't found any that doesn't introduce overhead in context switching.
>>
>> But if TPIDRURW is definitely a no go, I can work on a patch that does
>> this with a thread notifier
>> and the vdso data page. Would that be a viable option ?
>>
>>>
>>>> +notrace int __vdso_getcpu(unsigned int *cpup, unsigned int *nodep,
>>>> +                       struct getcpu_cache *tcache)
>>>> +{
>>>> +     unsigned long node_and_cpu;
>>>> +
>>>> +     asm("mrc p15, 0, %0, c13, c0, 2\n" : "=r"(node_and_cpu));
>>>> +
>>>> +     if (nodep)
>>>> +             *nodep = cpu_to_node(node_and_cpu >> 16);
>>>> +     if (cpup)
>>>> +             *cpup  = node_and_cpu & 0xffffUL;
>>>
>>> Given this is directly user-accessible, this format is a de-facto ABI,
>>> even if it's not documented as such. Is this definitely the format you
>>> want long-term?
>>
>> Yes, this (the interface) is indeed the important part and therefore I
>> tried not to invent anything
>> on my own.
>> This is the interface used by ppc32, ppc64, x86, x86_64. It's also how
>> the getcpu(2) system call is documented.
>>
>> /Fredrik
>>
>>
>>>
>>>
>>> Thanks,
>>> Mark.
> 
> 
> 

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW
@ 2016-10-05 17:48           ` Robin Murphy
  0 siblings, 0 replies; 27+ messages in thread
From: Robin Murphy @ 2016-10-05 17:48 UTC (permalink / raw)
  To: linux-arm-kernel

On 05/10/16 17:39, Fredrik Markström wrote:
> The approach I suggested below with the vDSO data page will obviously
> not work on smp, so suggestions are welcome.

Well, given that it's user-writeable, is there any reason an application
which cares couldn't simply run some per-cpu threads to call getcpu()
once and cache the result in TPIDRURW themselves? That would appear to
both raise no compatibility issues and work with existing kernels.

Robin.

> /Fredrik
> 
> 
> On Wed, Oct 5, 2016 at 2:25 PM, Fredrik Markström
> <fredrik.markstrom@gmail.com> wrote:
>> On Tue, Oct 4, 2016 at 7:08 PM Mark Rutland <mark.rutland@arm.com> wrote:
>>>
>>> On Tue, Oct 04, 2016 at 05:35:33PM +0200, Fredrik Markstrom wrote:
>>>> This makes getcpu() ~1000 times faster, this is very useful when
>>>> implementing per-cpu buffers in userspace (to avoid cache line
>>>> bouncing). As an example lttng ust becomes ~30% faster.
>>>>
>>>> The patch will break applications using TPIDRURW (which is context switched
>>>> since commit 4780adeefd042482f624f5e0d577bf9cdcbb760 ("ARM: 7735/2:
>>>
>>> It looks like you dropped the leading 'a' from the commit ID. For
>>> everyone else's benefit, the full ID is:
>>>
>>>   a4780adeefd042482f624f5e0d577bf9cdcbb760
>>
>>
>> Sorry for that and thanks for fixing it.
>>
>>>
>>>
>>> Please note that arm64 has done similar for compat tasks since commit:
>>>
>>>   d00a3810c16207d2 ("arm64: context-switch user tls register tpidr_el0 for
>>>   compat tasks")
>>>
>>>> Preserve the user r/w register TPIDRURW on context switch and fork")) and
>>>> is therefore made configurable.
>>>
>>> As you note above, this is an ABI break and *will* break some existing
>>> applications. That's generally a no-go.
>>
>>
>> Ok, I wasn't sure this was considered an ABI (but I'm not entirely
>> surprised ;) ). The way I was
>> trying to defend the breakage was by reasoning that if it was an
>> ABI we broke it both with a4780ad
>> and with 6a1c531, and since we don't break ABI:s, it can't be one.
>>
>> But hey, I'm humble here and ready to back off.
>>
>>>
>>> This also leaves arm64's compat with the existing behaviour, differing
>>> from arm.
>>>
>>> I was under the impression that other mechanisms were being considered
>>> for fast userspace access to per-cpu data structures, e.g. restartable
>>> sequences. What is the state of those? Why is this better?
>>>
>>> If getcpu() specifically is necessary, is there no other way to
>>> implement it?
>>
>> If you are referring to the user space stuff, it can probably be
>> implemented other ways,
>> it's just convenient since the interface is there and it will speed up
>> stuff like lttng without
>> modifications (well, except glibc). It's also already implemented as a
>> vDSO on other
>> major architectures (like x86, x86_64, ppc32 and ppc64).
>>
>> If you are referring to the implementation of the vdso call, there are
>> other possibilities, but
>> I haven't found any that doesn't introduce overhead in context switching.
>>
>> But if TPIDRURW is definitely a no go, I can work on a patch that does
>> this with a thread notifier
>> and the vdso data page. Would that be a viable option ?
>>
>>>
>>>> +notrace int __vdso_getcpu(unsigned int *cpup, unsigned int *nodep,
>>>> +                       struct getcpu_cache *tcache)
>>>> +{
>>>> +     unsigned long node_and_cpu;
>>>> +
>>>> +     asm("mrc p15, 0, %0, c13, c0, 2\n" : "=r"(node_and_cpu));
>>>> +
>>>> +     if (nodep)
>>>> +             *nodep = cpu_to_node(node_and_cpu >> 16);
>>>> +     if (cpup)
>>>> +             *cpup  = node_and_cpu & 0xffffUL;
>>>
>>> Given this is directly user-accessible, this format is a de-facto ABI,
>>> even if it's not documented as such. Is this definitely the format you
>>> want long-term?
>>
>> Yes, this (the interface) is indeed the important part and therefore I
>> tried not to invent anything
>> on my own.
>> This is the interface used by ppc32, ppc64, x86, x86_64. It's also how
>> the getcpu(2) system call is documented.
>>
>> /Fredrik
>>
>>
>>>
>>>
>>> Thanks,
>>> Mark.
> 
> 
> 

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW
  2016-10-05 17:48           ` Robin Murphy
@ 2016-10-05 19:53             ` Russell King - ARM Linux
  -1 siblings, 0 replies; 27+ messages in thread
From: Russell King - ARM Linux @ 2016-10-05 19:53 UTC (permalink / raw)
  To: Robin Murphy
  Cc: Fredrik Markström, Mark Rutland, Kees Cook, Arnd Bergmann,
	Ard Biesheuvel, Linus Walleij, Nicolas Pitre, Will Deacon,
	kristina.martsenko, linux-kernel, Masahiro Yamada, Chris Brandt,
	Michal Marek, Zhaoxiu Zeng, linux-arm-kernel, Jonathan Austin

On Wed, Oct 05, 2016 at 06:48:05PM +0100, Robin Murphy wrote:
> On 05/10/16 17:39, Fredrik Markström wrote:
> > The approach I suggested below with the vDSO data page will obviously
> > not work on smp, so suggestions are welcome.
> 
> Well, given that it's user-writeable, is there any reason an application
> which cares couldn't simply run some per-cpu threads to call getcpu()
> once and cache the result in TPIDRURW themselves? That would appear to
> both raise no compatibility issues and work with existing kernels.

There is - the contents of TPIDRURW is thread specific, and it moves
with the thread between CPU cores.  So, if a thread was running on CPU0
when it cached the getcpu() value in TPIDRURW, and then migrated to CPU1,
TPIDRURW would still contain 0.

I'm also not in favour of changing the TPIDRURW usage to be a storage
repository for the CPU number - it's far too specific a usage and seems
like a waste of hardware resources to solve one problem.  As Mark says,
it's an ABI breaking change too, even if it is under a config option.

Take a moment to consider distro kernels: how should they set this
config option - should they enable it to get faster getcpu() or should
they disable it to retain existing compatibility to prevent userspace
breakage.  Who can advise them to make the right decision?  Kernel
developers can't, because the usage of this register is purely a
userspace issue right now, and kernels devs don't know what use it's
been put to.

-- 
RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line: currently at 9.6Mbps down 400kbps up
according to speedtest.net.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW
@ 2016-10-05 19:53             ` Russell King - ARM Linux
  0 siblings, 0 replies; 27+ messages in thread
From: Russell King - ARM Linux @ 2016-10-05 19:53 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, Oct 05, 2016 at 06:48:05PM +0100, Robin Murphy wrote:
> On 05/10/16 17:39, Fredrik Markström wrote:
> > The approach I suggested below with the vDSO data page will obviously
> > not work on smp, so suggestions are welcome.
> 
> Well, given that it's user-writeable, is there any reason an application
> which cares couldn't simply run some per-cpu threads to call getcpu()
> once and cache the result in TPIDRURW themselves? That would appear to
> both raise no compatibility issues and work with existing kernels.

There is - the contents of TPIDRURW is thread specific, and it moves
with the thread between CPU cores.  So, if a thread was running on CPU0
when it cached the getcpu() value in TPIDRURW, and then migrated to CPU1,
TPIDRURW would still contain 0.

I'm also not in favour of changing the TPIDRURW usage to be a storage
repository for the CPU number - it's far too specific a usage and seems
like a waste of hardware resources to solve one problem.  As Mark says,
it's an ABI breaking change too, even if it is under a config option.

Take a moment to consider distro kernels: how should they set this
config option - should they enable it to get faster getcpu() or should
they disable it to retain existing compatibility to prevent userspace
breakage.  Who can advise them to make the right decision?  Kernel
developers can't, because the usage of this register is purely a
userspace issue right now, and kernels devs don't know what use it's
been put to.

-- 
RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line: currently at 9.6Mbps down 400kbps up
according to speedtest.net.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW
  2016-10-05 12:25       ` Fredrik Markström
@ 2016-10-05 20:12         ` Mark Rutland
  -1 siblings, 0 replies; 27+ messages in thread
From: Mark Rutland @ 2016-10-05 20:12 UTC (permalink / raw)
  To: Fredrik Markström
  Cc: linux-arm-kernel, Russell King, Will Deacon, Chris Brandt,
	Nicolas Pitre, Ard Biesheuvel, Arnd Bergmann, Linus Walleij,
	Masahiro Yamada, Kees Cook, Jonathan Austin, Zhaoxiu Zeng,
	Michal Marek, linux-kernel, kristina.martsenko

On Wed, Oct 05, 2016 at 02:25:22PM +0200, Fredrik Markström wrote:
> On Tue, Oct 4, 2016 at 7:08 PM Mark Rutland <mark.rutland@arm.com> wrote:
> > On Tue, Oct 04, 2016 at 05:35:33PM +0200, Fredrik Markstrom wrote:
> The way I was trying to defend the breakage was by reasoning that if it
> was an ABI we broke it both with a4780ad and with 6a1c531, and since we don't
> break ABI:s, it can't be one.

Prior to commit 6a1c531, other programs and/or cpuidle could have corrupted
TPIDRURW to arbitrary values at any point in time, so it couldn't have been
relied upon.

Arguably, someone could have (ab)used TPIDRURW between commits 6a1c531 and
a4780ad to detect context switches, but in practice they don't appear to have,
and we know of an established user relying on the current behaviour.

For better or worse, the current behaviour is ABI.

> > I was under the impression that other mechanisms were being considered
> > for fast userspace access to per-cpu data structures, e.g. restartable
> > sequences. What is the state of those? Why is this better?
> >
> > If getcpu() specifically is necessary, is there no other way to
> > implement it?
> 
> If you are referring to the user space stuff, it can probably be implemented
> other ways, it's just convenient since the interface is there and it will
> speed up stuff like lttng without modifications (well, except glibc). It's
> also already implemented as a vDSO on other major architectures (like x86,
> x86_64, ppc32 and ppc64).
> 
> If you are referring to the implementation of the vdso call, there are other
> possibilities, but I haven't found any that doesn't introduce overhead in
> context switching.
>
> But if TPIDRURW is definitely a no go, I can work on a patch that does this
> with a thread notifier and the vdso data page. Would that be a viable option?

As pointed out, that won't work for SMP, but perhaps we can come up with
something that does.

> > > +notrace int __vdso_getcpu(unsigned int *cpup, unsigned int *nodep,
> > > +                       struct getcpu_cache *tcache)
> > > +{
> > > +     unsigned long node_and_cpu;
> > > +
> > > +     asm("mrc p15, 0, %0, c13, c0, 2\n" : "=r"(node_and_cpu));
> > > +
> > > +     if (nodep)
> > > +             *nodep = cpu_to_node(node_and_cpu >> 16);
> > > +     if (cpup)
> > > +             *cpup  = node_and_cpu & 0xffffUL;
> >
> > Given this is directly user-accessible, this format is a de-facto ABI,
> > even if it's not documented as such. Is this definitely the format you
> > want long-term?
> 
> Yes, this (the interface) is indeed the important part and therefore I tried
> not to invent anything on my own.  This is the interface used by ppc32,
> ppc64, x86, x86_64. It's also this is how the getcpu(2) system call is
> documented.

I was referring to the value in TPIDRURW specifically. If an application
started reading that directly (rather than going through the vDSO), we wouldn't
be able to change the register value in future.

That's all moot, given we can't repurpose TPIDRURW.

Thanks,
Mark.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW
@ 2016-10-05 20:12         ` Mark Rutland
  0 siblings, 0 replies; 27+ messages in thread
From: Mark Rutland @ 2016-10-05 20:12 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, Oct 05, 2016 at 02:25:22PM +0200, Fredrik Markström wrote:
> On Tue, Oct 4, 2016 at 7:08 PM Mark Rutland <mark.rutland@arm.com> wrote:
> > On Tue, Oct 04, 2016 at 05:35:33PM +0200, Fredrik Markstrom wrote:
> The way I was trying to defend the breakage was by reasoning that if it
> was an ABI we broke it both with a4780ad and with 6a1c531, and since we don't
> break ABI:s, it can't be one.

Prior to commit 6a1c531, other programs and/or cpuidle could have corrupted
TPIDRURW to arbitrary values at any point in time, so it couldn't have been
relied upon.

Arguably, someone could have (ab)used TPIDRURW between commits 6a1c531 and
a4780ad to detect context switches, but in practice they don't appear to have,
and we know of an established user relying on the current behaviour.

For better or worse, the current behaviour is ABI.

> > I was under the impression that other mechanisms were being considered
> > for fast userspace access to per-cpu data structures, e.g. restartable
> > sequences. What is the state of those? Why is this better?
> >
> > If getcpu() specifically is necessary, is there no other way to
> > implement it?
> 
> If you are referring to the user space stuff, it can probably be implemented
> other ways, it's just convenient since the interface is there and it will
> speed up stuff like lttng without modifications (well, except glibc). It's
> also already implemented as a vDSO on other major architectures (like x86,
> x86_64, ppc32 and ppc64).
> 
> If you are referring to the implementation of the vdso call, there are other
> possibilities, but I haven't found any that doesn't introduce overhead in
> context switching.
>
> But if TPIDRURW is definitely a no go, I can work on a patch that does this
> with a thread notifier and the vdso data page. Would that be a viable option?

As pointed out, that won't work for SMP, but perhaps we can come up with
something that does.

> > > +notrace int __vdso_getcpu(unsigned int *cpup, unsigned int *nodep,
> > > +                       struct getcpu_cache *tcache)
> > > +{
> > > +     unsigned long node_and_cpu;
> > > +
> > > +     asm("mrc p15, 0, %0, c13, c0, 2\n" : "=r"(node_and_cpu));
> > > +
> > > +     if (nodep)
> > > +             *nodep = cpu_to_node(node_and_cpu >> 16);
> > > +     if (cpup)
> > > +             *cpup  = node_and_cpu & 0xffffUL;
> >
> > Given this is directly user-accessible, this format is a de-facto ABI,
> > even if it's not documented as such. Is this definitely the format you
> > want long-term?
> 
> Yes, this (the interface) is indeed the important part and therefore I tried
> not to invent anything on my own.  This is the interface used by ppc32,
> ppc64, x86, x86_64. It's also how the getcpu(2) system call is
> documented.

I was referring to the value in TPIDRURW specifically. If an application
started reading that directly (rather than going through the vDSO), we wouldn't
be able to change the register value in future.

That's all moot, given we can't repurpose TPIDRURW.

Thanks,
Mark.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW
       [not found]           ` <CAKdL+dQH=9C2aGf7ys5-vXM7pkdPYUQ8xYWLipwVbABOz09f1g@mail.gmail.com>
@ 2016-10-05 20:44               ` Mark Rutland
  0 siblings, 0 replies; 27+ messages in thread
From: Mark Rutland @ 2016-10-05 20:44 UTC (permalink / raw)
  To: Fredrik Markström
  Cc: Robin Murphy, Kees Cook, Arnd Bergmann, Ard Biesheuvel,
	Linus Walleij, Nicolas Pitre, Will Deacon, Russell King,
	kristina.martsenko, linux-kernel, Masahiro Yamada, Chris Brandt,
	Michal Marek, Zhaoxiu Zeng, linux-arm-kernel, Jonathan Austin

On Wed, Oct 05, 2016 at 08:00:38PM +0000, Fredrik Markström wrote:
> On Wed, Oct 5, 2016 at 7:48 PM Robin Murphy <robin.murphy@arm.com> wrote:
> As far as I understand TPIDRURW isn't anything else than an architecture
> specific piece of tls since the last patch, possibly slightly faster than a
> "__thread u32 x;"
> 
> The irony is that the two different ways it was handled earlier (not context
> switched or always set to zero on swap in) would have made it useful for this
> purpose.

The "not context switched" case was also arbitrarily corrupted, and could not
have been relied upon.

The zeroing case is similar to the restartable sequences design. So that's
probably worth looking into.

Thanks,
Mark.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW
@ 2016-10-05 20:44               ` Mark Rutland
  0 siblings, 0 replies; 27+ messages in thread
From: Mark Rutland @ 2016-10-05 20:44 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, Oct 05, 2016 at 08:00:38PM +0000, Fredrik Markström wrote:
> On Wed, Oct 5, 2016 at 7:48 PM Robin Murphy <robin.murphy@arm.com> wrote:
> As far as I understand TPIDRURW isn't anything else than an architecture
> specific piece of tls since the last patch, possibly slightly faster than a
> "__thread u32 x;"
> 
> The irony is that the two different ways it was handled earlier (not context
> switched or always set to zero on swap in) would have made it useful for this
> purpose.

The "not context switched" case was also arbitrarily corrupted, and could not
have been relied upon.

The zeroing case is similar to the restartable sequences design. So that's
probably worth looking into.

Thanks,
Mark.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW
  2016-10-05 20:44               ` Mark Rutland
@ 2016-10-05 21:01                 ` Russell King - ARM Linux
  -1 siblings, 0 replies; 27+ messages in thread
From: Russell King - ARM Linux @ 2016-10-05 21:01 UTC (permalink / raw)
  To: Mark Rutland
  Cc: Fredrik Markström, Robin Murphy, Kees Cook, Arnd Bergmann,
	Ard Biesheuvel, Linus Walleij, Nicolas Pitre, Will Deacon,
	kristina.martsenko, linux-kernel, Masahiro Yamada, Chris Brandt,
	Michal Marek, Zhaoxiu Zeng, linux-arm-kernel, Jonathan Austin

On Wed, Oct 05, 2016 at 09:44:53PM +0100, Mark Rutland wrote:
> The zeroing case is similar to the restartable sequences design. So that's
> probably worth looking into.

You're sending mixed messages: in your previous message, you said:

  Arguably, someone could have (ab)used TPIDRURW between commits 6a1c531
  and a4780ad to detect context switches, but in practice they don't
  appear to have, and we know of an established user relying on the
  current behaviour.

  For better or worse, the current behaviour is ABI.

Now you're suggesting that we could go back to the case where the
register is zeroed.

Well, the fact is that we _can_ change the TPIDRURW behaviour - we just
need to be careful about how we change it.  Eg, we _could_ introduce a
per-process flag which indicates that we want some other behaviour from
TPIDRURW such as zeroing it on context switches.  The default would be
to preserve the existing behaviour as doing anything else breaks
existing programs.  The problem there is finding an acceptable way to
control such a flag from userspace (eg, prctl, syscall, etc).

-- 
RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line: currently at 9.6Mbps down 400kbps up
according to speedtest.net.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW
@ 2016-10-05 21:01                 ` Russell King - ARM Linux
  0 siblings, 0 replies; 27+ messages in thread
From: Russell King - ARM Linux @ 2016-10-05 21:01 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, Oct 05, 2016 at 09:44:53PM +0100, Mark Rutland wrote:
> The zeroing case is similar to the restartable sequences design. So that's
> probably worth looking into.

You're sending mixed messages: in your previous message, you said:

  Arguably, someone could have (ab)used TPIDRURW between commits 6a1c531
  and a4780ad to detect context switches, but in practice they don't
  appear to have, and we know of an established user relying on the
  current behaviour.

  For better or worse, the current behaviour is ABI.

Now you're suggesting that we could go back to the case where the
register is zeroed.

Well, the fact is that we _can_ change the TPIDRURW behaviour - we just
need to be careful about how we change it.  Eg, we _could_ introduce a
per-process flag which indicates that we want some other behaviour from
TPIDRURW such as zeroing it on context switches.  The default would be
to preserve the existing behaviour as doing anything else breaks
existing programs.  The problem there is finding an acceptable way to
control such a flag from userspace (eg, prctl, syscall, etc).

-- 
RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line: currently at 9.6Mbps down 400kbps up
according to speedtest.net.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW
  2016-10-05 20:44               ` Mark Rutland
@ 2016-10-05 21:37                 ` Fredrik Markström
  -1 siblings, 0 replies; 27+ messages in thread
From: Fredrik Markström @ 2016-10-05 21:37 UTC (permalink / raw)
  To: Mark Rutland
  Cc: Robin Murphy, Kees Cook, Arnd Bergmann, Ard Biesheuvel,
	Linus Walleij, Nicolas Pitre, Will Deacon, Russell King,
	kristina.martsenko, linux-kernel, Masahiro Yamada, Chris Brandt,
	Michal Marek, Zhaoxiu Zeng, linux-arm-kernel, Jonathan Austin

On Wed, Oct 5, 2016 at 10:44 PM, Mark Rutland <mark.rutland@arm.com> wrote:
> On Wed, Oct 05, 2016 at 08:00:38PM +0000, Fredrik Markström wrote:
>> On Wed, Oct 5, 2016 at 7:48 PM Robin Murphy <robin.murphy@arm.com> wrote:
>> As far as I understand TPIDRURW isn't anything else than an architecture
>> specific piece of tls since the last patch, possibly slightly faster than a
>> "__thread u32 x;"
>>
>> The irony is that the two different ways it was handled earlier (not context
>> switched or always set to zero on swap in) would have made it useful for this
>> purpose.
>
> The "not context switched" case was also arbitrarily corrupted, and could not
> have been relied upon.

Ok, I missed that, sorry !

>
> The zeroing case is similar to the restartable sequences design. So that's
> probably worth looking into.

Ok, I'm starting to believe my best bet is to hope that those make it
into the kernel
eventually, until then I'll probably just go with a local solution.

/Fredrik

>
> Thanks,
> Mark.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW
@ 2016-10-05 21:37                 ` Fredrik Markström
  0 siblings, 0 replies; 27+ messages in thread
From: Fredrik Markström @ 2016-10-05 21:37 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, Oct 5, 2016 at 10:44 PM, Mark Rutland <mark.rutland@arm.com> wrote:
> On Wed, Oct 05, 2016 at 08:00:38PM +0000, Fredrik Markström wrote:
>> On Wed, Oct 5, 2016 at 7:48 PM Robin Murphy <robin.murphy@arm.com> wrote:
>> As far as I understand TPIDRURW isn't anything else than an architecture
>> specific piece of tls since the last patch, possibly slightly faster than a
>> "__thread u32 x;"
>>
>> The irony is that the two different ways it was handled earlier (not context
>> switched or always set to zero on swap in) would have made it useful for this
>> purpose.
>
> The "not context switched" case was also arbitrarily corrupted, and could not
> have been relied upon.

Ok, I missed that, sorry !

>
> The zeroing case is similar to the restartable sequences design. So that's
> probably worth looking into.

Ok, I'm starting to believe my best bet is to hope that those make it
into the kernel
eventually, until then I'll probably just go with a local solution.

/Fredrik

>
> Thanks,
> Mark.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW
  2016-10-05 21:01                 ` Russell King - ARM Linux
@ 2016-10-05 21:47                   ` Mark Rutland
  -1 siblings, 0 replies; 27+ messages in thread
From: Mark Rutland @ 2016-10-05 21:47 UTC (permalink / raw)
  To: Russell King - ARM Linux
  Cc: Fredrik Markström, Robin Murphy, Kees Cook, Arnd Bergmann,
	Ard Biesheuvel, Linus Walleij, Nicolas Pitre, Will Deacon,
	kristina.martsenko, linux-kernel, Masahiro Yamada, Chris Brandt,
	Michal Marek, Zhaoxiu Zeng, linux-arm-kernel, Jonathan Austin

On Wed, Oct 05, 2016 at 10:01:38PM +0100, Russell King - ARM Linux wrote:
> On Wed, Oct 05, 2016 at 09:44:53PM +0100, Mark Rutland wrote:
> > The zeroing case is similar to the restartable sequences design. So that's
> > probably worth looking into.
> 
> You're sending mixed messages: in your previous message, you said:
> 
>   Arguably, someone could have (ab)used TPIDRURW between commits 6a1c531
>   and a4780ad to detect context switches, but in practice they don't
>   appear to have, and we know of an established user relying on the
>   current behaviour.
> 
>   For better or worse, the current behaviour is ABI.
> 
> Now you're suggesting that we could go back to the case where the
> register is zeroed.

Sorry; clumsy wording on my behalf.

I meant that functionality-wise, restartable sequences had similar behaviour to
the zeroing case (without touching TPIDRURW at all) and were probably worth
looking at. I did not intend to suggest that we should go back to the case where
TPIDRURW was zeroed.

> Well, the fact is that we _can_ change the TPIDRURW behaviour - we just
> need to be careful about how we change it.  Eg, we _could_ introduce a
> per-process flag which indicates that we want some other behaviour from
> TPIDRURW such as zeroing it on context switches.  The default would be
> to preserve the existing behaviour as doing anything else breaks
> existing programs.  The problem there is finding an acceptable way to
> control such a flag from userspace (eg, prctl, syscall, etc).

Sure. Something like that could work.

Thanks,
Mark.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW
@ 2016-10-05 21:47                   ` Mark Rutland
  0 siblings, 0 replies; 27+ messages in thread
From: Mark Rutland @ 2016-10-05 21:47 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, Oct 05, 2016 at 10:01:38PM +0100, Russell King - ARM Linux wrote:
> On Wed, Oct 05, 2016 at 09:44:53PM +0100, Mark Rutland wrote:
> > The zeroing case is similar to the restartable sequences design. So that's
> > probably worth looking into.
> 
> You're sending mixed messages: in your previous message, you said:
> 
>   Arguably, someone could have (ab)used TPIDRURW between commits 6a1c531
>   and a4780ad to detect context switches, but in practice they don't
>   appear to have, and we know of an established user relying on the
>   current behaviour.
> 
>   For better or worse, the current behaviour is ABI.
> 
> Now you're suggesting that we could go back to the case where the
> register is zeroed.

Sorry; clumsy wording on my behalf.

I meant that functionality-wise, restartable sequences had similar behaviour to
the zeroing case (without touching TPIDRURW at all) and were probably worth
looking at. I did not intend to suggest that we should go back to the case where
TPIDRURW was zeroed.

> Well, the fact is that we _can_ change the TPIDRURW behaviour - we just
> need to be careful about how we change it.  Eg, we _could_ introduce a
> per-process flag which indicates that we want some other behaviour from
> TPIDRURW such as zeroing it on context switches.  The default would be
> to preserve the existing behaviour as doing anything else breaks
> existing programs.  The problem there is finding an acceptable way to
> control such a flag from userspace (eg, prctl, syscall, etc).

Sure. Something like that could work.

Thanks,
Mark.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW
       [not found]             ` <CAKdL+dSt+cBCpwW5q+VCQh+7XeKrnyJgfTsEsuo2nKoUr9ytxw@mail.gmail.com>
@ 2016-10-10 15:29                 ` Will Deacon
  0 siblings, 0 replies; 27+ messages in thread
From: Will Deacon @ 2016-10-10 15:29 UTC (permalink / raw)
  To: Fredrik Markström
  Cc: Russell King - ARM Linux, Robin Murphy, Mark Rutland, Kees Cook,
	Arnd Bergmann, Ard Biesheuvel, Linus Walleij, Nicolas Pitre,
	kristina.martsenko, linux-kernel, Masahiro Yamada, Chris Brandt,
	Michal Marek, Zhaoxiu Zeng, linux-arm-kernel, Jonathan Austin,
	mathieu.desnoyers

Hi Fredrik,

[adding Mathieu -- background is getcpu() in userspace for arm]

On Thu, Oct 06, 2016 at 12:17:07AM +0200, Fredrik Markström wrote:
> On Wed, Oct 5, 2016 at 9:53 PM, Russell King - ARM Linux <linux@armlinux.org.uk
> > wrote:
> > On Wed, Oct 05, 2016 at 06:48:05PM +0100, Robin Murphy wrote:
> >> On 05/10/16 17:39, Fredrik Markström wrote:
> >> > The approach I suggested below with the vDSO data page will obviously
> >> > not work on smp, so suggestions are welcome.
> >>
> >> Well, given that it's user-writeable, is there any reason an application
> >> which cares couldn't simply run some per-cpu threads to call getcpu()
> >> once and cache the result in TPIDRURW themselves? That would appear to
> >> both raise no compatibility issues and work with existing kernels.
> >
> > There is - the contents of TPIDRURW is thread specific, and it moves
> > with the thread between CPU cores.  So, if a thread was running on CPU0
> > when it cached the getcpu() value in TPIDRURW, and then migrated to CPU1,
> > TPIDRURW would still contain 0.
> >
> > I'm also not in favour of changing the TPIDRURW usage to be a storage
> > repository for the CPU number - it's far too specific a usage and seems
> > like a waste of hardware resources to solve one problem. 
> 
> Ok, but right now it's nothing but a (architecture specific) piece of TLS,
> which we have generic mechanisms for. From my point of view that is a waste of
> hardware resources.
> 
> > As Mark says, it's an ABI breaking change too, even if it is under a config
> option.
> 
> I can't argue with that. If it's an ABI it's an ABI, even if I can't imagine
> why anyone would use it over normal tls... but then again, people probably do. 
> 
> So in conclusion I agree and give up.

Rather than give up, you could take a look at the patches from Mathieu
Desnoyers, that tackle this in a very different way. It's also the reason
we've been holding off implementing an optimised getcpu in the arm64 vdso,
because it might well be replaced by the new restartable sequences
approach:

  http://lkml.kernel.org/r/1471637274-13583-1-git-send-email-mathieu.desnoyers@efficios.com

He's also got support for arch/arm/ in that series, so you could take
them for a spin. The main thing missing at the moment is justification
for the feature using real-world code, as requested by Linus:

  http://lkml.kernel.org/r/CA+55aFz+Q33m1+ju3ANaznBwYCcWo9D9WDr2=p0YLEF4gJF12g@mail.gmail.com

so if your per-cpu buffer use-case is compelling in its own right (as
opposed to a micro-benchmark), then you could chime in over there.

Will

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW
@ 2016-10-10 15:29                 ` Will Deacon
  0 siblings, 0 replies; 27+ messages in thread
From: Will Deacon @ 2016-10-10 15:29 UTC (permalink / raw)
  To: linux-arm-kernel

Hi Fredrik,

[adding Mathieu -- background is getcpu() in userspace for arm]

On Thu, Oct 06, 2016 at 12:17:07AM +0200, Fredrik Markström wrote:
> On Wed, Oct 5, 2016 at 9:53 PM, Russell King - ARM Linux <linux@armlinux.org.uk
> > wrote:
> > On Wed, Oct 05, 2016 at 06:48:05PM +0100, Robin Murphy wrote:
> >> On 05/10/16 17:39, Fredrik Markström wrote:
> >> > The approach I suggested below with the vDSO data page will obviously
> >> > not work on smp, so suggestions are welcome.
> >>
> >> Well, given that it's user-writeable, is there any reason an application
> >> which cares couldn't simply run some per-cpu threads to call getcpu()
> >> once and cache the result in TPIDRURW themselves? That would appear to
> >> both raise no compatibility issues and work with existing kernels.
> >
> > There is - the contents of TPIDRURW is thread specific, and it moves
> > with the thread between CPU cores.  So, if a thread was running on CPU0
> > when it cached the getcpu() value in TPIDRURW, and then migrated to CPU1,
> > TPIDRURW would still contain 0.
> >
> > I'm also not in favour of changing the TPIDRURW usage to be a storage
> > repository for the CPU number - it's far too specific a usage and seems
> > like a waste of hardware resources to solve one problem. 
> 
> Ok, but right now it's nothing but a (architecture specific) piece of TLS,
> which we have generic mechanisms for. From my point of view that is a waste of
> hardware resources.
> 
> > As Mark says, it's an ABI breaking change too, even if it is under a config
> option.
> 
> I can't argue with that. If it's an ABI it's an ABI, even if I can't imagine
> why anyone would use it over normal tls... but then again, people probably do. 
> 
> So in conclusion I agree and give up.

Rather than give up, you could take a look at the patches from Mathieu
Desnoyers, that tackle this in a very different way. It's also the reason
we've been holding off implementing an optimised getcpu in the arm64 vdso,
because it might well be replaced by the new restartable sequences
approach:

  http://lkml.kernel.org/r/1471637274-13583-1-git-send-email-mathieu.desnoyers at efficios.com

He's also got support for arch/arm/ in that series, so you could take
them for a spin. The main thing missing at the moment is justification
for the feature using real-world code, as requested by Linus:

  http://lkml.kernel.org/r/CA+55aFz+Q33m1+ju3ANaznBwYCcWo9D9WDr2=p0YLEF4gJF12g at mail.gmail.com

so if your per-cpu buffer use-case is compelling in its own right (as
opposed to a micro-benchmark), then you could chime in over there.

Will

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Restartable Sequences benchmarks (was: Re: [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW)
  2016-10-10 15:29                 ` Will Deacon
@ 2016-10-10 16:15                   ` Mathieu Desnoyers
  -1 siblings, 0 replies; 27+ messages in thread
From: Mathieu Desnoyers @ 2016-10-10 16:15 UTC (permalink / raw)
  To: Will Deacon, Peter Zijlstra, Boqun Feng, Paul E. McKenney,
	Linus Torvalds, Dave Watson, Ben Maurer, Paul Turner,
	Andrew Hunter
  Cc: Fredrik Markström, Russell King - ARM Linux, Robin Murphy,
	Mark Rutland, Kees Cook, Arnd Bergmann, Ard Biesheuvel,
	Linus Walleij, Nicolas Pitre, kristina martsenko, linux-kernel,
	Masahiro Yamada, Chris Brandt, Michal Marek, Zhaoxiu Zeng,
	linux-arm-kernel, Jonathan Austin

----- On Oct 10, 2016, at 5:29 PM, Will Deacon will.deacon@arm.com wrote:

> Hi Fredrik,
> 
> [adding Mathieu -- background is getcpu() in userspace for arm]
> 
> On Thu, Oct 06, 2016 at 12:17:07AM +0200, Fredrik Markström wrote:
>> On Wed, Oct 5, 2016 at 9:53 PM, Russell King - ARM Linux <linux@armlinux.org.uk
>> > wrote:
>> > On Wed, Oct 05, 2016 at 06:48:05PM +0100, Robin Murphy wrote:
>> >> On 05/10/16 17:39, Fredrik Markström wrote:
>> >> > The approach I suggested below with the vDSO data page will obviously
>> >> > not work on smp, so suggestions are welcome.
>> >>
>> >> Well, given that it's user-writeable, is there any reason an application
>> >> which cares couldn't simply run some per-cpu threads to call getcpu()
>> >> once and cache the result in TPIDRURW themselves? That would appear to
>> >> both raise no compatibility issues and work with existing kernels.
>> >
>> > There is - the contents of TPIDRURW is thread specific, and it moves
>> > with the thread between CPU cores.  So, if a thread was running on CPU0
>> > when it cached the getcpu() value in TPIDRURW, and then migrated to CPU1,
>> > TPIDRURW would still contain 0.
>> >
>> > I'm also not in favour of changing the TPIDRURW usage to be a storage
>> > repository for the CPU number - it's far too specific a usage and seems
>> > like a waste of hardware resources to solve one problem.
>> 
>> Ok, but right now it's nothing but a (architecture specific) piece of TLS,
>> which we have generic mechanisms for. From my point of view that is a waste of
>> hardware resources.
>> 
>> > As Mark says, it's an ABI breaking change too, even if it is under a config
>> option.
>> 
>> I can't argue with that. If it's an ABI it's an ABI, even if I can't imagine
>> why anyone would use it over normal tls... but then again, people probably do.
>> 
>> So in conclusion I agree and give up.
> 
> Rather than give up, you could take a look at the patches from Mathieu
> Desnoyers, that tackle this in a very different way. It's also the reason
> we've been holding off implementing an optimised getcpu in the arm64 vdso,
> because it might well be replaced by the new restartable sequences
> approach:
> 
>  http://lkml.kernel.org/r/1471637274-13583-1-git-send-email-mathieu.desnoyers@efficios.com
> 
> He's also got support for arch/arm/ in that series, so you could take
> them for a spin. The main thing missing at the moment is justification
> for the feature using real-world code, as requested by Linus:
> 
>  http://lkml.kernel.org/r/CA+55aFz+Q33m1+ju3ANaznBwYCcWo9D9WDr2=p0YLEF4gJF12g@mail.gmail.com
> 
> so if your per-cpu buffer use-case is compelling in its own right (as
> opposed to a micro-benchmark), then you could chime in over there.
> 
> Will

FYI, I've adapted lttng-ust ring buffer (as a POC) to rseq in a dev
branch. I see interesting speedups. See top 3-4 commits of
https://github.com/compudj/lttng-ust-dev/tree/rseq-integration
(start with "Use rseq for...").

On x86-64, we have a 7ns speedup over sched_getcpu on x86-64, and
30ns speedup by using rseq atomics on x86-64, which brings the cost
per event record down to about 100ns/event. This replaces 3 atomic
operations on the fast path. (37% speedup)

On arm32, the cpu_id acceleration gives a 327 ns/event speed increase,
which brings speed to 2000ns/event. Note that reading time on that
system does not use the vDSO (old glibc), so it implies a system call.
This accounts for 857ns/events. I don't observe speed increase nor
slowdown by using rseq instead of ll/sc atomic operations on that
specific board (Cubietruck, only has 2 cores). I suspect that boards
with more cores will benefit more from replacing ll/sc by rseq atomics.
If we don't account for the overhead of reading time through a system call,
we get a 22% speedup.

I have extra benchmarks in this branch:
https://github.com/compudj/rseq-test

Updated ref for current rseq-enabled kernel based on 4.8:
https://github.com/compudj/linux-percpu-dev/tree/rseq-fallback

(ARM64 port would be welcome!) :)

As Will pointed out, what I'm currently looking for is real-life
benchmarks that show the benefits of rseq. I fear that the microbenchmarks
I have for the lttng-ust tracer may be dismissed as being too specific.
Most heavy users of LTTng-UST are closed source applications, so it's
not easy for me to provide numbers in real-life scenarios.

The major use-case besides per-cpu buffering/tracing AFAIU is memory
allocators. It will mainly benefit in use-cases where there are more
threads than cores in a multithreaded application. This mainly makes
sense if threads are either dedicated to specific tasks, and therefore
are often idle, or in use-cases where worker threads are expected to
block (else, if threads are not expected to block, the application
should simply have one thread per core).

Dave Watson had interesting RSS shrinkage on this stress-test program:
http://locklessinc.com/downloads/t-test1.c modified to have 500 threads.
It uses jemalloc modified to use rseq.

I reproduced it on my laptop with 100 threads, 50000 loops:

4-core, 100 threads, 50000 loops.

malloc:
  PID USER      PR  NI    VIRT    RES    SHR S  %CPU %MEM     TIME+ COMMAND
10136 compudj   20   0 2857840  24756   1348 R 379.4  0.4   3:49.50 t-test1
real    3m20.830s
user    3m22.164s
sys     9m40.936s

upstream jemalloc:
  PID USER      PR  NI    VIRT    RES    SHR S  %CPU %MEM     TIME+ COMMAND
21234 compudj   20   0 2227124  49300   2280 S 306.2  0.7   2:26.97 t-test1
real    1m3.297s
user    3m19.616s
sys     0m8.500s

rseq jemalloc 4.2.1:
  PID USER      PR  NI    VIRT    RES    SHR S  %CPU %MEM     TIME+ COMMAND
25652 compudj   20   0  877956  35624   3260 S 301.2  0.5   1:26.07 t-test1
real    0m27.639s
user    1m18.172s
sys     0m1.752s

The next step to translate this into a "real-life" number would be to
run rseq-jemalloc on a Facebook node, but Dave has been on vacation for
the past few weeks. Perhaps someone else at Facebook or Google could
look into this ?

Cheers,

Mathieu


-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Restartable Sequences benchmarks (was: Re: [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW)
@ 2016-10-10 16:15                   ` Mathieu Desnoyers
  0 siblings, 0 replies; 27+ messages in thread
From: Mathieu Desnoyers @ 2016-10-10 16:15 UTC (permalink / raw)
  To: linux-arm-kernel

----- On Oct 10, 2016, at 5:29 PM, Will Deacon will.deacon at arm.com wrote:

> Hi Fredrik,
> 
> [adding Mathieu -- background is getcpu() in userspace for arm]
> 
> On Thu, Oct 06, 2016 at 12:17:07AM +0200, Fredrik Markström wrote:
>> On Wed, Oct 5, 2016 at 9:53 PM, Russell King - ARM Linux <linux@armlinux.org.uk
>> > wrote:
>> > On Wed, Oct 05, 2016 at 06:48:05PM +0100, Robin Murphy wrote:
>> >> On 05/10/16 17:39, Fredrik Markström wrote:
>> >> > The approach I suggested below with the vDSO data page will obviously
>> >> > not work on smp, so suggestions are welcome.
>> >>
>> >> Well, given that it's user-writeable, is there any reason an application
>> >> which cares couldn't simply run some per-cpu threads to call getcpu()
>> >> once and cache the result in TPIDRURW themselves? That would appear to
>> >> both raise no compatibility issues and work with existing kernels.
>> >
>> > There is - the contents of TPIDRURW is thread specific, and it moves
>> > with the thread between CPU cores.  So, if a thread was running on CPU0
>> > when it cached the getcpu() value in TPIDRURW, and then migrated to CPU1,
>> > TPIDRURW would still contain 0.
>> >
>> > I'm also not in favour of changing the TPIDRURW usage to be a storage
>> > repository for the CPU number - it's far too specific a usage and seems
>> > like a waste of hardware resources to solve one problem.
>> 
>> Ok, but right now it's nothing but a (architecture specific) piece of TLS,
>> which we have generic mechanisms for. From my point of view that is a waste of
>> hardware resources.
>> 
>> > As Mark says, it's an ABI breaking change too, even if it is under a config
>> option.
>> 
>> I can't argue with that. If it's an ABI it's an ABI, even if I can't imagine
>> why anyone would use it over normal tls... but then again, people probably do.
>> 
>> So in conclusion I agree and give up.
> 
> Rather than give up, you could take a look at the patches from Mathieu
> Desnoyers, that tackle this in a very different way. It's also the reason
> we've been holding off implementing an optimised getcpu in the arm64 vdso,
> because it might well be replaced by the new restartable sequences
> approach:
> 
>  http://lkml.kernel.org/r/1471637274-13583-1-git-send-email-mathieu.desnoyers at efficios.com
> 
> He's also got support for arch/arm/ in that series, so you could take
> them for a spin. The main thing missing at the moment is justification
> for the feature using real-world code, as requested by Linus:
> 
>  http://lkml.kernel.org/r/CA+55aFz+Q33m1+ju3ANaznBwYCcWo9D9WDr2=p0YLEF4gJF12g at mail.gmail.com
> 
> so if your per-cpu buffer use-case is compelling in its own right (as
> opposed to a micro-benchmark), then you could chime in over there.
> 
> Will

FYI, I've adapted lttng-ust ring buffer (as a POC) to rseq in a dev
branch. I see interesting speedups. See top 3-4 commits of
https://github.com/compudj/lttng-ust-dev/tree/rseq-integration
(start with "Use rseq for...").

On x86-64, we have a 7ns speedup over sched_getcpu on x86-64, and
30ns speedup by using rseq atomics on x86-64, which brings the cost
per event record down to about 100ns/event. This replaces 3 atomic
operations on the fast path. (37% speedup)

On arm32, the cpu_id acceleration gives a 327 ns/event speed increase,
which brings speed to 2000ns/event. Note that reading time on that
system does not use the vDSO (old glibc), so it implies a system call.
This accounts for 857ns/events. I don't observe speed increase nor
slowdown by using rseq instead of ll/sc atomic operations on that
specific board (Cubietruck, only has 2 cores). I suspect that boards
with more cores will benefit more from replacing ll/sc by rseq atomics.
If we don't account for the overhead of reading time through a system call,
we get a 22% speedup.

I have extra benchmarks in this branch:
https://github.com/compudj/rseq-test

Updated ref for current rseq-enabled kernel based on 4.8:
https://github.com/compudj/linux-percpu-dev/tree/rseq-fallback

(ARM64 port would be welcome!) :)

As Will pointed out, what I'm currently looking for is real-life
benchmarks that show the benefits of rseq. I fear that the microbenchmarks
I have for the lttng-ust tracer may be dismissed as being too specific.
Most heavy users of LTTng-UST are closed source applications, so it's
not easy for me to provide numbers in real-life scenarios.

The major use-case besides per-cpu buffering/tracing AFAIU is memory
allocators. It will mainly benefit in use-cases where there are more
threads than cores in a multithreaded application. This mainly makes
sense if threads are either dedicated to specific tasks, and therefore
are often idle, or in use-cases where worker threads are expected to
block (else, if threads are not expected to block, the application
should simply have one thread per core).

Dave Watson had interesting RSS shrinkage on this stress-test program:
http://locklessinc.com/downloads/t-test1.c modified to have 500 threads.
It uses jemalloc modified to use rseq.

I reproduced it on my laptop with 100 threads, 50000 loops:

4-core, 100 threads, 50000 loops.

malloc:
  PID USER      PR  NI    VIRT    RES    SHR S  %CPU %MEM     TIME+ COMMAND
10136 compudj   20   0 2857840  24756   1348 R 379.4  0.4   3:49.50 t-test1
real    3m20.830s
user    3m22.164s
sys     9m40.936s

upstream jemalloc:
  PID USER      PR  NI    VIRT    RES    SHR S  %CPU %MEM     TIME+ COMMAND
21234 compudj   20   0 2227124  49300   2280 S 306.2  0.7   2:26.97 t-test1
real    1m3.297s
user    3m19.616s
sys     0m8.500s

rseq jemalloc 4.2.1:
  PID USER      PR  NI    VIRT    RES    SHR S  %CPU %MEM     TIME+ COMMAND
25652 compudj   20   0  877956  35624   3260 S 301.2  0.5   1:26.07 t-test1
real    0m27.639s
user    1m18.172s
sys     0m1.752s

The next step to translate this into a "real-life" number would be to
run rseq-jemalloc on a Facebook node, but Dave has been on vacation for
the past few weeks. Perhaps someone else at Facebook or Google could
look into this ?

Cheers,

Mathieu


-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread, other threads:[~2016-10-10 16:15 UTC | newest]

Thread overview: 27+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-10-04 13:49 [PATCH] arm: Added support for getcpu() vDSO using TPIDRURW Fredrik Markstrom
2016-10-04 15:35 ` [PATCH v2] " Fredrik Markstrom
2016-10-04 15:35   ` Fredrik Markstrom
2016-10-04 17:07   ` Mark Rutland
2016-10-04 17:07     ` Mark Rutland
2016-10-05 12:25     ` Fredrik Markström
2016-10-05 12:25       ` Fredrik Markström
2016-10-05 16:39       ` Fredrik Markström
2016-10-05 16:39         ` Fredrik Markström
2016-10-05 17:48         ` Robin Murphy
2016-10-05 17:48           ` Robin Murphy
2016-10-05 19:53           ` Russell King - ARM Linux
2016-10-05 19:53             ` Russell King - ARM Linux
     [not found]             ` <CAKdL+dSt+cBCpwW5q+VCQh+7XeKrnyJgfTsEsuo2nKoUr9ytxw@mail.gmail.com>
2016-10-10 15:29               ` Will Deacon
2016-10-10 15:29                 ` Will Deacon
2016-10-10 16:15                 ` Restartable Sequences benchmarks (was: Re: [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW) Mathieu Desnoyers
2016-10-10 16:15                   ` Mathieu Desnoyers
     [not found]           ` <CAKdL+dQH=9C2aGf7ys5-vXM7pkdPYUQ8xYWLipwVbABOz09f1g@mail.gmail.com>
2016-10-05 20:44             ` [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW Mark Rutland
2016-10-05 20:44               ` Mark Rutland
2016-10-05 21:01               ` Russell King - ARM Linux
2016-10-05 21:01                 ` Russell King - ARM Linux
2016-10-05 21:47                 ` Mark Rutland
2016-10-05 21:47                   ` Mark Rutland
2016-10-05 21:37               ` Fredrik Markström
2016-10-05 21:37                 ` Fredrik Markström
2016-10-05 20:12       ` Mark Rutland
2016-10-05 20:12         ` Mark Rutland

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.