* [PATCH] arm: Added support for getcpu() vDSO using TPIDRURW @ 2016-10-04 13:49 Fredrik Markstrom 2016-10-04 15:35 ` Fredrik Markstrom 0 siblings, 1 reply; 27+ messages in thread From: Fredrik Markstrom @ 2016-10-04 13:49 UTC (permalink / raw) To: linux-arm-kernel This makes getcpu() ~1000 times faster, this is very useful when implementing per-cpu buffers in userspace (to avoid cache line bouncing). As an example lttng ust becomes ~30% faster. The patch will break applications using TPIDRURW (which is context switched since commit 4780adeefd042482f624f5e0d577bf9cdcbb760 ("ARM: 7735/2: Preserve the user r/w register TPIDRURW on context switch and fork")) and is therefore made configurable. Signed-off-by: Fredrik Markstrom <fredrik.markstrom@gmail.com> --- arch/arm/include/asm/tls.h | 8 +++++++- arch/arm/kernel/entry-armv.S | 1 - arch/arm/mm/Kconfig | 10 ++++++++++ arch/arm/vdso/Makefile | 3 +++ arch/arm/vdso/vdso.lds.S | 3 +++ 5 files changed, 23 insertions(+), 2 deletions(-) diff --git a/arch/arm/include/asm/tls.h b/arch/arm/include/asm/tls.h index 5f833f7..170fd76 100644 --- a/arch/arm/include/asm/tls.h +++ b/arch/arm/include/asm/tls.h @@ -10,10 +10,15 @@ .endm .macro switch_tls_v6k, base, tp, tpuser, tmp1, tmp2 +#ifdef CONFIG_VDSO_GETCPU + ldr \tpuser, [r2, #TI_CPU] +#else mrc p15, 0, \tmp2, c13, c0, 2 @ get the user r/w register + ldr \tpuser, [r2, #TI_TP_VALUE + 4] + str \tmp2, [\base, #TI_TP_VALUE + 4] @ save it +#endif mcr p15, 0, \tp, c13, c0, 3 @ set TLS register mcr p15, 0, \tpuser, c13, c0, 2 @ and the user r/w register - str \tmp2, [\base, #TI_TP_VALUE + 4] @ save it .endm .macro switch_tls_v6, base, tp, tpuser, tmp1, tmp2 @@ -22,6 +27,7 @@ mov \tmp2, #0xffff0fff tst \tmp1, #HWCAP_TLS @ hardware TLS available? 
streq \tp, [\tmp2, #-15] @ set TLS value at 0xffff0ff0 + ldrne \tpuser, [r2, #TI_TP_VALUE + 4] @ load the saved user r/w reg mrcne p15, 0, \tmp2, c13, c0, 2 @ get the user r/w register mcrne p15, 0, \tp, c13, c0, 3 @ yes, set TLS register mcrne p15, 0, \tpuser, c13, c0, 2 @ set user r/w register diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index 9f157e7..4e1369a 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -787,7 +787,6 @@ ENTRY(__switch_to) THUMB( str sp, [ip], #4 ) THUMB( str lr, [ip], #4 ) ldr r4, [r2, #TI_TP_VALUE] - ldr r5, [r2, #TI_TP_VALUE + 4] #ifdef CONFIG_CPU_USE_DOMAINS mrc p15, 0, r6, c3, c0, 0 @ Get domain register str r6, [r1, #TI_CPU_DOMAIN] @ Save old domain register diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig index c1799dd..f18334a 100644 --- a/arch/arm/mm/Kconfig +++ b/arch/arm/mm/Kconfig @@ -854,6 +854,16 @@ config VDSO You must have glibc 2.22 or later for programs to seamlessly take advantage of this. +config VDSO_GETCPU + bool "Enable VDSO for getcpu" + depends on VDSO && (CPU_V6K || CPU_V7 || CPU_V7M) + help + Say Y to make getcpu a VDSO (fast) call. This is useful if you + want to implement per cpu buffers to avoid cache line bouncing + in user mode. + This mechanism uses the TPIDRURW register so enabling it will break + applications using this register for its own purpose. 
+ config DMA_CACHE_RWFO bool "Enable read/write for ownership DMA cache maintenance" depends on CPU_V6K && SMP diff --git a/arch/arm/vdso/Makefile b/arch/arm/vdso/Makefile index 59a8fa7..9f1ec51 100644 --- a/arch/arm/vdso/Makefile +++ b/arch/arm/vdso/Makefile @@ -1,6 +1,9 @@ hostprogs-y := vdsomunge obj-vdso := vgettimeofday.o datapage.o +#ifeq ($(CONFIG_VDSO_GETCPU),y) +obj-vdso += vgetcpu.o +#endif # Build rules targets := $(obj-vdso) vdso.so vdso.so.dbg vdso.so.raw vdso.lds diff --git a/arch/arm/vdso/vdso.lds.S b/arch/arm/vdso/vdso.lds.S index 89ca89f..1af39fb 100644 --- a/arch/arm/vdso/vdso.lds.S +++ b/arch/arm/vdso/vdso.lds.S @@ -82,6 +82,9 @@ VERSION global: __vdso_clock_gettime; __vdso_gettimeofday; +#ifdef CONFIG_VDSO_GETCPU + __vdso_getcpu; +#endif local: *; }; } -- 2.7.2 ^ permalink raw reply related [flat|nested] 27+ messages in thread
* [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW 2016-10-04 13:49 [PATCH] arm: Added support for getcpu() vDSO using TPIDRURW Fredrik Markstrom @ 2016-10-04 15:35 ` Fredrik Markstrom 0 siblings, 0 replies; 27+ messages in thread From: Fredrik Markstrom @ 2016-10-04 15:35 UTC (permalink / raw) To: linux-arm-kernel Cc: Fredrik Markstrom, Russell King, Will Deacon, Chris Brandt, Nicolas Pitre, Ard Biesheuvel, Arnd Bergmann, Linus Walleij, Masahiro Yamada, Kees Cook, Jonathan Austin, Zhaoxiu Zeng, Mark Rutland, Michal Marek, linux-kernel This makes getcpu() ~1000 times faster, this is very useful when implementing per-cpu buffers in userspace (to avoid cache line bouncing). As an example lttng ust becomes ~30% faster. The patch will break applications using TPIDRURW (which is context switched since commit 4780adeefd042482f624f5e0d577bf9cdcbb760 ("ARM: 7735/2: Preserve the user r/w register TPIDRURW on context switch and fork")) and is therefore made configurable. Signed-off-by: Fredrik Markstrom <fredrik.markstrom@gmail.com> --- arch/arm/include/asm/tls.h | 8 +++++++- arch/arm/kernel/entry-armv.S | 1 - arch/arm/mm/Kconfig | 10 ++++++++++ arch/arm/vdso/Makefile | 3 +++ arch/arm/vdso/vdso.lds.S | 3 +++ arch/arm/vdso/vgetcpu.c | 34 ++++++++++++++++++++++++++++++++++ 6 files changed, 57 insertions(+), 2 deletions(-) create mode 100644 arch/arm/vdso/vgetcpu.c diff --git a/arch/arm/include/asm/tls.h b/arch/arm/include/asm/tls.h index 5f833f7..170fd76 100644 --- a/arch/arm/include/asm/tls.h +++ b/arch/arm/include/asm/tls.h @@ -10,10 +10,15 @@ .endm .macro switch_tls_v6k, base, tp, tpuser, tmp1, tmp2 +#ifdef CONFIG_VDSO_GETCPU + ldr \tpuser, [r2, #TI_CPU] +#else mrc p15, 0, \tmp2, c13, c0, 2 @ get the user r/w register + ldr \tpuser, [r2, #TI_TP_VALUE + 4] + str \tmp2, [\base, #TI_TP_VALUE + 4] @ save it +#endif mcr p15, 0, \tp, c13, c0, 3 @ set TLS register mcr p15, 0, \tpuser, c13, c0, 2 @ and the user r/w register - str \tmp2, [\base, #TI_TP_VALUE + 4] @ 
save it .endm .macro switch_tls_v6, base, tp, tpuser, tmp1, tmp2 @@ -22,6 +27,7 @@ mov \tmp2, #0xffff0fff tst \tmp1, #HWCAP_TLS @ hardware TLS available? streq \tp, [\tmp2, #-15] @ set TLS value at 0xffff0ff0 + ldrne \tpuser, [r2, #TI_TP_VALUE + 4] @ load the saved user r/w reg mrcne p15, 0, \tmp2, c13, c0, 2 @ get the user r/w register mcrne p15, 0, \tp, c13, c0, 3 @ yes, set TLS register mcrne p15, 0, \tpuser, c13, c0, 2 @ set user r/w register diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index 9f157e7..4e1369a 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -787,7 +787,6 @@ ENTRY(__switch_to) THUMB( str sp, [ip], #4 ) THUMB( str lr, [ip], #4 ) ldr r4, [r2, #TI_TP_VALUE] - ldr r5, [r2, #TI_TP_VALUE + 4] #ifdef CONFIG_CPU_USE_DOMAINS mrc p15, 0, r6, c3, c0, 0 @ Get domain register str r6, [r1, #TI_CPU_DOMAIN] @ Save old domain register diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig index c1799dd..f18334a 100644 --- a/arch/arm/mm/Kconfig +++ b/arch/arm/mm/Kconfig @@ -854,6 +854,16 @@ config VDSO You must have glibc 2.22 or later for programs to seamlessly take advantage of this. +config VDSO_GETCPU + bool "Enable VDSO for getcpu" + depends on VDSO && (CPU_V6K || CPU_V7 || CPU_V7M) + help + Say Y to make getcpu a VDSO (fast) call. This is useful if you + want to implement per cpu buffers to avoid cache line bouncing + in user mode. + This mechanism uses the TPIDRURW register so enabling it will break + applications using this register for its own purpose. 
+ config DMA_CACHE_RWFO bool "Enable read/write for ownership DMA cache maintenance" depends on CPU_V6K && SMP diff --git a/arch/arm/vdso/Makefile b/arch/arm/vdso/Makefile index 59a8fa7..9f1ec51 100644 --- a/arch/arm/vdso/Makefile +++ b/arch/arm/vdso/Makefile @@ -1,6 +1,9 @@ hostprogs-y := vdsomunge obj-vdso := vgettimeofday.o datapage.o +#ifeq ($(CONFIG_VDSO_GETCPU),y) +obj-vdso += vgetcpu.o +#endif # Build rules targets := $(obj-vdso) vdso.so vdso.so.dbg vdso.so.raw vdso.lds diff --git a/arch/arm/vdso/vdso.lds.S b/arch/arm/vdso/vdso.lds.S index 89ca89f..1af39fb 100644 --- a/arch/arm/vdso/vdso.lds.S +++ b/arch/arm/vdso/vdso.lds.S @@ -82,6 +82,9 @@ VERSION global: __vdso_clock_gettime; __vdso_gettimeofday; +#ifdef CONFIG_VDSO_GETCPU + __vdso_getcpu; +#endif local: *; }; } diff --git a/arch/arm/vdso/vgetcpu.c b/arch/arm/vdso/vgetcpu.c new file mode 100644 index 0000000..1b710af --- /dev/null +++ b/arch/arm/vdso/vgetcpu.c @@ -0,0 +1,34 @@ +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 of the + * License. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ +#include <linux/compiler.h> +#include <asm/topology.h> + +struct getcpu_cache; + +notrace int __vdso_getcpu(unsigned int *cpup, unsigned int *nodep, + struct getcpu_cache *tcache) +{ + unsigned long node_and_cpu; + + asm("mrc p15, 0, %0, c13, c0, 2\n" : "=r"(node_and_cpu)); + + if (nodep) + *nodep = cpu_to_node(node_and_cpu >> 16); + if (cpup) + *cpup = node_and_cpu & 0xffffUL; + + return 0; +} + -- 2.7.2 ^ permalink raw reply related [flat|nested] 27+ messages in thread
* [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW @ 2016-10-04 15:35 ` Fredrik Markstrom 0 siblings, 0 replies; 27+ messages in thread From: Fredrik Markstrom @ 2016-10-04 15:35 UTC (permalink / raw) To: linux-arm-kernel This makes getcpu() ~1000 times faster, this is very useful when implementing per-cpu buffers in userspace (to avoid cache line bouncing). As an example lttng ust becomes ~30% faster. The patch will break applications using TPIDRURW (which is context switched since commit 4780adeefd042482f624f5e0d577bf9cdcbb760 ("ARM: 7735/2: Preserve the user r/w register TPIDRURW on context switch and fork")) and is therefore made configurable. Signed-off-by: Fredrik Markstrom <fredrik.markstrom@gmail.com> --- arch/arm/include/asm/tls.h | 8 +++++++- arch/arm/kernel/entry-armv.S | 1 - arch/arm/mm/Kconfig | 10 ++++++++++ arch/arm/vdso/Makefile | 3 +++ arch/arm/vdso/vdso.lds.S | 3 +++ arch/arm/vdso/vgetcpu.c | 34 ++++++++++++++++++++++++++++++++++ 6 files changed, 57 insertions(+), 2 deletions(-) create mode 100644 arch/arm/vdso/vgetcpu.c diff --git a/arch/arm/include/asm/tls.h b/arch/arm/include/asm/tls.h index 5f833f7..170fd76 100644 --- a/arch/arm/include/asm/tls.h +++ b/arch/arm/include/asm/tls.h @@ -10,10 +10,15 @@ .endm .macro switch_tls_v6k, base, tp, tpuser, tmp1, tmp2 +#ifdef CONFIG_VDSO_GETCPU + ldr \tpuser, [r2, #TI_CPU] +#else mrc p15, 0, \tmp2, c13, c0, 2 @ get the user r/w register + ldr \tpuser, [r2, #TI_TP_VALUE + 4] + str \tmp2, [\base, #TI_TP_VALUE + 4] @ save it +#endif mcr p15, 0, \tp, c13, c0, 3 @ set TLS register mcr p15, 0, \tpuser, c13, c0, 2 @ and the user r/w register - str \tmp2, [\base, #TI_TP_VALUE + 4] @ save it .endm .macro switch_tls_v6, base, tp, tpuser, tmp1, tmp2 @@ -22,6 +27,7 @@ mov \tmp2, #0xffff0fff tst \tmp1, #HWCAP_TLS @ hardware TLS available? 
streq \tp, [\tmp2, #-15] @ set TLS value at 0xffff0ff0 + ldrne \tpuser, [r2, #TI_TP_VALUE + 4] @ load the saved user r/w reg mrcne p15, 0, \tmp2, c13, c0, 2 @ get the user r/w register mcrne p15, 0, \tp, c13, c0, 3 @ yes, set TLS register mcrne p15, 0, \tpuser, c13, c0, 2 @ set user r/w register diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index 9f157e7..4e1369a 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -787,7 +787,6 @@ ENTRY(__switch_to) THUMB( str sp, [ip], #4 ) THUMB( str lr, [ip], #4 ) ldr r4, [r2, #TI_TP_VALUE] - ldr r5, [r2, #TI_TP_VALUE + 4] #ifdef CONFIG_CPU_USE_DOMAINS mrc p15, 0, r6, c3, c0, 0 @ Get domain register str r6, [r1, #TI_CPU_DOMAIN] @ Save old domain register diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig index c1799dd..f18334a 100644 --- a/arch/arm/mm/Kconfig +++ b/arch/arm/mm/Kconfig @@ -854,6 +854,16 @@ config VDSO You must have glibc 2.22 or later for programs to seamlessly take advantage of this. +config VDSO_GETCPU + bool "Enable VDSO for getcpu" + depends on VDSO && (CPU_V6K || CPU_V7 || CPU_V7M) + help + Say Y to make getcpu a VDSO (fast) call. This is useful if you + want to implement per cpu buffers to avoid cache line bouncing + in user mode. + This mechanism uses the TPIDRURW register so enabling it will break + applications using this register for its own purpose. 
+ config DMA_CACHE_RWFO bool "Enable read/write for ownership DMA cache maintenance" depends on CPU_V6K && SMP diff --git a/arch/arm/vdso/Makefile b/arch/arm/vdso/Makefile index 59a8fa7..9f1ec51 100644 --- a/arch/arm/vdso/Makefile +++ b/arch/arm/vdso/Makefile @@ -1,6 +1,9 @@ hostprogs-y := vdsomunge obj-vdso := vgettimeofday.o datapage.o +#ifeq ($(CONFIG_VDSO_GETCPU),y) +obj-vdso += vgetcpu.o +#endif # Build rules targets := $(obj-vdso) vdso.so vdso.so.dbg vdso.so.raw vdso.lds diff --git a/arch/arm/vdso/vdso.lds.S b/arch/arm/vdso/vdso.lds.S index 89ca89f..1af39fb 100644 --- a/arch/arm/vdso/vdso.lds.S +++ b/arch/arm/vdso/vdso.lds.S @@ -82,6 +82,9 @@ VERSION global: __vdso_clock_gettime; __vdso_gettimeofday; +#ifdef CONFIG_VDSO_GETCPU + __vdso_getcpu; +#endif local: *; }; } diff --git a/arch/arm/vdso/vgetcpu.c b/arch/arm/vdso/vgetcpu.c new file mode 100644 index 0000000..1b710af --- /dev/null +++ b/arch/arm/vdso/vgetcpu.c @@ -0,0 +1,34 @@ +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 of the + * License. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ +#include <linux/compiler.h> +#include <asm/topology.h> + +struct getcpu_cache; + +notrace int __vdso_getcpu(unsigned int *cpup, unsigned int *nodep, + struct getcpu_cache *tcache) +{ + unsigned long node_and_cpu; + + asm("mrc p15, 0, %0, c13, c0, 2\n" : "=r"(node_and_cpu)); + + if (nodep) + *nodep = cpu_to_node(node_and_cpu >> 16); + if (cpup) + *cpup = node_and_cpu & 0xffffUL; + + return 0; +} + -- 2.7.2 ^ permalink raw reply related [flat|nested] 27+ messages in thread
* Re: [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW 2016-10-04 15:35 ` Fredrik Markstrom @ 2016-10-04 17:07 ` Mark Rutland -1 siblings, 0 replies; 27+ messages in thread From: Mark Rutland @ 2016-10-04 17:07 UTC (permalink / raw) To: Fredrik Markstrom Cc: linux-arm-kernel, Russell King, Will Deacon, Chris Brandt, Nicolas Pitre, Ard Biesheuvel, Arnd Bergmann, Linus Walleij, Masahiro Yamada, Kees Cook, Jonathan Austin, Zhaoxiu Zeng, Michal Marek, linux-kernel, kristina.martsenko On Tue, Oct 04, 2016 at 05:35:33PM +0200, Fredrik Markstrom wrote: > This makes getcpu() ~1000 times faster, this is very useful when > implementing per-cpu buffers in userspace (to avoid cache line > bouncing). As an example lttng ust becomes ~30% faster. > > The patch will break applications using TPIDRURW (which is context switched > since commit 4780adeefd042482f624f5e0d577bf9cdcbb760 ("ARM: 7735/2: It looks like you dropped the leading 'a' from the commit ID. For everyone else's benefit, the full ID is: a4780adeefd042482f624f5e0d577bf9cdcbb760 Please note that arm64 has done similar for compat tasks since commit: d00a3810c16207d2 ("arm64: context-switch user tls register tpidr_el0 for compat tasks") > Preserve the user r/w register TPIDRURW on context switch and fork")) and > is therefore made configurable. As you note above, this is an ABI break and *will* break some existing applications. That's generally a no-go. This also leaves arm64's compat with the existing behaviour, differing from arm. I was under the impression that other mechanisms were being considered for fast userspace access to per-cpu data structures, e.g. restartable sequences. What is the state of those? Why is this better? If getcpu() specifically is necessary, is there no other way to implement it? 
> +notrace int __vdso_getcpu(unsigned int *cpup, unsigned int *nodep, > + struct getcpu_cache *tcache) > +{ > + unsigned long node_and_cpu; > + > + asm("mrc p15, 0, %0, c13, c0, 2\n" : "=r"(node_and_cpu)); > + > + if (nodep) > + *nodep = cpu_to_node(node_and_cpu >> 16); > + if (cpup) > + *cpup = node_and_cpu & 0xffffUL; Given this is directly user-accessible, this format is a de-facto ABI, even if it's not documented as such. Is this definitely the format you want long-term? Thanks, Mark. ^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW @ 2016-10-04 17:07 ` Mark Rutland 0 siblings, 0 replies; 27+ messages in thread From: Mark Rutland @ 2016-10-04 17:07 UTC (permalink / raw) To: linux-arm-kernel On Tue, Oct 04, 2016 at 05:35:33PM +0200, Fredrik Markstrom wrote: > This makes getcpu() ~1000 times faster, this is very useful when > implementing per-cpu buffers in userspace (to avoid cache line > bouncing). As an example lttng ust becomes ~30% faster. > > The patch will break applications using TPIDRURW (which is context switched > since commit 4780adeefd042482f624f5e0d577bf9cdcbb760 ("ARM: 7735/2: It looks like you dropped the leading 'a' from the commit ID. For everyone else's benefit, the full ID is: a4780adeefd042482f624f5e0d577bf9cdcbb760 Please note that arm64 has done similar for compat tasks since commit: d00a3810c16207d2 ("arm64: context-switch user tls register tpidr_el0 for compat tasks") > Preserve the user r/w register TPIDRURW on context switch and fork")) and > is therefore made configurable. As you note above, this is an ABI break and *will* break some existing applications. That's generally a no-go. This also leaves arm64's compat with the existing behaviour, differing from arm. I was under the impression that other mechanisms were being considered for fast userspace access to per-cpu data structures, e.g. restartable sequences. What is the state of those? Why is this better? If getcpu() specifically is necessary, is there no other way to implement it? > +notrace int __vdso_getcpu(unsigned int *cpup, unsigned int *nodep, > + struct getcpu_cache *tcache) > +{ > + unsigned long node_and_cpu; > + > + asm("mrc p15, 0, %0, c13, c0, 2\n" : "=r"(node_and_cpu)); > + > + if (nodep) > + *nodep = cpu_to_node(node_and_cpu >> 16); > + if (cpup) > + *cpup = node_and_cpu & 0xffffUL; Given this is directly user-accessible, this format is a de-facto ABI, even if it's not documented as such. 
Is this definitely the format you want long-term? Thanks, Mark. ^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW 2016-10-04 17:07 ` Mark Rutland @ 2016-10-05 12:25 ` Fredrik Markström -1 siblings, 0 replies; 27+ messages in thread From: Fredrik Markström @ 2016-10-05 12:25 UTC (permalink / raw) To: Mark Rutland Cc: linux-arm-kernel, Russell King, Will Deacon, Chris Brandt, Nicolas Pitre, Ard Biesheuvel, Arnd Bergmann, Linus Walleij, Masahiro Yamada, Kees Cook, Jonathan Austin, Zhaoxiu Zeng, Michal Marek, linux-kernel, kristina.martsenko On Tue, Oct 4, 2016 at 7:08 PM Mark Rutland <mark.rutland@arm.com> wrote: > > On Tue, Oct 04, 2016 at 05:35:33PM +0200, Fredrik Markstrom wrote: > > This makes getcpu() ~1000 times faster, this is very useful when > > implementing per-cpu buffers in userspace (to avoid cache line > > bouncing). As an example lttng ust becomes ~30% faster. > > > > The patch will break applications using TPIDRURW (which is context switched > > since commit 4780adeefd042482f624f5e0d577bf9cdcbb760 ("ARM: 7735/2: > > It looks like you dropped the leading 'a' from the commit ID. For > everyone else's benefit, the full ID is: > > a4780adeefd042482f624f5e0d577bf9cdcbb760 Sorry for that and thanks for fixing it. > > > Please note that arm64 has done similar for compat tasks since commit: > > d00a3810c16207d2 ("arm64: context-switch user tls register tpidr_el0 for > compat tasks") > > > Preserve the user r/w register TPIDRURW on context switch and fork")) and > > is therefore made configurable. > > As you note above, this is an ABI break and *will* break some existing > applications. That's generally a no-go. Ok, I wasn't sure this was considered an ABI (but I'm not entirely surprised ;) ). The way I was trying to defend the breakage was by reasoning that that if it was an ABI we broke it both with a4780ad and with 6a1c531, and since we don't break ABI:s, it can't be one. But hey, I'm humble here and ready to back off. 
> > This also leaves arm64's compat with the existing behaviour, differing > from arm. > > I was under the impression that other mechanisms were being considered > for fast userspace access to per-cpu data structures, e.g. restartable > sequences. What is the state of those? Why is this better? > > If getcpu() specifically is necessary, is there no other way to > implement it? If you are referring to the user space stuff can probably be implemented other ways, it's just convenient since the interface is there and it will speed up stuff like lttng without modifications (well, except glibc). It's also already implemented as a vDSO on other major architectures (like x86, x86_64, ppc32 and ppc64). If you are referring to the implementation of the vdso call, there are other possibilities, but I haven't found any that doesn't introduce overhead in context switching. But if TPIDRURW is definitely a no go, I can work on a patch that does this with a thread notifier and the vdso data page. Would that be a viable option ? > > > +notrace int __vdso_getcpu(unsigned int *cpup, unsigned int *nodep, > > + struct getcpu_cache *tcache) > > +{ > > + unsigned long node_and_cpu; > > + > > + asm("mrc p15, 0, %0, c13, c0, 2\n" : "=r"(node_and_cpu)); > > + > > + if (nodep) > > + *nodep = cpu_to_node(node_and_cpu >> 16); > > + if (cpup) > > + *cpup = node_and_cpu & 0xffffUL; > > Given this is directly user-accessible, this format is a de-facto ABI, > even if it's not documented as such. Is this definitely the format you > want long-term? Yes, this (the interface) is indeed the important part and therefore I tried not to invent anything on my own. This is the interface used by ppc32, ppc64, x86, x86_64. It's also this is how the getcpu(2) system call is documented. /Fredrik > > > Thanks, > Mark. ^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW @ 2016-10-05 12:25 ` Fredrik Markström 0 siblings, 0 replies; 27+ messages in thread From: Fredrik Markström @ 2016-10-05 12:25 UTC (permalink / raw) To: linux-arm-kernel On Tue, Oct 4, 2016 at 7:08 PM Mark Rutland <mark.rutland@arm.com> wrote: > > On Tue, Oct 04, 2016 at 05:35:33PM +0200, Fredrik Markstrom wrote: > > This makes getcpu() ~1000 times faster, this is very useful when > > implementing per-cpu buffers in userspace (to avoid cache line > > bouncing). As an example lttng ust becomes ~30% faster. > > > > The patch will break applications using TPIDRURW (which is context switched > > since commit 4780adeefd042482f624f5e0d577bf9cdcbb760 ("ARM: 7735/2: > > It looks like you dropped the leading 'a' from the commit ID. For > everyone else's benefit, the full ID is: > > a4780adeefd042482f624f5e0d577bf9cdcbb760 Sorry for that and thanks for fixing it. > > > Please note that arm64 has done similar for compat tasks since commit: > > d00a3810c16207d2 ("arm64: context-switch user tls register tpidr_el0 for > compat tasks") > > > Preserve the user r/w register TPIDRURW on context switch and fork")) and > > is therefore made configurable. > > As you note above, this is an ABI break and *will* break some existing > applications. That's generally a no-go. Ok, I wasn't sure this was considered an ABI (but I'm not entirely surprised ;) ). The way I was trying to defend the breakage was by reasoning that that if it was an ABI we broke it both with a4780ad and with 6a1c531, and since we don't break ABI:s, it can't be one. But hey, I'm humble here and ready to back off. > > This also leaves arm64's compat with the existing behaviour, differing > from arm. > > I was under the impression that other mechanisms were being considered > for fast userspace access to per-cpu data structures, e.g. restartable > sequences. What is the state of those? Why is this better? 
> > If getcpu() specifically is necessary, is there no other way to > implement it? If you are referring to the user space stuff can probably be implemented other ways, it's just convenient since the interface is there and it will speed up stuff like lttng without modifications (well, except glibc). It's also already implemented as a vDSO on other major architectures (like x86, x86_64, ppc32 and ppc64). If you are referring to the implementation of the vdso call, there are other possibilities, but I haven't found any that doesn't introduce overhead in context switching. But if TPIDRURW is definitely a no go, I can work on a patch that does this with a thread notifier and the vdso data page. Would that be a viable option ? > > > +notrace int __vdso_getcpu(unsigned int *cpup, unsigned int *nodep, > > + struct getcpu_cache *tcache) > > +{ > > + unsigned long node_and_cpu; > > + > > + asm("mrc p15, 0, %0, c13, c0, 2\n" : "=r"(node_and_cpu)); > > + > > + if (nodep) > > + *nodep = cpu_to_node(node_and_cpu >> 16); > > + if (cpup) > > + *cpup = node_and_cpu & 0xffffUL; > > Given this is directly user-accessible, this format is a de-facto ABI, > even if it's not documented as such. Is this definitely the format you > want long-term? Yes, this (the interface) is indeed the important part and therefore I tried not to invent anything on my own. This is the interface used by ppc32, ppc64, x86, x86_64. It's also this is how the getcpu(2) system call is documented. /Fredrik > > > Thanks, > Mark. ^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW 2016-10-05 12:25 ` Fredrik Markström @ 2016-10-05 16:39 ` Fredrik Markström -1 siblings, 0 replies; 27+ messages in thread From: Fredrik Markström @ 2016-10-05 16:39 UTC (permalink / raw) To: Mark Rutland Cc: linux-arm-kernel, Russell King, Will Deacon, Chris Brandt, Nicolas Pitre, Ard Biesheuvel, Arnd Bergmann, Linus Walleij, Masahiro Yamada, Kees Cook, Jonathan Austin, Zhaoxiu Zeng, Michal Marek, linux-kernel, kristina.martsenko The approach I suggested below with the vDSO data page will obviously not work on smp, so suggestions are welcome. /Fredrik On Wed, Oct 5, 2016 at 2:25 PM, Fredrik Markström <fredrik.markstrom@gmail.com> wrote: > On Tue, Oct 4, 2016 at 7:08 PM Mark Rutland <mark.rutland@arm.com> wrote: >> >> On Tue, Oct 04, 2016 at 05:35:33PM +0200, Fredrik Markstrom wrote: >> > This makes getcpu() ~1000 times faster, this is very useful when >> > implementing per-cpu buffers in userspace (to avoid cache line >> > bouncing). As an example lttng ust becomes ~30% faster. >> > >> > The patch will break applications using TPIDRURW (which is context switched >> > since commit 4780adeefd042482f624f5e0d577bf9cdcbb760 ("ARM: 7735/2: >> >> It looks like you dropped the leading 'a' from the commit ID. For >> everyone else's benefit, the full ID is: >> >> a4780adeefd042482f624f5e0d577bf9cdcbb760 > > > Sorry for that and thanks for fixing it. > >> >> >> Please note that arm64 has done similar for compat tasks since commit: >> >> d00a3810c16207d2 ("arm64: context-switch user tls register tpidr_el0 for >> compat tasks") >> >> > Preserve the user r/w register TPIDRURW on context switch and fork")) and >> > is therefore made configurable. >> >> As you note above, this is an ABI break and *will* break some existing >> applications. That's generally a no-go. > > > Ok, I wasn't sure this was considered an ABI (but I'm not entirely > surprised ;) ). 
The way I was > trying to defend the breakage was by reasoning that that if it was an > ABI we broke it both with a4780ad > and with 6a1c531, and since we don't break ABI:s, it can't be one. > > But hey, I'm humble here and ready to back off. > >> >> This also leaves arm64's compat with the existing behaviour, differing >> from arm. >> >> I was under the impression that other mechanisms were being considered >> for fast userspace access to per-cpu data structures, e.g. restartable >> sequences. What is the state of those? Why is this better? >> >> If getcpu() specifically is necessary, is there no other way to >> implement it? > > If you are referring to the user space stuff can probably be > implemented other ways, > it's just convenient since the interface is there and it will speed up > stuff like lttng without > modifications (well, except glibc). It's also already implemented as a > vDSO on other > major architectures (like x86, x86_64, ppc32 and ppc64). > > If you are referring to the implementation of the vdso call, there are > other possibilities, but > I haven't found any that doesn't introduce overhead in context switching. > > But if TPIDRURW is definitely a no go, I can work on a patch that does > this with a thread notifier > and the vdso data page. Would that be a viable option ? > >> >> > +notrace int __vdso_getcpu(unsigned int *cpup, unsigned int *nodep, >> > + struct getcpu_cache *tcache) >> > +{ >> > + unsigned long node_and_cpu; >> > + >> > + asm("mrc p15, 0, %0, c13, c0, 2\n" : "=r"(node_and_cpu)); >> > + >> > + if (nodep) >> > + *nodep = cpu_to_node(node_and_cpu >> 16); >> > + if (cpup) >> > + *cpup = node_and_cpu & 0xffffUL; >> >> Given this is directly user-accessible, this format is a de-facto ABI, >> even if it's not documented as such. Is this definitely the format you >> want long-term? > > Yes, this (the interface) is indeed the important part and therefore I > tried not to invent anything > on my own. 
> This is the interface used by ppc32, ppc64, x86, x86_64. It's also > how the getcpu(2) system call is documented. > > /Fredrik > > >> >> >> Thanks, >> Mark. -- /Fredrik ^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW @ 2016-10-05 16:39 ` Fredrik Markström 0 siblings, 0 replies; 27+ messages in thread From: Fredrik Markström @ 2016-10-05 16:39 UTC (permalink / raw) To: linux-arm-kernel The approach I suggested below with the vDSO data page will obviously not work on smp, so suggestions are welcome. /Fredrik On Wed, Oct 5, 2016 at 2:25 PM, Fredrik Markstr?m <fredrik.markstrom@gmail.com> wrote: > On Tue, Oct 4, 2016 at 7:08 PM Mark Rutland <mark.rutland@arm.com> wrote: >> >> On Tue, Oct 04, 2016 at 05:35:33PM +0200, Fredrik Markstrom wrote: >> > This makes getcpu() ~1000 times faster, this is very useful when >> > implementing per-cpu buffers in userspace (to avoid cache line >> > bouncing). As an example lttng ust becomes ~30% faster. >> > >> > The patch will break applications using TPIDRURW (which is context switched >> > since commit 4780adeefd042482f624f5e0d577bf9cdcbb760 ("ARM: 7735/2: >> >> It looks like you dropped the leading 'a' from the commit ID. For >> everyone else's benefit, the full ID is: >> >> a4780adeefd042482f624f5e0d577bf9cdcbb760 > > > Sorry for that and thanks for fixing it. > >> >> >> Please note that arm64 has done similar for compat tasks since commit: >> >> d00a3810c16207d2 ("arm64: context-switch user tls register tpidr_el0 for >> compat tasks") >> >> > Preserve the user r/w register TPIDRURW on context switch and fork")) and >> > is therefore made configurable. >> >> As you note above, this is an ABI break and *will* break some existing >> applications. That's generally a no-go. > > > Ok, I wasn't sure this was considered an ABI (but I'm not entirely > surprised ;) ). The way I was > trying to defend the breakage was by reasoning that that if it was an > ABI we broke it both with a4780ad > and with 6a1c531, and since we don't break ABI:s, it can't be one. > > But hey, I'm humble here and ready to back off. 
> >> >> This also leaves arm64's compat with the existing behaviour, differing >> from arm. >> >> I was under the impression that other mechanisms were being considered >> for fast userspace access to per-cpu data structures, e.g. restartable >> sequences. What is the state of those? Why is this better? >> >> If getcpu() specifically is necessary, is there no other way to >> implement it? > > If you are referring to the user space stuff can probably be > implemented other ways, > it's just convenient since the interface is there and it will speed up > stuff like lttng without > modifications (well, except glibc). It's also already implemented as a > vDSO on other > major architectures (like x86, x86_64, ppc32 and ppc64). > > If you are referring to the implementation of the vdso call, there are > other possibilities, but > I haven't found any that doesn't introduce overhead in context switching. > > But if TPIDRURW is definitely a no go, I can work on a patch that does > this with a thread notifier > and the vdso data page. Would that be a viable option ? > >> >> > +notrace int __vdso_getcpu(unsigned int *cpup, unsigned int *nodep, >> > + struct getcpu_cache *tcache) >> > +{ >> > + unsigned long node_and_cpu; >> > + >> > + asm("mrc p15, 0, %0, c13, c0, 2\n" : "=r"(node_and_cpu)); >> > + >> > + if (nodep) >> > + *nodep = cpu_to_node(node_and_cpu >> 16); >> > + if (cpup) >> > + *cpup = node_and_cpu & 0xffffUL; >> >> Given this is directly user-accessible, this format is a de-facto ABI, >> even if it's not documented as such. Is this definitely the format you >> want long-term? > > Yes, this (the interface) is indeed the important part and therefore I > tried not to invent anything > on my own. > This is the interface used by ppc32, ppc64, x86, x86_64. It's also this is > how the getcpu(2) system call is documented. > > /Fredrik > > >> >> >> Thanks, >> Mark. -- /Fredrik ^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW 2016-10-05 16:39 ` Fredrik Markström @ 2016-10-05 17:48 ` Robin Murphy -1 siblings, 0 replies; 27+ messages in thread From: Robin Murphy @ 2016-10-05 17:48 UTC (permalink / raw) To: Fredrik Markström, Mark Rutland Cc: Kees Cook, Arnd Bergmann, Ard Biesheuvel, Linus Walleij, Nicolas Pitre, Will Deacon, Russell King, kristina.martsenko, linux-kernel, Masahiro Yamada, Chris Brandt, Michal Marek, Zhaoxiu Zeng, linux-arm-kernel, Jonathan Austin On 05/10/16 17:39, Fredrik Markström wrote: > The approach I suggested below with the vDSO data page will obviously > not work on smp, so suggestions are welcome. Well, given that it's user-writeable, is there any reason an application which cares couldn't simply run some per-cpu threads to call getcpu() once and cache the result in TPIDRURW themselves? That would appear to both raise no compatibility issues and work with existing kernels. Robin. > /Fredrik > > > On Wed, Oct 5, 2016 at 2:25 PM, Fredrik Markström > <fredrik.markstrom@gmail.com> wrote: >> On Tue, Oct 4, 2016 at 7:08 PM Mark Rutland <mark.rutland@arm.com> wrote: >>> >>> On Tue, Oct 04, 2016 at 05:35:33PM +0200, Fredrik Markstrom wrote: >>>> This makes getcpu() ~1000 times faster, this is very useful when >>>> implementing per-cpu buffers in userspace (to avoid cache line >>>> bouncing). As an example lttng ust becomes ~30% faster. >>>> >>>> The patch will break applications using TPIDRURW (which is context switched >>>> since commit 4780adeefd042482f624f5e0d577bf9cdcbb760 ("ARM: 7735/2: >>> >>> It looks like you dropped the leading 'a' from the commit ID. For >>> everyone else's benefit, the full ID is: >>> >>> a4780adeefd042482f624f5e0d577bf9cdcbb760 >> >> >> Sorry for that and thanks for fixing it. 
>> >>> >>> >>> Please note that arm64 has done similar for compat tasks since commit: >>> >>> d00a3810c16207d2 ("arm64: context-switch user tls register tpidr_el0 for >>> compat tasks") >>> >>>> Preserve the user r/w register TPIDRURW on context switch and fork")) and >>>> is therefore made configurable. >>> >>> As you note above, this is an ABI break and *will* break some existing >>> applications. That's generally a no-go. >> >> >> Ok, I wasn't sure this was considered an ABI (but I'm not entirely >> surprised ;) ). The way I was >> trying to defend the breakage was by reasoning that that if it was an >> ABI we broke it both with a4780ad >> and with 6a1c531, and since we don't break ABI:s, it can't be one. >> >> But hey, I'm humble here and ready to back off. >> >>> >>> This also leaves arm64's compat with the existing behaviour, differing >>> from arm. >>> >>> I was under the impression that other mechanisms were being considered >>> for fast userspace access to per-cpu data structures, e.g. restartable >>> sequences. What is the state of those? Why is this better? >>> >>> If getcpu() specifically is necessary, is there no other way to >>> implement it? >> >> If you are referring to the user space stuff can probably be >> implemented other ways, >> it's just convenient since the interface is there and it will speed up >> stuff like lttng without >> modifications (well, except glibc). It's also already implemented as a >> vDSO on other >> major architectures (like x86, x86_64, ppc32 and ppc64). >> >> If you are referring to the implementation of the vdso call, there are >> other possibilities, but >> I haven't found any that doesn't introduce overhead in context switching. >> >> But if TPIDRURW is definitely a no go, I can work on a patch that does >> this with a thread notifier >> and the vdso data page. Would that be a viable option ? 
>> >>> >>>> +notrace int __vdso_getcpu(unsigned int *cpup, unsigned int *nodep, >>>> + struct getcpu_cache *tcache) >>>> +{ >>>> + unsigned long node_and_cpu; >>>> + >>>> + asm("mrc p15, 0, %0, c13, c0, 2\n" : "=r"(node_and_cpu)); >>>> + >>>> + if (nodep) >>>> + *nodep = cpu_to_node(node_and_cpu >> 16); >>>> + if (cpup) >>>> + *cpup = node_and_cpu & 0xffffUL; >>> >>> Given this is directly user-accessible, this format is a de-facto ABI, >>> even if it's not documented as such. Is this definitely the format you >>> want long-term? >> >> Yes, this (the interface) is indeed the important part and therefore I >> tried not to invent anything >> on my own. >> This is the interface used by ppc32, ppc64, x86, x86_64. It's also this is >> how the getcpu(2) system call is documented. >> >> /Fredrik >> >> >>> >>> >>> Thanks, >>> Mark. > > > ^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW @ 2016-10-05 17:48 ` Robin Murphy 0 siblings, 0 replies; 27+ messages in thread From: Robin Murphy @ 2016-10-05 17:48 UTC (permalink / raw) To: linux-arm-kernel On 05/10/16 17:39, Fredrik Markström wrote: > The approach I suggested below with the vDSO data page will obviously > not work on smp, so suggestions are welcome. Well, given that it's user-writeable, is there any reason an application which cares couldn't simply run some per-cpu threads to call getcpu() once and cache the result in TPIDRURW themselves? That would appear to both raise no compatibility issues and work with existing kernels. Robin. > /Fredrik > > > On Wed, Oct 5, 2016 at 2:25 PM, Fredrik Markström > <fredrik.markstrom@gmail.com> wrote: >> On Tue, Oct 4, 2016 at 7:08 PM Mark Rutland <mark.rutland@arm.com> wrote: >>> >>> On Tue, Oct 04, 2016 at 05:35:33PM +0200, Fredrik Markstrom wrote: >>>> This makes getcpu() ~1000 times faster, this is very useful when >>>> implementing per-cpu buffers in userspace (to avoid cache line >>>> bouncing). As an example lttng ust becomes ~30% faster. >>>> >>>> The patch will break applications using TPIDRURW (which is context switched >>>> since commit 4780adeefd042482f624f5e0d577bf9cdcbb760 ("ARM: 7735/2: >>> >>> It looks like you dropped the leading 'a' from the commit ID. For >>> everyone else's benefit, the full ID is: >>> >>> a4780adeefd042482f624f5e0d577bf9cdcbb760 >> >> >> Sorry for that and thanks for fixing it. >> >>> >>> >>> Please note that arm64 has done similar for compat tasks since commit: >>> >>> d00a3810c16207d2 ("arm64: context-switch user tls register tpidr_el0 for >>> compat tasks") >>> >>>> Preserve the user r/w register TPIDRURW on context switch and fork")) and >>>> is therefore made configurable. >>> >>> As you note above, this is an ABI break and *will* break some existing >>> applications. That's generally a no-go. 
>> >> >> Ok, I wasn't sure this was considered an ABI (but I'm not entirely >> surprised ;) ). The way I was >> trying to defend the breakage was by reasoning that that if it was an >> ABI we broke it both with a4780ad >> and with 6a1c531, and since we don't break ABI:s, it can't be one. >> >> But hey, I'm humble here and ready to back off. >> >>> >>> This also leaves arm64's compat with the existing behaviour, differing >>> from arm. >>> >>> I was under the impression that other mechanisms were being considered >>> for fast userspace access to per-cpu data structures, e.g. restartable >>> sequences. What is the state of those? Why is this better? >>> >>> If getcpu() specifically is necessary, is there no other way to >>> implement it? >> >> If you are referring to the user space stuff can probably be >> implemented other ways, >> it's just convenient since the interface is there and it will speed up >> stuff like lttng without >> modifications (well, except glibc). It's also already implemented as a >> vDSO on other >> major architectures (like x86, x86_64, ppc32 and ppc64). >> >> If you are referring to the implementation of the vdso call, there are >> other possibilities, but >> I haven't found any that doesn't introduce overhead in context switching. >> >> But if TPIDRURW is definitely a no go, I can work on a patch that does >> this with a thread notifier >> and the vdso data page. Would that be a viable option ? >> >>> >>>> +notrace int __vdso_getcpu(unsigned int *cpup, unsigned int *nodep, >>>> + struct getcpu_cache *tcache) >>>> +{ >>>> + unsigned long node_and_cpu; >>>> + >>>> + asm("mrc p15, 0, %0, c13, c0, 2\n" : "=r"(node_and_cpu)); >>>> + >>>> + if (nodep) >>>> + *nodep = cpu_to_node(node_and_cpu >> 16); >>>> + if (cpup) >>>> + *cpup = node_and_cpu & 0xffffUL; >>> >>> Given this is directly user-accessible, this format is a de-facto ABI, >>> even if it's not documented as such. Is this definitely the format you >>> want long-term? 
>> >> Yes, this (the interface) is indeed the important part and therefore I >> tried not to invent anything >> on my own. >> This is the interface used by ppc32, ppc64, x86, x86_64. It's also this is >> how the getcpu(2) system call is documented. >> >> /Fredrik >> >> >>> >>> >>> Thanks, >>> Mark. > > > ^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW 2016-10-05 17:48 ` Robin Murphy @ 2016-10-05 19:53 ` Russell King - ARM Linux -1 siblings, 0 replies; 27+ messages in thread From: Russell King - ARM Linux @ 2016-10-05 19:53 UTC (permalink / raw) To: Robin Murphy Cc: Fredrik Markström, Mark Rutland, Kees Cook, Arnd Bergmann, Ard Biesheuvel, Linus Walleij, Nicolas Pitre, Will Deacon, kristina.martsenko, linux-kernel, Masahiro Yamada, Chris Brandt, Michal Marek, Zhaoxiu Zeng, linux-arm-kernel, Jonathan Austin On Wed, Oct 05, 2016 at 06:48:05PM +0100, Robin Murphy wrote: > On 05/10/16 17:39, Fredrik Markström wrote: > > The approach I suggested below with the vDSO data page will obviously > > not work on smp, so suggestions are welcome. > > Well, given that it's user-writeable, is there any reason an application > which cares couldn't simply run some per-cpu threads to call getcpu() > once and cache the result in TPIDRURW themselves? That would appear to > both raise no compatibility issues and work with existing kernels. There is - the contents of TPIDRURW is thread specific, and it moves with the thread between CPU cores. So, if a thread was running on CPU0 when it cached the getcpu() value in TPIDRURW, and then migrated to CPU1, TPIDRURW would still contain 0. I'm also not in favour of changing the TPIDRURW usage to be a storage repository for the CPU number - it's far too specific a usage and seems like a waste of hardware resources to solve one problem. As Mark says, it's an ABI breaking change too, even if it is under a config option. Take a moment to consider distro kernels: how should they set this config option - should they enable it to get faster getcpu() or should they disable it to retain existing compatibility to prevent userspace breakage. Who can advise them to make the right decision? 
Kernel developers can't, because the usage of this register is purely a userspace issue right now, and kernels devs don't know what use it's been put to. -- RMK's Patch system: http://www.armlinux.org.uk/developer/patches/ FTTC broadband for 0.8mile line: currently at 9.6Mbps down 400kbps up according to speedtest.net. ^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW @ 2016-10-05 19:53 ` Russell King - ARM Linux 0 siblings, 0 replies; 27+ messages in thread From: Russell King - ARM Linux @ 2016-10-05 19:53 UTC (permalink / raw) To: linux-arm-kernel On Wed, Oct 05, 2016 at 06:48:05PM +0100, Robin Murphy wrote: > On 05/10/16 17:39, Fredrik Markström wrote: > > The approach I suggested below with the vDSO data page will obviously > > not work on smp, so suggestions are welcome. > > Well, given that it's user-writeable, is there any reason an application > which cares couldn't simply run some per-cpu threads to call getcpu() > once and cache the result in TPIDRURW themselves? That would appear to > both raise no compatibility issues and work with existing kernels. There is - the contents of TPIDRURW is thread specific, and it moves with the thread between CPU cores. So, if a thread was running on CPU0 when it cached the getcpu() value in TPIDRURW, and then migrated to CPU1, TPIDRURW would still contain 0. I'm also not in favour of changing the TPIDRURW usage to be a storage repository for the CPU number - it's far too specific a usage and seems like a waste of hardware resources to solve one problem. As Mark says, it's an ABI breaking change too, even if it is under a config option. Take a moment to consider distro kernels: how should they set this config option - should they enable it to get faster getcpu() or should they disable it to retain existing compatibility to prevent userspace breakage. Who can advise them to make the right decision? Kernel developers can't, because the usage of this register is purely a userspace issue right now, and kernels devs don't know what use it's been put to. -- RMK's Patch system: http://www.armlinux.org.uk/developer/patches/ FTTC broadband for 0.8mile line: currently at 9.6Mbps down 400kbps up according to speedtest.net. ^ permalink raw reply [flat|nested] 27+ messages in thread
[parent not found: <CAKdL+dSt+cBCpwW5q+VCQh+7XeKrnyJgfTsEsuo2nKoUr9ytxw@mail.gmail.com>]
* Re: [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW [not found] ` <CAKdL+dSt+cBCpwW5q+VCQh+7XeKrnyJgfTsEsuo2nKoUr9ytxw@mail.gmail.com> @ 2016-10-10 15:29 ` Will Deacon 0 siblings, 0 replies; 27+ messages in thread From: Will Deacon @ 2016-10-10 15:29 UTC (permalink / raw) To: Fredrik Markström Cc: Russell King - ARM Linux, Robin Murphy, Mark Rutland, Kees Cook, Arnd Bergmann, Ard Biesheuvel, Linus Walleij, Nicolas Pitre, kristina.martsenko, linux-kernel, Masahiro Yamada, Chris Brandt, Michal Marek, Zhaoxiu Zeng, linux-arm-kernel, Jonathan Austin, mathieu.desnoyers Hi Fredrik, [adding Mathieu -- background is getcpu() in userspace for arm] On Thu, Oct 06, 2016 at 12:17:07AM +0200, Fredrik Markström wrote: > On Wed, Oct 5, 2016 at 9:53 PM, Russell King - ARM Linux <linux@armlinux.org.uk > > wrote: > > On Wed, Oct 05, 2016 at 06:48:05PM +0100, Robin Murphy wrote: > >> On 05/10/16 17:39, Fredrik Markström wrote: > >> > The approach I suggested below with the vDSO data page will obviously > >> > not work on smp, so suggestions are welcome. > >> > >> Well, given that it's user-writeable, is there any reason an application > >> which cares couldn't simply run some per-cpu threads to call getcpu() > >> once and cache the result in TPIDRURW themselves? That would appear to > >> both raise no compatibility issues and work with existing kernels. > > > > There is - the contents of TPIDRURW is thread specific, and it moves > > with the thread between CPU cores. So, if a thread was running on CPU0 > > when it cached the getcpu() value in TPIDRURW, and then migrated to CPU1, > > TPIDRURW would still contain 0. > > > > I'm also not in favour of changing the TPIDRURW usage to be a storage > > repository for the CPU number - it's far too specific a usage and seems > > like a waste of hardware resources to solve one problem. > > Ok, but right now it's nothing but a (architecture specific) piece of TLS, > which we have generic mechanisms for. 
From my point of view that is a waste of > hardware resources. > > > As Mark says, it's an ABI breaking change too, even if it is under a config > option. > > I can't argue with that. If it's an ABI it's an ABI, even if I can't imagine > why anyone would use it over normal tls... but then again, people probably do. > > So in conclusion I agree and give up. Rather than give up, you could take a look at the patches from Mathieu Desnoyers, that tackle this in a very different way. It's also the reason we've been holding off implementing an optimised getcpu in the arm64 vdso, because it might all well be replaced by the new restartable sequences approach: http://lkml.kernel.org/r/1471637274-13583-1-git-send-email-mathieu.desnoyers@efficios.com He's also got support for arch/arm/ in that series, so you could take them for a spin. The main thing missing at the moment is justification for the feature using real-world code, as requested by Linus: http://lkml.kernel.org/r/CA+55aFz+Q33m1+ju3ANaznBwYCcWo9D9WDr2=p0YLEF4gJF12g@mail.gmail.com so if your per-cpu buffer use-case is compelling in its own right (as opposed to a micro-benchmark), then you could chime in over there. Will ^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW @ 2016-10-10 15:29 ` Will Deacon 0 siblings, 0 replies; 27+ messages in thread From: Will Deacon @ 2016-10-10 15:29 UTC (permalink / raw) To: linux-arm-kernel Hi Fredrik, [adding Mathieu -- background is getcpu() in userspace for arm] On Thu, Oct 06, 2016 at 12:17:07AM +0200, Fredrik Markström wrote: > On Wed, Oct 5, 2016 at 9:53 PM, Russell King - ARM Linux <linux@armlinux.org.uk > > wrote: > > On Wed, Oct 05, 2016 at 06:48:05PM +0100, Robin Murphy wrote: > >> On 05/10/16 17:39, Fredrik Markström wrote: > >> > The approach I suggested below with the vDSO data page will obviously > >> > not work on smp, so suggestions are welcome. > >> > >> Well, given that it's user-writeable, is there any reason an application > >> which cares couldn't simply run some per-cpu threads to call getcpu() > >> once and cache the result in TPIDRURW themselves? That would appear to > >> both raise no compatibility issues and work with existing kernels. > > > > There is - the contents of TPIDRURW is thread specific, and it moves > > with the thread between CPU cores. So, if a thread was running on CPU0 > > when it cached the getcpu() value in TPIDRURW, and then migrated to CPU1, > > TPIDRURW would still contain 0. > > > > I'm also not in favour of changing the TPIDRURW usage to be a storage > > repository for the CPU number - it's far too specific a usage and seems > > like a waste of hardware resources to solve one problem. > > Ok, but right now it's nothing but a (architecture specific) piece of TLS, > which we have generic mechanisms for. From my point of view that is a waste of > hardware resources. > > > As Mark says, it's an ABI breaking change too, even if it is under a config > option. > > I can't argue with that. If it's an ABI it's an ABI, even if I can't imagine > why anyone would use it over normal tls... but then again, people probably do. > > So in conclusion I agree and give up. 
Rather than give up, you could take a look at the patches from Mathieu Desnoyers, that tackle this in a very different way. It's also the reason we've been holding off implementing an optimised getcpu in the arm64 vdso, because it might all well be replaced by the new restartable sequences approach: http://lkml.kernel.org/r/1471637274-13583-1-git-send-email-mathieu.desnoyers at efficios.com He's also got support for arch/arm/ in that series, so you could take them for a spin. The main thing missing at the moment is justification for the feature using real-world code, as requested by Linus: http://lkml.kernel.org/r/CA+55aFz+Q33m1+ju3ANaznBwYCcWo9D9WDr2=p0YLEF4gJF12g at mail.gmail.com so if your per-cpu buffer use-case is compelling in its own right (as opposed to a micro-benchmark), then you could chime in over there. Will ^ permalink raw reply [flat|nested] 27+ messages in thread
* Restartable Sequences benchmarks (was: Re: [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW) 2016-10-10 15:29 ` Will Deacon @ 2016-10-10 16:15 ` Mathieu Desnoyers -1 siblings, 0 replies; 27+ messages in thread From: Mathieu Desnoyers @ 2016-10-10 16:15 UTC (permalink / raw) To: Will Deacon, Peter Zijlstra, Boqun Feng, Paul E. McKenney, Linus Torvalds, Dave Watson, Ben Maurer, Paul Turner, Andrew Hunter Cc: Fredrik Markström, Russell King - ARM Linux, Robin Murphy, Mark Rutland, Kees Cook, Arnd Bergmann, Ard Biesheuvel, Linus Walleij, Nicolas Pitre, kristina martsenko, linux-kernel, Masahiro Yamada, Chris Brandt, Michal Marek, Zhaoxiu Zeng, linux-arm-kernel, Jonathan Austin ----- On Oct 10, 2016, at 5:29 PM, Will Deacon will.deacon@arm.com wrote: > Hi Fredrik, > > [adding Mathieu -- background is getcpu() in userspace for arm] > > On Thu, Oct 06, 2016 at 12:17:07AM +0200, Fredrik Markström wrote: >> On Wed, Oct 5, 2016 at 9:53 PM, Russell King - ARM Linux <linux@armlinux.org.uk >> > wrote: >> > On Wed, Oct 05, 2016 at 06:48:05PM +0100, Robin Murphy wrote: >> >> On 05/10/16 17:39, Fredrik Markström wrote: >> >> > The approach I suggested below with the vDSO data page will obviously >> >> > not work on smp, so suggestions are welcome. >> >> >> >> Well, given that it's user-writeable, is there any reason an application >> >> which cares couldn't simply run some per-cpu threads to call getcpu() >> >> once and cache the result in TPIDRURW themselves? That would appear to >> >> both raise no compatibility issues and work with existing kernels. >> > >> > There is - the contents of TPIDRURW is thread specific, and it moves >> > with the thread between CPU cores. So, if a thread was running on CPU0 >> > when it cached the getcpu() value in TPIDRURW, and then migrated to CPU1, >> > TPIDRURW would still contain 0. 
>> > >> > I'm also not in favour of changing the TPIDRURW usage to be a storage >> > repository for the CPU number - it's far too specific a usage and seems >> > like a waste of hardware resources to solve one problem. >> >> Ok, but right now it's nothing but a (architecture specific) piece of TLS, >> which we have generic mechanisms for. From my point of view that is a waste of >> hardware resources. >> >> > As Mark says, it's an ABI breaking change too, even if it is under a config >> option. >> >> I can't argue with that. If it's an ABI it's an ABI, even if I can't imagine >> why anyone would use it over normal tls... but then again, people probably do. >> >> So in conclusion I agree and give up. > > Rather than give up, you could take a look at the patches from Mathieu > Desnoyers, that tackle this in a very different way. It's also the reason > we've been holding off implementing an optimised getcpu in the arm64 vdso, > because it might all well be replaced by the new restartable sequences > approach: > > http://lkml.kernel.org/r/1471637274-13583-1-git-send-email-mathieu.desnoyers@efficios.com > > He's also got support for arch/arm/ in that series, so you could take > them for a spin. The main thing missing at the moment is justification > for the feature using real-world code, as requested by Linus: > > http://lkml.kernel.org/r/CA+55aFz+Q33m1+ju3ANaznBwYCcWo9D9WDr2=p0YLEF4gJF12g@mail.gmail.com > > so if your per-cpu buffer use-case is compelling in its own right (as > opposed to a micro-benchmark), then you could chime in over there. > > Will FYI, I've adapted lttng-ust ring buffer (as a POC) to rseq in a dev branch. I see interesting speedups. See top 3-4 commits of https://github.com/compudj/lttng-ust-dev/tree/rseq-integration (start with "Use rseq for..."). On x86-64, we have a 7ns speedup over sched_getcpu on x86-64, and 30ns speedup by using rseq atomics on x86-64, which brings the cost per event record down to about 100ns/event. 
This replaces 3 atomic operations on the fast path. (37% speedup) On arm32, the cpu_id acceleration gives a 327 ns/event speed increase, which brings speed to 2000ns/event. Note that reading time on that system does not use the vDSO (old glibc), so it implies a system call. This accounts for 857ns/events. I don't observe speed increase nor slowdown by using rseq instead of ll/sc atomic operations on that specific board (Cubietruck, only has 2 cores). I suspect that boards with more core will benefit more of replacing ll/sc by rseq atomics. If we don't account the overhead of reading time through system call, we get a 22% speedup. I have extra benchmarks in this branch: https://github.com/compudj/rseq-test Updated ref for current rseq-enabled kernel based on 4.8: https://github.com/compudj/linux-percpu-dev/tree/rseq-fallback (ARM64 port would be welcome!) :) As Will pointed out, what I'm currently looking for is real-life benchmarks that shows benefits of rseq. I fear that the microbenchmarks I have for the lttng-ust tracer may be dismissed as being too specific. Most heavy users of LTTng-UST are closed source applications, so it's not easy for me to provide numbers in real-life scenarios. The major use-case besides per-cpu buffering/tracing AFAIU is memory allocators. It will mainly benefit in use-cases where there are more threads than cores in a multithreaded application. This mainly makes sense if threads are either dedicated to specific tasks, and therefore are often idle, or in use-cases where worker threads are expected to block (else, if threads are not expected to block, the application should simply have one thread per core). Dave Watson had interesting RSS shrinkage on this stress-test program: http://locklessinc.com/downloads/t-test1.c modified to have 500 threads. It uses jemalloc modified to use rseq. I reproduced it on my laptop with 100 threads, 50000 loops: 4-core, 100 threads, 50000 loops. 
malloc: PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND 10136 compudj 20 0 2857840 24756 1348 R 379.4 0.4 3:49.50 t-test1 real 3m20.830s user 3m22.164s sys 9m40.936s upstream jemalloc: PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND 21234 compudj 20 0 2227124 49300 2280 S 306.2 0.7 2:26.97 t-test1 real 1m3.297s user 3m19.616s sys 0m8.500s rseq jemalloc 4.2.1: PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND 25652 compudj 20 0 877956 35624 3260 S 301.2 0.5 1:26.07 t-test1 real 0m27.639s user 1m18.172s sys 0m1.752s The next step to translate this into a "real-life" number would be to run rseq-jemalloc on a facebook node, but Dave has been in vacation for the past few weeks. Perhaps someone else at Facebook or Google could look into this ? Cheers, Mathieu -- Mathieu Desnoyers EfficiOS Inc. http://www.efficios.com ^ permalink raw reply [flat|nested] 27+ messages in thread
* Restartable Sequences benchmarks (was: Re: [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW) @ 2016-10-10 16:15 ` Mathieu Desnoyers 0 siblings, 0 replies; 27+ messages in thread From: Mathieu Desnoyers @ 2016-10-10 16:15 UTC (permalink / raw) To: linux-arm-kernel ----- On Oct 10, 2016, at 5:29 PM, Will Deacon will.deacon at arm.com wrote: > Hi Fredrik, > > [adding Mathieu -- background is getcpu() in userspace for arm] > > On Thu, Oct 06, 2016 at 12:17:07AM +0200, Fredrik Markström wrote: >> On Wed, Oct 5, 2016 at 9:53 PM, Russell King - ARM Linux <linux@armlinux.org.uk >> > wrote: >> > On Wed, Oct 05, 2016 at 06:48:05PM +0100, Robin Murphy wrote: >> >> On 05/10/16 17:39, Fredrik Markström wrote: >> >> > The approach I suggested below with the vDSO data page will obviously >> >> > not work on smp, so suggestions are welcome. >> >> >> >> Well, given that it's user-writeable, is there any reason an application >> >> which cares couldn't simply run some per-cpu threads to call getcpu() >> >> once and cache the result in TPIDRURW themselves? That would appear to >> >> both raise no compatibility issues and work with existing kernels. >> > >> > There is - the contents of TPIDRURW is thread specific, and it moves >> > with the thread between CPU cores. So, if a thread was running on CPU0 >> > when it cached the getcpu() value in TPIDRURW, and then migrated to CPU1, >> > TPIDRURW would still contain 0. >> > >> > I'm also not in favour of changing the TPIDRURW usage to be a storage >> > repository for the CPU number - it's far too specific a usage and seems >> > like a waste of hardware resources to solve one problem. >> >> Ok, but right now it's nothing but a (architecture specific) piece of TLS, >> which we have generic mechanisms for. From my point of view that is a waste of >> hardware resources. >> >> > As Mark says, it's an ABI breaking change too, even if it is under a config >> option. >> >> I can't argue with that. 
If it's an ABI it's an ABI, even if I can't imagine >> why anyone would use it over normal tls... but then again, people probably do. >> >> So in conclusion I agree and give up. > > Rather than give up, you could take a look at the patches from Mathieu > Desnoyers, that tackle this in a very different way. It's also the reason > we've been holding off implementing an optimised getcpu in the arm64 vdso, > because it might all well be replaced by the new restartable sequences > approach: > > http://lkml.kernel.org/r/1471637274-13583-1-git-send-email-mathieu.desnoyers at efficios.com > > He's also got support for arch/arm/ in that series, so you could take > them for a spin. The main thing missing at the moment is justification > for the feature using real-world code, as requested by Linus: > > http://lkml.kernel.org/r/CA+55aFz+Q33m1+ju3ANaznBwYCcWo9D9WDr2=p0YLEF4gJF12g at mail.gmail.com > > so if your per-cpu buffer use-case is compelling in its own right (as > opposed to a micro-benchmark), then you could chime in over there. > > Will FYI, I've adapted lttng-ust ring buffer (as a POC) to rseq in a dev branch. I see interesting speedups. See top 3-4 commits of https://github.com/compudj/lttng-ust-dev/tree/rseq-integration (start with "Use rseq for..."). On x86-64, we have a 7ns speedup over sched_getcpu on x86-64, and 30ns speedup by using rseq atomics on x86-64, which brings the cost per event record down to about 100ns/event. This replaces 3 atomic operations on the fast path. (37% speedup) On arm32, the cpu_id acceleration gives a 327 ns/event speed increase, which brings speed to 2000ns/event. Note that reading time on that system does not use the vDSO (old glibc), so it implies a system call. This accounts for 857ns/events. I don't observe speed increase nor slowdown by using rseq instead of ll/sc atomic operations on that specific board (Cubietruck, only has 2 cores). I suspect that boards with more core will benefit more of replacing ll/sc by rseq atomics. 
If we don't account the overhead of reading time through system call, we get a 22% speedup. I have extra benchmarks in this branch: https://github.com/compudj/rseq-test Updated ref for current rseq-enabled kernel based on 4.8: https://github.com/compudj/linux-percpu-dev/tree/rseq-fallback (ARM64 port would be welcome!) :) As Will pointed out, what I'm currently looking for is real-life benchmarks that shows benefits of rseq. I fear that the microbenchmarks I have for the lttng-ust tracer may be dismissed as being too specific. Most heavy users of LTTng-UST are closed source applications, so it's not easy for me to provide numbers in real-life scenarios. The major use-case besides per-cpu buffering/tracing AFAIU is memory allocators. It will mainly benefit in use-cases where there are more threads than cores in a multithreaded application. This mainly makes sense if threads are either dedicated to specific tasks, and therefore are often idle, or in use-cases where worker threads are expected to block (else, if threads are not expected to block, the application should simply have one thread per core). Dave Watson had interesting RSS shrinkage on this stress-test program: http://locklessinc.com/downloads/t-test1.c modified to have 500 threads. It uses jemalloc modified to use rseq. I reproduced it on my laptop with 100 threads, 50000 loops: 4-core, 100 threads, 50000 loops. 
malloc: PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND 10136 compudj 20 0 2857840 24756 1348 R 379.4 0.4 3:49.50 t-test1 real 3m20.830s user 3m22.164s sys 9m40.936s upstream jemalloc: PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND 21234 compudj 20 0 2227124 49300 2280 S 306.2 0.7 2:26.97 t-test1 real 1m3.297s user 3m19.616s sys 0m8.500s rseq jemalloc 4.2.1: PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND 25652 compudj 20 0 877956 35624 3260 S 301.2 0.5 1:26.07 t-test1 real 0m27.639s user 1m18.172s sys 0m1.752s The next step to translate this into a "real-life" number would be to run rseq-jemalloc on a facebook node, but Dave has been in vacation for the past few weeks. Perhaps someone else at Facebook or Google could look into this ? Cheers, Mathieu -- Mathieu Desnoyers EfficiOS Inc. http://www.efficios.com ^ permalink raw reply [flat|nested] 27+ messages in thread
[parent not found: <CAKdL+dQH=9C2aGf7ys5-vXM7pkdPYUQ8xYWLipwVbABOz09f1g@mail.gmail.com>]
* Re: [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW [not found] ` <CAKdL+dQH=9C2aGf7ys5-vXM7pkdPYUQ8xYWLipwVbABOz09f1g@mail.gmail.com> @ 2016-10-05 20:44 ` Mark Rutland 0 siblings, 0 replies; 27+ messages in thread From: Mark Rutland @ 2016-10-05 20:44 UTC (permalink / raw) To: Fredrik Markström Cc: Robin Murphy, Kees Cook, Arnd Bergmann, Ard Biesheuvel, Linus Walleij, Nicolas Pitre, Will Deacon, Russell King, kristina.martsenko, linux-kernel, Masahiro Yamada, Chris Brandt, Michal Marek, Zhaoxiu Zeng, linux-arm-kernel, Jonathan Austin On Wed, Oct 05, 2016 at 08:00:38PM +0000, Fredrik Markström wrote: > On Wed, Oct 5, 2016 at 7:48 PM Robin Murphy <robin.murphy@arm.com> wrote: > As far as I understand TPIDRURW isn't anything else then an architecture > specific piece of tls since the last patch, possibly slightly faster then a > "__thread u32 x;" > > The irony is that the two different ways it was handled earlier (not context > switched or always set to zero on swap in) would have made it useful for this > purpose. The "not context switched" case was also arbitrarily corrupted, and could not have been relied upon. The zeroing case is similar to the restartable sequences design. So that's probably worth looking into. Thanks, Mark. ^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW @ 2016-10-05 20:44 ` Mark Rutland 0 siblings, 0 replies; 27+ messages in thread From: Mark Rutland @ 2016-10-05 20:44 UTC (permalink / raw) To: linux-arm-kernel On Wed, Oct 05, 2016 at 08:00:38PM +0000, Fredrik Markström wrote: > On Wed, Oct 5, 2016 at 7:48 PM Robin Murphy <robin.murphy@arm.com> wrote: > As far as I understand TPIDRURW isn't anything else then an architecture > specific piece of tls since the last patch, possibly slightly faster then a > "__thread u32 x;" > > The irony is that the two different ways it was handled earlier (not context > switched or always set to zero on swap in) would have made it useful for this > purpose. The "not context switched" case was also arbitrarily corrupted, and could not have been relied upon. The zeroing case is similar to the restartable sequences design. So that's probably worth looking into. Thanks, Mark. ^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW 2016-10-05 20:44 ` Mark Rutland @ 2016-10-05 21:01 ` Russell King - ARM Linux -1 siblings, 0 replies; 27+ messages in thread From: Russell King - ARM Linux @ 2016-10-05 21:01 UTC (permalink / raw) To: Mark Rutland Cc: Fredrik Markström, Robin Murphy, Kees Cook, Arnd Bergmann, Ard Biesheuvel, Linus Walleij, Nicolas Pitre, Will Deacon, kristina.martsenko, linux-kernel, Masahiro Yamada, Chris Brandt, Michal Marek, Zhaoxiu Zeng, linux-arm-kernel, Jonathan Austin On Wed, Oct 05, 2016 at 09:44:53PM +0100, Mark Rutland wrote: > The zeroing case is similar to the restartable sequences design. So that's > probably worth looking into. You're sending mixed messages: in your previous message, you said: Arguably, someone could have (ab)used TPIDRURW between commits 6a1c531 and a4780ad to detect context switches, but in practice they don't appear to have, and we know of an established user relying on the current behaviour. For better or worse, the current behaviour is ABI. Now you're suggesting that we could go back to the case where the register is zeroed. Well, the fact is that we _can_ change the TPIDRURW behaviour - we just need to be careful about how we change it. Eg, we _could_ introduce a per-process flag which indicates that we want some other behaviour from TPIDRURW such as zeroing it on context switches. The default would be to preserve the existing behaviour as doing anything else breaks existing programs. The problem there is finding an acceptable way to control such a flag from userspace (eg, prctl, syscall, etc). -- RMK's Patch system: http://www.armlinux.org.uk/developer/patches/ FTTC broadband for 0.8mile line: currently at 9.6Mbps down 400kbps up according to speedtest.net. ^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW @ 2016-10-05 21:01 ` Russell King - ARM Linux 0 siblings, 0 replies; 27+ messages in thread From: Russell King - ARM Linux @ 2016-10-05 21:01 UTC (permalink / raw) To: linux-arm-kernel On Wed, Oct 05, 2016 at 09:44:53PM +0100, Mark Rutland wrote: > The zeroing case is similar to the restartable sequences design. So that's > probably worth looking into. You're sending mixed messages: in your previous message, you said: Arguably, someone could have (ab)used TPIDRURW between commits 6a1c531 and a4780ad to detect context switches, but in practice they don't appear to have, and we know of an established user relying on the current behaviour. For better or worse, the current behaviour is ABI. Now you're suggesting that we could go back to the case where the register is zeroed. Well, the fact is that we _can_ change the TPIDRURW behaviour - we just need to be careful about how we change it. Eg, we _could_ introduce a per-process flag which indicates that we want some other behaviour from TPIDRURW such as zeroing it on context switches. The default would be to preserve the existing behaviour as doing anything else breaks existing programs. The problem there is finding an acceptable way to control such a flag from userspace (eg, prctl, syscall, etc). -- RMK's Patch system: http://www.armlinux.org.uk/developer/patches/ FTTC broadband for 0.8mile line: currently at 9.6Mbps down 400kbps up according to speedtest.net. ^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW 2016-10-05 21:01 ` Russell King - ARM Linux @ 2016-10-05 21:47 ` Mark Rutland -1 siblings, 0 replies; 27+ messages in thread From: Mark Rutland @ 2016-10-05 21:47 UTC (permalink / raw) To: Russell King - ARM Linux Cc: Fredrik Markström, Robin Murphy, Kees Cook, Arnd Bergmann, Ard Biesheuvel, Linus Walleij, Nicolas Pitre, Will Deacon, kristina.martsenko, linux-kernel, Masahiro Yamada, Chris Brandt, Michal Marek, Zhaoxiu Zeng, linux-arm-kernel, Jonathan Austin On Wed, Oct 05, 2016 at 10:01:38PM +0100, Russell King - ARM Linux wrote: > On Wed, Oct 05, 2016 at 09:44:53PM +0100, Mark Rutland wrote: > > The zeroing case is similar to the restartable sequences design. So that's > > probably worth looking into. > > You're sending mixed messages: in your previous message, you said: > > Arguably, someone could have (ab)used TPIDRURW between commits 6a1c531 > and a4780ad to detect context switches, but in practice they don't > appear to have, and we know of an established user relying on the > current behaviour. > > For better or worse, the current behaviour is ABI. > > Now you're suggesting that we could go back to the case where the > register is zeroed. Sorry; clumsy wording on my behalf. I meant that functionality-wise, restartable sequences had similar behaviour to the zeroing case (without touching TPIDRURW at all) and were probably worth looking at. I did not intend to suggest that we should go back to the case where TPIDRURW was zeroed. > Well, the fact is that we _can_ change the TPIDRURW behaviour - we just > need to be careful about how we change it. Eg, we _could_ introduce a > per-process flag which indicates that we want some other behaviour from > TPIDRURW such as zeroing it on context switches. The default would be > to preserve the existing behaviour as doing anything else breaks > existing programs. 
The problem there is finding an acceptable way to > control such a flag from userspace (eg, prctl, syscall, etc). Sure. Something like that could work. Thanks, Mark. ^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW @ 2016-10-05 21:47 ` Mark Rutland 0 siblings, 0 replies; 27+ messages in thread From: Mark Rutland @ 2016-10-05 21:47 UTC (permalink / raw) To: linux-arm-kernel On Wed, Oct 05, 2016 at 10:01:38PM +0100, Russell King - ARM Linux wrote: > On Wed, Oct 05, 2016 at 09:44:53PM +0100, Mark Rutland wrote: > > The zeroing case is similar to the restartable sequences design. So that's > > probably worth looking into. > > You're sending mixed messages: in your previous message, you said: > > Arguably, someone could have (ab)used TPIDRURW between commits 6a1c531 > and a4780ad to detect context switches, but in practice they don't > appear to have, and we know of an established user relying on the > current behaviour. > > For better or worse, the current behaviour is ABI. > > Now you're suggesting that we could go back to the case where the > register is zeroed. Sorry; clumsy wording on my behalf. I meant that functionality-wise, restartable sequences had similar behaviour to the zeroing case (without touching TPIDRURW at all) and were probably worth looking at. I did not intend to suggest that we should go back to the case where TPIDRURW was zeroed. > Well, the fact is that we _can_ change the TPIDRURW behaviour - we just > need to be careful about how we change it. Eg, we _could_ introduce a > per-process flag which indicates that we want some other behaviour from > TPIDRURW such as zeroing it on context switches. The default would be > to preserve the existing behaviour as doing anything else breaks > existing programs. The problem there is finding an acceptable way to > control such a flag from userspace (eg, prctl, syscall, etc). Sure. Something like that could work. Thanks, Mark. ^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW 2016-10-05 20:44 ` Mark Rutland @ 2016-10-05 21:37 ` Fredrik Markström -1 siblings, 0 replies; 27+ messages in thread From: Fredrik Markström @ 2016-10-05 21:37 UTC (permalink / raw) To: Mark Rutland Cc: Robin Murphy, Kees Cook, Arnd Bergmann, Ard Biesheuvel, Linus Walleij, Nicolas Pitre, Will Deacon, Russell King, kristina.martsenko, linux-kernel, Masahiro Yamada, Chris Brandt, Michal Marek, Zhaoxiu Zeng, linux-arm-kernel, Jonathan Austin On Wed, Oct 5, 2016 at 10:44 PM, Mark Rutland <mark.rutland@arm.com> wrote: > On Wed, Oct 05, 2016 at 08:00:38PM +0000, Fredrik Markström wrote: >> On Wed, Oct 5, 2016 at 7:48 PM Robin Murphy <robin.murphy@arm.com> wrote: >> As far as I understand TPIDRURW isn't anything else then an architecture >> specific piece of tls since the last patch, possibly slightly faster then a >> "__thread u32 x;" >> >> The irony is that the two different ways it was handled earlier (not context >> switched or always set to zero on swap in) would have made it useful for this >> purpose. > > The "not context switched" case was also arbitrarily corrupted, and could not > have been relied upon. Ok, I missed that, sorry ! > > The zeroing case is similar to the restartable sequences design. So that's > probably worth looking into. Ok, I'm starting to believe my best bet is to hope that those make it into the kernel eventually, until then I'll probably just go with a local solution. /Fredrik > > Thanks, > Mark. ^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW @ 2016-10-05 21:37 ` Fredrik Markström 0 siblings, 0 replies; 27+ messages in thread From: Fredrik Markström @ 2016-10-05 21:37 UTC (permalink / raw) To: linux-arm-kernel On Wed, Oct 5, 2016 at 10:44 PM, Mark Rutland <mark.rutland@arm.com> wrote: > On Wed, Oct 05, 2016 at 08:00:38PM +0000, Fredrik Markström wrote: >> On Wed, Oct 5, 2016 at 7:48 PM Robin Murphy <robin.murphy@arm.com> wrote: >> As far as I understand TPIDRURW isn't anything else then an architecture >> specific piece of tls since the last patch, possibly slightly faster then a >> "__thread u32 x;" >> >> The irony is that the two different ways it was handled earlier (not context >> switched or always set to zero on swap in) would have made it useful for this >> purpose. > > The "not context switched" case was also arbitrarily corrupted, and could not > have been relied upon. Ok, I missed that, sorry ! > > The zeroing case is similar to the restartable sequences design. So that's > probably worth looking into. Ok, I'm starting to believe my best bet is to hope that those make it into the kernel eventually, until then I'll probably just go with a local solution. /Fredrik > > Thanks, > Mark. ^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW 2016-10-05 12:25 ` Fredrik Markström @ 2016-10-05 20:12 ` Mark Rutland -1 siblings, 0 replies; 27+ messages in thread From: Mark Rutland @ 2016-10-05 20:12 UTC (permalink / raw) To: Fredrik Markström Cc: linux-arm-kernel, Russell King, Will Deacon, Chris Brandt, Nicolas Pitre, Ard Biesheuvel, Arnd Bergmann, Linus Walleij, Masahiro Yamada, Kees Cook, Jonathan Austin, Zhaoxiu Zeng, Michal Marek, linux-kernel, kristina.martsenko On Wed, Oct 05, 2016 at 02:25:22PM +0200, Fredrik Markström wrote: > On Tue, Oct 4, 2016 at 7:08 PM Mark Rutland <mark.rutland@arm.com> wrote: > > On Tue, Oct 04, 2016 at 05:35:33PM +0200, Fredrik Markstrom wrote: > The way I was trying to defend the breakage was by reasoning that that if it > was an ABI we broke it both with a4780ad and with 6a1c531, and since we don't > break ABI:s, it can't be one. Prior to commit 6a1c531, other programs and/or cpuidle could have corrupted TPIDRURW to arbitrary values at any point in time, so it couldn't have been relied upon. Arguably, someone could have (ab)used TPIDRURW between commits 6a1c531 and a4780ad to detect context switches, but in practice they don't appear to have, and we know of an established user relying on the current behaviour. For better or worse, the current behaviour is ABI. > > I was under the impression that other mechanisms were being considered > > for fast userspace access to per-cpu data structures, e.g. restartable > > sequences. What is the state of those? Why is this better? > > > > If getcpu() specifically is necessary, is there no other way to > > implement it? > > If you are referring to the user space stuff can probably be implemented > other ways, it's just convenient since the interface is there and it will > speed up stuff like lttng without modifications (well, except glibc). It's > also already implemented as a vDSO on other major architectures (like x86, > x86_64, ppc32 and ppc64). 
> > If you are referring to the implementation of the vdso call, there are other > possibilities, but I haven't found any that doesn't introduce overhead in > context switching. > > But if TPIDRURW is definitely a no go, I can work on a patch that does this > with a thread notifier and the vdso data page. Would that be a viable option? As pointed out, that won't work for SMP, but perhaps we can come up with something that does. > > > +notrace int __vdso_getcpu(unsigned int *cpup, unsigned int *nodep, > > > + struct getcpu_cache *tcache) > > > +{ > > > + unsigned long node_and_cpu; > > > + > > > + asm("mrc p15, 0, %0, c13, c0, 2\n" : "=r"(node_and_cpu)); > > > + > > > + if (nodep) > > > + *nodep = cpu_to_node(node_and_cpu >> 16); > > > + if (cpup) > > > + *cpup = node_and_cpu & 0xffffUL; > > > > Given this is directly user-accessible, this format is a de-facto ABI, > > even if it's not documented as such. Is this definitely the format you > > want long-term? > > Yes, this (the interface) is indeed the important part and therefore I tried > not to invent anything on my own. This is the interface used by ppc32, > ppc64, x86, x86_64. It's also this is how the getcpu(2) system call is > documented. I was referring to the value in TPIDRURW specifically. If an application started reading that directly (rather than going through the vDSO), we wouldn't be able to change the register value in future. That's all moot, given we can't repurpose TPIDRURW. Thanks, Mark. ^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW @ 2016-10-05 20:12 ` Mark Rutland 0 siblings, 0 replies; 27+ messages in thread From: Mark Rutland @ 2016-10-05 20:12 UTC (permalink / raw) To: linux-arm-kernel On Wed, Oct 05, 2016 at 02:25:22PM +0200, Fredrik Markström wrote: > On Tue, Oct 4, 2016 at 7:08 PM Mark Rutland <mark.rutland@arm.com> wrote: > > On Tue, Oct 04, 2016 at 05:35:33PM +0200, Fredrik Markstrom wrote: > The way I was trying to defend the breakage was by reasoning that that if it > was an ABI we broke it both with a4780ad and with 6a1c531, and since we don't > break ABI:s, it can't be one. Prior to commit 6a1c531, other programs and/or cpuidle could have corrupted TPIDRURW to arbitrary values at any point in time, so it couldn't have been relied upon. Arguably, someone could have (ab)used TPIDRURW between commits 6a1c531 and a4780ad to detect context switches, but in practice they don't appear to have, and we know of an established user relying on the current behaviour. For better or worse, the current behaviour is ABI. > > I was under the impression that other mechanisms were being considered > > for fast userspace access to per-cpu data structures, e.g. restartable > > sequences. What is the state of those? Why is this better? > > > > If getcpu() specifically is necessary, is there no other way to > > implement it? > > If you are referring to the user space stuff can probably be implemented > other ways, it's just convenient since the interface is there and it will > speed up stuff like lttng without modifications (well, except glibc). It's > also already implemented as a vDSO on other major architectures (like x86, > x86_64, ppc32 and ppc64). > > If you are referring to the implementation of the vdso call, there are other > possibilities, but I haven't found any that doesn't introduce overhead in > context switching. 
> > But if TPIDRURW is definitely a no go, I can work on a patch that does this > with a thread notifier and the vdso data page. Would that be a viable option? As pointed out, that won't work for SMP, but perhaps we can come up with something that does. > > > +notrace int __vdso_getcpu(unsigned int *cpup, unsigned int *nodep, > > > + struct getcpu_cache *tcache) > > > +{ > > > + unsigned long node_and_cpu; > > > + > > > + asm("mrc p15, 0, %0, c13, c0, 2\n" : "=r"(node_and_cpu)); > > > + > > > + if (nodep) > > > + *nodep = cpu_to_node(node_and_cpu >> 16); > > > + if (cpup) > > > + *cpup = node_and_cpu & 0xffffUL; > > > > Given this is directly user-accessible, this format is a de-facto ABI, > > even if it's not documented as such. Is this definitely the format you > > want long-term? > > Yes, this (the interface) is indeed the important part and therefore I tried > not to invent anything on my own. This is the interface used by ppc32, > ppc64, x86, x86_64. It's also this is how the getcpu(2) system call is > documented. I was referring to the value in TPIDRURW specifically. If an application started reading that directly (rather than going through the vDSO), we wouldn't be able to change the register value in future. That's all moot, given we can't repurpose TPIDRURW. Thanks, Mark. ^ permalink raw reply [flat|nested] 27+ messages in thread
end of thread, other threads:[~2016-10-10 16:15 UTC | newest] Thread overview: 27+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2016-10-04 13:49 [PATCH] arm: Added support for getcpu() vDSO using TPIDRURW Fredrik Markstrom 2016-10-04 15:35 ` [PATCH v2] " Fredrik Markstrom 2016-10-04 15:35 ` Fredrik Markstrom 2016-10-04 17:07 ` Mark Rutland 2016-10-04 17:07 ` Mark Rutland 2016-10-05 12:25 ` Fredrik Markström 2016-10-05 12:25 ` Fredrik Markström 2016-10-05 16:39 ` Fredrik Markström 2016-10-05 16:39 ` Fredrik Markström 2016-10-05 17:48 ` Robin Murphy 2016-10-05 17:48 ` Robin Murphy 2016-10-05 19:53 ` Russell King - ARM Linux 2016-10-05 19:53 ` Russell King - ARM Linux [not found] ` <CAKdL+dSt+cBCpwW5q+VCQh+7XeKrnyJgfTsEsuo2nKoUr9ytxw@mail.gmail.com> 2016-10-10 15:29 ` Will Deacon 2016-10-10 15:29 ` Will Deacon 2016-10-10 16:15 ` Restartable Sequences benchmarks (was: Re: [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW) Mathieu Desnoyers 2016-10-10 16:15 ` Mathieu Desnoyers [not found] ` <CAKdL+dQH=9C2aGf7ys5-vXM7pkdPYUQ8xYWLipwVbABOz09f1g@mail.gmail.com> 2016-10-05 20:44 ` [PATCH v2] arm: Added support for getcpu() vDSO using TPIDRURW Mark Rutland 2016-10-05 20:44 ` Mark Rutland 2016-10-05 21:01 ` Russell King - ARM Linux 2016-10-05 21:01 ` Russell King - ARM Linux 2016-10-05 21:47 ` Mark Rutland 2016-10-05 21:47 ` Mark Rutland 2016-10-05 21:37 ` Fredrik Markström 2016-10-05 21:37 ` Fredrik Markström 2016-10-05 20:12 ` Mark Rutland 2016-10-05 20:12 ` Mark Rutland
This is an external index of several public inboxes, see mirroring instructions on how to clone and mirror all data and code used by this external index.