* [RFC PATCH 1/2] thread_local_abi system call: caching current CPU number (x86)
@ 2015-12-10 15:39 ` Mathieu Desnoyers
From: Mathieu Desnoyers @ 2015-12-10 15:39 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: linux-kernel, Mathieu Desnoyers, Paul Turner, Andrew Hunter,
	Peter Zijlstra, Andy Lutomirski, Andi Kleen, Dave Watson,
	Chris Lameter, Ingo Molnar, Ben Maurer, Steven Rostedt,
	Paul E. McKenney, Josh Triplett, Linus Torvalds, Andrew Morton,
	linux-api

Expose a new system call allowing threads to register a userspace memory
area in which to store the current CPU number. Scheduler migration sets
the TIF_NOTIFY_RESUME flag on the current thread. Upon return to
user-space, a notify-resume handler updates the current CPU value within
that user-space memory area.

This getcpu cache is an alternative to the sched_getcpu() vDSO and has
a few benefits:
- It is faster to do a memory read than to call the vDSO,
- The cached value can be read from within inline assembly, which
  makes it a useful building block for restartable sequences.

This approach is inspired by Paul Turner and Andrew Hunter's work
on percpu atomics, which lets the kernel handle restart of critical
sections:
Ref.:
* https://lkml.org/lkml/2015/6/24/665
* https://lwn.net/Articles/650333/
* http://www.linuxplumbersconf.org/2013/ocw/system/presentations/1695/original/LPC%20-%20PerCpu%20Atomics.pdf

Benchmark of sched_getcpu() vs. the TLS-cache approach for reading the
current CPU number:

- With Linux vdso:            12.7 ns
- With TLS-cached cpu number:  0.3 ns

The system call can be extended in the future by registering a larger
structure.
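
A minimal userspace sketch (not part of this patch) of registering the
per-thread area and reading the cached CPU number; it assumes the
x86-64 syscall number 326 from this series, and the wrapper name is
illustrative:

#define _GNU_SOURCE
#include <linux/thread_local_abi.h>	/* header added by this patch */
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

#define NR_thread_local_abi	326	/* x86-64 number from this series */

static __thread struct thread_local_abi tlabi;

static long tlabi_register(void)
{
	/* flags must be 0; returns the length accepted by the kernel. */
	return syscall(NR_thread_local_abi, &tlabi, sizeof(tlabi), 0);
}

int main(void)
{
	if (tlabi_register() < (long)sizeof(tlabi))
		return 1;	/* registration failed or kernel too old */
	/* From here on, a plain memory read yields the current CPU. */
	printf("running on CPU %d\n", tlabi.cpu);
	return 0;
}

After registration, the notify-resume handler keeps tlabi.cpu up to
date across migrations, so the read above never enters the kernel.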

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Paul Turner <pjt@google.com>
CC: Andrew Hunter <ahh@google.com>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Andy Lutomirski <luto@amacapital.net>
CC: Andi Kleen <andi@firstfloor.org>
CC: Dave Watson <davejwatson@fb.com>
CC: Chris Lameter <cl@linux.com>
CC: Ingo Molnar <mingo@redhat.com>
CC: Ben Maurer <bmaurer@fb.com>
CC: Steven Rostedt <rostedt@goodmis.org>
CC: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
CC: Josh Triplett <josh@joshtriplett.org>
CC: Linus Torvalds <torvalds@linux-foundation.org>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: linux-api@vger.kernel.org
---
 arch/x86/entry/common.c                |  2 +
 arch/x86/entry/syscalls/syscall_64.tbl |  1 +
 fs/exec.c                              |  1 +
 include/linux/sched.h                  | 32 ++++++++++++
 include/uapi/asm-generic/unistd.h      |  4 +-
 include/uapi/linux/Kbuild              |  1 +
 include/uapi/linux/thread_local_abi.h  | 37 ++++++++++++++
 init/Kconfig                           |  7 +++
 kernel/Makefile                        |  1 +
 kernel/fork.c                          |  2 +
 kernel/sched/core.c                    |  4 ++
 kernel/sched/sched.h                   |  2 +
 kernel/sys_ni.c                        |  3 ++
 kernel/thread_local_abi.c              | 92 ++++++++++++++++++++++++++++++++++
 14 files changed, 188 insertions(+), 1 deletion(-)
 create mode 100644 include/uapi/linux/thread_local_abi.h
 create mode 100644 kernel/thread_local_abi.c

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index a89fdbc..fdfdb14 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -249,6 +249,8 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
 		if (cached_flags & _TIF_NOTIFY_RESUME) {
 			clear_thread_flag(TIF_NOTIFY_RESUME);
 			tracehook_notify_resume(regs);
+			if (getcpu_cache_active(current))
+				getcpu_cache_handle_notify_resume(current);
 		}
 
 		if (cached_flags & _TIF_USER_RETURN_NOTIFY)
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 314a90b..748aee3 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -332,6 +332,7 @@
 323	common	userfaultfd		sys_userfaultfd
 324	common	membarrier		sys_membarrier
 325	common	mlock2			sys_mlock2
+326	common	thread_local_abi	sys_thread_local_abi
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/exec.c b/fs/exec.c
index b06623a..88490cc 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1594,6 +1594,7 @@ static int do_execveat_common(int fd, struct filename *filename,
 	/* execve succeeded */
 	current->fs->in_exec = 0;
 	current->in_execve = 0;
+	thread_local_abi_execve(current);
 	acct_update_integrals(current);
 	task_numa_free(current);
 	free_bprm(bprm);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index edad7a4..b39d9a3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2,6 +2,7 @@
 #define _LINUX_SCHED_H
 
 #include <uapi/linux/sched.h>
+#include <uapi/linux/thread_local_abi.h>
 
 #include <linux/sched/prio.h>
 
@@ -1812,6 +1813,10 @@ struct task_struct {
 	unsigned long	task_state_change;
 #endif
 	int pagefault_disabled;
+#ifdef CONFIG_THREAD_LOCAL_ABI
+	size_t thread_local_abi_len;
+	struct thread_local_abi __user *thread_local_abi;
+#endif
 /* CPU-specific state of this task */
 	struct thread_struct thread;
 /*
@@ -3188,4 +3193,31 @@ static inline unsigned long rlimit_max(unsigned int limit)
 	return task_rlimit_max(current, limit);
 }
 
+#ifdef CONFIG_THREAD_LOCAL_ABI
+void thread_local_abi_fork(struct task_struct *t);
+void thread_local_abi_execve(struct task_struct *t);
+void getcpu_cache_handle_notify_resume(struct task_struct *t);
+static inline bool getcpu_cache_active(struct task_struct *t)
+{
+	if (t->thread_local_abi_len < offsetof(struct thread_local_abi, cpu)
+			+ sizeof(t->thread_local_abi->cpu))
+		return false;
+	return true;
+}
+#else
+static inline void thread_local_abi_fork(struct task_struct *t)
+{
+}
+static inline void thread_local_abi_execve(struct task_struct *t)
+{
+}
+static inline void getcpu_cache_handle_notify_resume(struct task_struct *t)
+{
+}
+static inline bool getcpu_cache_active(struct task_struct *t)
+{
+	return false;
+}
+#endif
+
 #endif
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 1324b02..89a107a 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -715,9 +715,11 @@ __SYSCALL(__NR_userfaultfd, sys_userfaultfd)
 __SYSCALL(__NR_membarrier, sys_membarrier)
 #define __NR_mlock2 284
 __SYSCALL(__NR_mlock2, sys_mlock2)
+#define __NR_thread_local_abi 285
+__SYSCALL(__NR_thread_local_abi, sys_thread_local_abi)
 
 #undef __NR_syscalls
-#define __NR_syscalls 285
+#define __NR_syscalls 286
 
 /*
  * All syscalls below here should go away really,
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index 628e6e6..5df5460 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -397,6 +397,7 @@ header-y += tcp_metrics.h
 header-y += telephony.h
 header-y += termios.h
 header-y += thermal.h
+header-y += thread_local_abi.h
 header-y += time.h
 header-y += times.h
 header-y += timex.h
diff --git a/include/uapi/linux/thread_local_abi.h b/include/uapi/linux/thread_local_abi.h
new file mode 100644
index 0000000..6487c92
--- /dev/null
+++ b/include/uapi/linux/thread_local_abi.h
@@ -0,0 +1,37 @@
+#ifndef _UAPI_LINUX_THREAD_LOCAL_ABI_H
+#define _UAPI_LINUX_THREAD_LOCAL_ABI_H
+
+/*
+ * linux/thread_local_abi.h
+ *
+ * thread_local_abi system call API
+ *
+ * Copyright (c) 2015 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/types.h>
+
+/* This structure is an ABI that can only be extended. */
+struct thread_local_abi {
+	int32_t cpu;
+};
+
+#endif /* _UAPI_LINUX_THREAD_LOCAL_ABI_H */
diff --git a/init/Kconfig b/init/Kconfig
index c24b6f7..df29803 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1612,6 +1612,13 @@ config MEMBARRIER
 	  pairs of memory barriers into pairs consisting of membarrier() and a
 	  compiler barrier.
 
+config THREAD_LOCAL_ABI
+	bool "Enable thread-local ABI" if EXPERT
+	default y
+	help
+	  Enable the thread-local ABI system call. It provides a user-space
+	  cache for the current CPU number value.
+
 	  If unsure, say Y.
 
 config EMBEDDED
diff --git a/kernel/Makefile b/kernel/Makefile
index 53abf00..327fbd9 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -103,6 +103,7 @@ obj-$(CONFIG_TORTURE_TEST) += torture.o
 obj-$(CONFIG_MEMBARRIER) += membarrier.o
 
 obj-$(CONFIG_HAS_IOMEM) += memremap.o
+obj-$(CONFIG_THREAD_LOCAL_ABI) += thread_local_abi.o
 
 $(obj)/configs.o: $(obj)/config_data.h
 
diff --git a/kernel/fork.c b/kernel/fork.c
index f97f2c4..42dd565 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1612,6 +1612,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	cgroup_post_fork(p, cgrp_ss_priv);
 	if (clone_flags & CLONE_THREAD)
 		threadgroup_change_end(current);
+	if (!(clone_flags & CLONE_THREAD))
+		thread_local_abi_fork(p);
 	perf_event_fork(p);
 
 	trace_task_newtask(p, clone_flags);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4d568ac..b78f92f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2120,6 +2120,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 
 	p->numa_group = NULL;
 #endif /* CONFIG_NUMA_BALANCING */
+#ifdef CONFIG_THREAD_LOCAL_ABI
+	p->thread_local_abi_len = 0;
+	p->thread_local_abi = NULL;
+#endif
 }
 
 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index efd3bfc..d828b97 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -957,6 +957,8 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 {
 	set_task_rq(p, cpu);
 #ifdef CONFIG_SMP
+	if (getcpu_cache_active(p))
+		set_tsk_thread_flag(p, TIF_NOTIFY_RESUME);
 	/*
 	 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
 	 * successfuly executed on another CPU. We must ensure that updates of
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 0623787..e803824 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -249,3 +249,6 @@ cond_syscall(sys_execveat);
 
 /* membarrier */
 cond_syscall(sys_membarrier);
+
+/* thread-local ABI */
+cond_syscall(sys_thread_local_abi);
diff --git a/kernel/thread_local_abi.c b/kernel/thread_local_abi.c
new file mode 100644
index 0000000..f05505a
--- /dev/null
+++ b/kernel/thread_local_abi.c
@@ -0,0 +1,92 @@
+/*
+ * Copyright (C) 2015 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ *
+ * thread_local_abi system call
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/uaccess.h>
+#include <linux/syscalls.h>
+
+static int getcpu_cache_update(struct task_struct *t)
+{
+	if (put_user(raw_smp_processor_id(), &t->thread_local_abi->cpu)) {
+		t->thread_local_abi_len = 0;
+		t->thread_local_abi = NULL;
+		return -1;
+	}
+	return 0;
+}
+
+/*
+ * This resume handler should always be executed between a migration
+ * triggered by preemption and return to user-space.
+ */
+void getcpu_cache_handle_notify_resume(struct task_struct *t)
+{
+	BUG_ON(!getcpu_cache_active(t));
+	if (unlikely(t->flags & PF_EXITING))
+		return;
+	if (getcpu_cache_update(t))
+		force_sig(SIGSEGV, t);
+}
+
+/*
+ * If parent process has a thread-local ABI, the child inherits. Only applies
+ * when forking a process, not a thread.
+ */
+void thread_local_abi_fork(struct task_struct *t)
+{
+	t->thread_local_abi_len = current->thread_local_abi_len;
+	t->thread_local_abi = current->thread_local_abi;
+}
+
+void thread_local_abi_execve(struct task_struct *t)
+{
+	t->thread_local_abi_len = 0;
+	t->thread_local_abi = NULL;
+}
+
+/*
+ * sys_thread_local_abi - setup thread-local ABI for caller thread
+ */
+SYSCALL_DEFINE3(thread_local_abi, struct thread_local_abi __user *, tlap,
+		size_t, len, int, flags)
+{
+	size_t minlen;
+
+	if (flags)
+		return -EINVAL;
+	if (current->thread_local_abi && tlap)
+		return -EBUSY;
+	/* Agree on the intersection of userspace and kernel features */
+	if (!tlap)
+		minlen = 0;
+	else
+		minlen = min_t(size_t, len, sizeof(struct thread_local_abi));
+	current->thread_local_abi_len = minlen;
+	current->thread_local_abi = tlap;
+	/*
+	 * Migration checks ->thread_local_abi_len to see if notify_resume
+	 * flag should be set. Therefore, we need to ensure that
+	 * the scheduler sees ->thread_local_abi_len before we update
+	 * the getcpu cache content with the current CPU number.
+	 */
+	barrier();	/* Store thread_local_abi_len before update content */
+	if (getcpu_cache_active(current)) {
+		if (getcpu_cache_update(current))
+			return -EFAULT;
+	}
+	return minlen;
+}
-- 
2.1.4
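
The length negotiation above (the syscall returns the intersection of
the lengths known to userspace and to the kernel) can be probed from
userspace; in this sketch the extended layout and field name are
hypothetical, as only the cpu field at offset 0 exists in this series:

#define _GNU_SOURCE
#include <stddef.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

#define NR_thread_local_abi	326	/* x86-64 number from this series */

struct thread_local_abi_v2 {		/* hypothetical future layout */
	int32_t cpu;			/* offset 0: defined by this patch */
	int32_t future_field;		/* offset 4: not in any kernel yet */
};

static __thread struct thread_local_abi_v2 tlabi2;

/* Returns 1 if the kernel updates future_field, 0 if not, -1 on error. */
static int have_future_field(void)
{
	long ret = syscall(NR_thread_local_abi, &tlabi2, sizeof(tlabi2), 0);

	if (ret < 0)
		return -1;
	/* ret is min(len, sizeof of the kernel's thread_local_abi). */
	return ret >= (long)(offsetof(struct thread_local_abi_v2, future_field)
			+ sizeof(tlabi2.future_field));
}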


* [RFC PATCH 2/2] thread_local_abi: wire up ARM system call
@ 2015-12-10 15:39   ` Mathieu Desnoyers
From: Mathieu Desnoyers @ 2015-12-10 15:39 UTC (permalink / raw)
  To: Thomas Gleixner
  Cc: linux-kernel, Mathieu Desnoyers, Russell King, Catalin Marinas,
	Will Deacon, Paul Turner, Andrew Hunter, Peter Zijlstra,
	Andy Lutomirski, Andi Kleen, Dave Watson, Chris Lameter,
	Ingo Molnar, Ben Maurer, Steven Rostedt, Paul E. McKenney,
	Josh Triplett, Linus Torvalds, Andrew Morton, linux-api

Wire up the thread-local ABI system call on ARM32. Call the
getcpu_cache_handle_notify_resume() function on return to userspace if
the TIF_NOTIFY_RESUME thread flag is set.

This provides a way to implement sched_getcpu() on ARM without
requiring a system call on the fast path.

[ Untested. ]
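
As a sketch of the intended fast path (assuming the registration helper
from patch 1/2; the function name and fallback are illustrative), a
sched_getcpu() replacement could then be layered as:

#define _GNU_SOURCE
#include <linux/thread_local_abi.h>
#include <sched.h>

static __thread struct thread_local_abi tlabi;
static __thread int tlabi_registered;	/* set once registration succeeds */

static inline int my_sched_getcpu(void)
{
	if (tlabi_registered)
		return tlabi.cpu;	/* fast path: a single memory read */
	return sched_getcpu();		/* slow path: vDSO or getcpu syscall */
}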

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
CC: Russell King <linux@arm.linux.org.uk>
CC: Catalin Marinas <catalin.marinas@arm.com>
CC: Will Deacon <will.deacon@arm.com>
CC: Thomas Gleixner <tglx@linutronix.de>
CC: Paul Turner <pjt@google.com>
CC: Andrew Hunter <ahh@google.com>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Andy Lutomirski <luto@amacapital.net>
CC: Andi Kleen <andi@firstfloor.org>
CC: Dave Watson <davejwatson@fb.com>
CC: Chris Lameter <cl@linux.com>
CC: Ingo Molnar <mingo@redhat.com>
CC: Ben Maurer <bmaurer@fb.com>
CC: Steven Rostedt <rostedt@goodmis.org>
CC: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
CC: Josh Triplett <josh@joshtriplett.org>
CC: Linus Torvalds <torvalds@linux-foundation.org>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: linux-api@vger.kernel.org
---
 arch/arm/include/asm/unistd.h      | 2 +-
 arch/arm/include/uapi/asm/unistd.h | 1 +
 arch/arm/kernel/calls.S            | 1 +
 arch/arm/kernel/signal.c           | 2 ++
 4 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/arm/include/asm/unistd.h b/arch/arm/include/asm/unistd.h
index 7b84657..ef55382 100644
--- a/arch/arm/include/asm/unistd.h
+++ b/arch/arm/include/asm/unistd.h
@@ -19,7 +19,7 @@
  * This may need to be greater than __NR_last_syscall+1 in order to
  * account for the padding in the syscall table
  */
-#define __NR_syscalls  (392)
+#define __NR_syscalls  (393)
 
 #define __ARCH_WANT_STAT64
 #define __ARCH_WANT_SYS_GETHOSTNAME
diff --git a/arch/arm/include/uapi/asm/unistd.h b/arch/arm/include/uapi/asm/unistd.h
index 7a2a32a..859433a 100644
--- a/arch/arm/include/uapi/asm/unistd.h
+++ b/arch/arm/include/uapi/asm/unistd.h
@@ -416,6 +416,7 @@
 #define __NR_execveat			(__NR_SYSCALL_BASE+387)
 #define __NR_userfaultfd		(__NR_SYSCALL_BASE+388)
 #define __NR_membarrier			(__NR_SYSCALL_BASE+389)
+#define __NR_thread_local_abi		(__NR_SYSCALL_BASE+390)
 
 /*
  * The following SWIs are ARM private.
diff --git a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S
index fde6c88..82b59cc 100644
--- a/arch/arm/kernel/calls.S
+++ b/arch/arm/kernel/calls.S
@@ -399,6 +399,7 @@
 		CALL(sys_execveat)
 		CALL(sys_userfaultfd)
 		CALL(sys_membarrier)
+/* 390 */	CALL(sys_thread_local_abi)
 #ifndef syscalls_counted
 .equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls
 #define syscalls_counted
diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c
index 7b8f214..feaa514 100644
--- a/arch/arm/kernel/signal.c
+++ b/arch/arm/kernel/signal.c
@@ -594,6 +594,8 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
 			} else {
 				clear_thread_flag(TIF_NOTIFY_RESUME);
 				tracehook_notify_resume(regs);
+				if (getcpu_cache_active(current))
+					getcpu_cache_handle_notify_resume(current);
 			}
 		}
 		local_irq_disable();
-- 
2.1.4


* Re: [RFC PATCH 2/2] thread_local_abi: wire up ARM system call
@ 2015-12-10 16:27   ` Russell King - ARM Linux
From: Russell King - ARM Linux @ 2015-12-10 16:27 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Thomas Gleixner, linux-kernel, Catalin Marinas, Will Deacon,
	Paul Turner, Andrew Hunter, Peter Zijlstra, Andy Lutomirski,
	Andi Kleen, Dave Watson, Chris Lameter, Ingo Molnar, Ben Maurer,
	Steven Rostedt, Paul E. McKenney, Josh Triplett, Linus Torvalds,
	Andrew Morton, linux-api

On Thu, Dec 10, 2015 at 10:39:50AM -0500, Mathieu Desnoyers wrote:
> Wire up the thread-local ABI system call on ARM32. Call the
> getcpu_cache_handle_notify_resume() function on return to userspace if
> the TIF_NOTIFY_RESUME thread flag is set.
> 
> This provides a way to implement sched_getcpu() on ARM without
> requiring a system call on the fast path.
> 
> [ Untested. ]

Why are you sending this _to_ Thomas?  Shouldn't you be sending it to me
as the arch maintainer?

> diff --git a/arch/arm/include/asm/unistd.h b/arch/arm/include/asm/unistd.h
> index 7b84657..ef55382 100644
> --- a/arch/arm/include/asm/unistd.h
> +++ b/arch/arm/include/asm/unistd.h
> @@ -19,7 +19,7 @@
>   * This may need to be greater than __NR_last_syscall+1 in order to
>   * account for the padding in the syscall table
>   */
> -#define __NR_syscalls  (392)
> +#define __NR_syscalls  (393)

That will cause a build error.  Please leave this alone until we get to
syscall 392, whereupon it will need to be incremented by four.
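
The padding arithmetic from arch/arm/kernel/calls.S can be checked with
a small sketch (entry counts assume thread_local_abi takes number 390,
as in this series):

#include <stdio.h>

/* Round-up rule used for the ARM syscall table padding. */
static unsigned int padded(unsigned int entries)
{
	return (entries + 3) & ~3u;
}

int main(void)
{
	printf("%u\n", padded(391));	/* 0..390 used: pads to 392 */
	printf("%u\n", padded(392));	/* syscall 391 added: still 392 */
	printf("%u\n", padded(393));	/* syscall 392 added: 396, bump by 4 */
	return 0;
}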

Also, I tend to wait until after -rc1 before adding any syscalls, when
all the new syscalls are obvious and known - this avoids ending up with
two different trees having allocated the same syscall number (which is
why arch maintainers should be the only people who are responsible for
merging updates to their arch's syscall numbering.)

Sure, if multiple different people end up merging patches via different
routes, the conflicts can be resolved when those different routes come
together, but what happens if someone adds the syscall number that they
thought they had to (e.g.) glibc, and then has to change it later because
come -rc1 it ends up being different...

I'd much rather that all patches to unistd.h are only mergable via the
respective arch maintainers to keep the numbering sane.

(I personally want to follow x86's syscall numbering order as much as
possible.)

-- 
RMK's Patch system: http://www.arm.linux.org.uk/developer/patches/
FTTC broadband for 0.8mile line: currently at 9.6Mbps down 400kbps up
according to speedtest.net.


* Re: [RFC PATCH 2/2] thread_local_abi: wire up ARM system call
@ 2015-12-10 16:59       ` Mathieu Desnoyers
From: Mathieu Desnoyers @ 2015-12-10 16:59 UTC (permalink / raw)
  To: Russell King - ARM Linux
  Cc: Thomas Gleixner, linux-kernel, Catalin Marinas, Will Deacon,
	Paul Turner, Andrew Hunter, Peter Zijlstra, Andy Lutomirski,
	Andi Kleen, Dave Watson, Chris Lameter, Ingo Molnar, Ben Maurer,
	rostedt, Paul E. McKenney, Josh Triplett, Linus Torvalds,
	Andrew Morton, linux-api

----- On Dec 10, 2015, at 11:27 AM, Russell King - ARM Linux linux@arm.linux.org.uk wrote:

> On Thu, Dec 10, 2015 at 10:39:50AM -0500, Mathieu Desnoyers wrote:
>> Wire up the thread-local ABI system call on ARM32. Call the
>> getcpu_cache_handle_notify_resume() function on return to userspace if
>> the TIF_NOTIFY_RESUME thread flag is set.
>> 
>> This provides a way to implement sched_getcpu() on ARM without
>> requiring a system call on the fast path.
>> 
>> [ Untested. ]
> 
> Why are you sending this _to_ Thomas?  Shouldn't you be sending it to me
> as the arch maintainer?

Thomas showed interest in trying it out on ARM, which is why I'm
sending this RFC patch "To" him. Of course, I plan to send it
to you if it goes beyond RFC stage.

> 
>> diff --git a/arch/arm/include/asm/unistd.h b/arch/arm/include/asm/unistd.h
>> index 7b84657..ef55382 100644
>> --- a/arch/arm/include/asm/unistd.h
>> +++ b/arch/arm/include/asm/unistd.h
>> @@ -19,7 +19,7 @@
>>   * This may need to be greater than __NR_last_syscall+1 in order to
>>   * account for the padding in the syscall table
>>   */
>> -#define __NR_syscalls  (392)
>> +#define __NR_syscalls  (393)
> 
> That will cause a build error.  Please leave this alone until we get to
> syscall 392, where upon it will need to be incremented by four.

Oops, right. Will do.

> 
> Also, I tend to wait until after -rc1 before adding any syscalls, when
> all the new syscalls are obvious and known - this avoids ending up with
> two different trees having allocated the same syscall number (which is
> why arch maintainers should be the only people who are responsible for
> merging updates to their arch's syscall numbering.)

Sounds good. Anyway please wait until I send a non-RFC patch before
doing so.

Thanks!

Mathieu

> 
> Sure, if multiple different people end up merging patches via different
> routes, the conflicts can be resolved when those different routes come
> together, but what happens if someone adds the syscall number that they
> thought they had to (eg) glibc, and then have to change it later because
> come -rc1 it ends up being different...
> 
> I'd much rather that all patches to unistd.h are only mergable via the
> respective arch maintainers to keep the numbering sane.
> 
> (I personally want to follow x86's syscall numbering order as much as
> possible.)
> 
> --
> RMK's Patch system: http://www.arm.linux.org.uk/developer/patches/
> FTTC broadband for 0.8mile line: currently at 9.6Mbps down 400kbps up
> according to speedtest.net.

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

* Re: [RFC PATCH 1/2] thread_local_abi system call: caching current CPU number (x86)
@ 2015-12-11 18:56 ` Michael Kerrisk (man-pages)
From: Michael Kerrisk (man-pages) @ 2015-12-11 18:56 UTC (permalink / raw)
  To: Mathieu Desnoyers, Thomas Gleixner
  Cc: mtk.manpages, linux-kernel, Paul Turner, Andrew Hunter,
	Peter Zijlstra, Andy Lutomirski, Andi Kleen, Dave Watson,
	Chris Lameter, Ingo Molnar, Ben Maurer, Steven Rostedt,
	Paul E. McKenney, Josh Triplett, Linus Torvalds, Andrew Morton,
	linux-api

Hi Mathieu,

On 12/10/2015 04:39 PM, Mathieu Desnoyers wrote:
> Expose a new system call allowing threads to register a userspace memory
> area in which to store the current CPU number. Scheduler migration sets
> the TIF_NOTIFY_RESUME flag on the current thread. Upon return to
> user-space, a notify-resume handler updates the current CPU value within
> that user-space memory area.
> 
> This getcpu cache is an alternative to the sched_getcpu() vDSO and has
> a few benefits:
> - It is faster to do a memory read than to call the vDSO,
> - The cached value can be read from within inline assembly, which
>   makes it a useful building block for restartable sequences.
> 
> This approach is inspired by Paul Turner and Andrew Hunter's work
> on percpu atomics, which lets the kernel handle restart of critical
> sections:
> Ref.:
> * https://lkml.org/lkml/2015/6/24/665
> * https://lwn.net/Articles/650333/
> * http://www.linuxplumbersconf.org/2013/ocw/system/presentations/1695/original/LPC%20-%20PerCpu%20Atomics.pdf
> 
> Benchmark of sched_getcpu() vs. the TLS-cache approach for reading the
> current CPU number:

Is there a man page for this system call?

Thanks,

Michael

> - With Linux vdso:            12.7 ns
> - With TLS-cached cpu number:  0.3 ns
> 
> The system call can be extended by registering a larger structure in
> the future.
> 
> Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> CC: Thomas Gleixner <tglx@linutronix.de>
> CC: Paul Turner <pjt@google.com>
> CC: Andrew Hunter <ahh@google.com>
> CC: Peter Zijlstra <peterz@infradead.org>
> CC: Andy Lutomirski <luto@amacapital.net>
> CC: Andi Kleen <andi@firstfloor.org>
> CC: Dave Watson <davejwatson@fb.com>
> CC: Chris Lameter <cl@linux.com>
> CC: Ingo Molnar <mingo@redhat.com>
> CC: Ben Maurer <bmaurer@fb.com>
> CC: Steven Rostedt <rostedt@goodmis.org>
> CC: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
> CC: Josh Triplett <josh@joshtriplett.org>
> CC: Linus Torvalds <torvalds@linux-foundation.org>
> CC: Andrew Morton <akpm@linux-foundation.org>
> CC: Thomas Gleixner <tglx@linutronix.de>
> CC: linux-api@vger.kernel.org
> ---
>  arch/x86/entry/common.c                |  2 +
>  arch/x86/entry/syscalls/syscall_64.tbl |  1 +
>  fs/exec.c                              |  1 +
>  include/linux/sched.h                  | 32 ++++++++++++
>  include/uapi/asm-generic/unistd.h      |  4 +-
>  include/uapi/linux/Kbuild              |  1 +
>  include/uapi/linux/thread_local_abi.h  | 37 ++++++++++++++
>  init/Kconfig                           |  7 +++
>  kernel/Makefile                        |  1 +
>  kernel/fork.c                          |  2 +
>  kernel/sched/core.c                    |  4 ++
>  kernel/sched/sched.h                   |  2 +
>  kernel/sys_ni.c                        |  3 ++
>  kernel/thread_local_abi.c              | 92 ++++++++++++++++++++++++++++++++++
>  14 files changed, 188 insertions(+), 1 deletion(-)
>  create mode 100644 include/uapi/linux/thread_local_abi.h
>  create mode 100644 kernel/thread_local_abi.c
> 
> diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
> index a89fdbc..fdfdb14 100644
> --- a/arch/x86/entry/common.c
> +++ b/arch/x86/entry/common.c
> @@ -249,6 +249,8 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
>  		if (cached_flags & _TIF_NOTIFY_RESUME) {
>  			clear_thread_flag(TIF_NOTIFY_RESUME);
>  			tracehook_notify_resume(regs);
> +			if (getcpu_cache_active(current))
> +				getcpu_cache_handle_notify_resume(current);
>  		}
>  
>  		if (cached_flags & _TIF_USER_RETURN_NOTIFY)
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index 314a90b..748aee3 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -332,6 +332,7 @@
>  323	common	userfaultfd		sys_userfaultfd
>  324	common	membarrier		sys_membarrier
>  325	common	mlock2			sys_mlock2
> +326	common	thread_local_abi	sys_thread_local_abi
>  
>  #
>  # x32-specific system call numbers start at 512 to avoid cache impact
> diff --git a/fs/exec.c b/fs/exec.c
> index b06623a..88490cc 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1594,6 +1594,7 @@ static int do_execveat_common(int fd, struct filename *filename,
>  	/* execve succeeded */
>  	current->fs->in_exec = 0;
>  	current->in_execve = 0;
> +	thread_local_abi_execve(current);
>  	acct_update_integrals(current);
>  	task_numa_free(current);
>  	free_bprm(bprm);
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index edad7a4..b39d9a3 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -2,6 +2,7 @@
>  #define _LINUX_SCHED_H
>  
>  #include <uapi/linux/sched.h>
> +#include <uapi/linux/thread_local_abi.h>
>  
>  #include <linux/sched/prio.h>
>  
> @@ -1812,6 +1813,10 @@ struct task_struct {
>  	unsigned long	task_state_change;
>  #endif
>  	int pagefault_disabled;
> +#ifdef CONFIG_THREAD_LOCAL_ABI
> +	size_t thread_local_abi_len;
> +	struct thread_local_abi __user *thread_local_abi;
> +#endif
>  /* CPU-specific state of this task */
>  	struct thread_struct thread;
>  /*
> @@ -3188,4 +3193,31 @@ static inline unsigned long rlimit_max(unsigned int limit)
>  	return task_rlimit_max(current, limit);
>  }
>  
> +#ifdef CONFIG_THREAD_LOCAL_ABI
> +void thread_local_abi_fork(struct task_struct *t);
> +void thread_local_abi_execve(struct task_struct *t);
> +void getcpu_cache_handle_notify_resume(struct task_struct *t);
> +static inline bool getcpu_cache_active(struct task_struct *t)
> +{
> +	if (t->thread_local_abi_len < offsetof(struct thread_local_abi, cpu)
> +			+ sizeof(t->thread_local_abi->cpu))
> +		return false;
> +	return true;
> +}
> +#else
> +static inline void thread_local_abi_fork(struct task_struct *t)
> +{
> +}
> +static inline void thread_local_abi_execve(struct task_struct *t)
> +{
> +}
> +static inline void getcpu_cache_handle_notify_resume(struct task_struct *t)
> +{
> +}
> +static inline bool getcpu_cache_active(struct task_struct *t)
> +{
> +	return false;
> +}
> +#endif
> +
>  #endif
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index 1324b02..89a107a 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -715,9 +715,11 @@ __SYSCALL(__NR_userfaultfd, sys_userfaultfd)
>  __SYSCALL(__NR_membarrier, sys_membarrier)
>  #define __NR_mlock2 284
>  __SYSCALL(__NR_mlock2, sys_mlock2)
> +#define __NR_thread_local_abi 285
> +__SYSCALL(__NR_thread_local_abi, sys_thread_local_abi)
>  
>  #undef __NR_syscalls
> -#define __NR_syscalls 285
> +#define __NR_syscalls 286
>  
>  /*
>   * All syscalls below here should go away really,
> diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
> index 628e6e6..5df5460 100644
> --- a/include/uapi/linux/Kbuild
> +++ b/include/uapi/linux/Kbuild
> @@ -397,6 +397,7 @@ header-y += tcp_metrics.h
>  header-y += telephony.h
>  header-y += termios.h
>  header-y += thermal.h
> +header-y += thread_local_abi.h
>  header-y += time.h
>  header-y += times.h
>  header-y += timex.h
> diff --git a/include/uapi/linux/thread_local_abi.h b/include/uapi/linux/thread_local_abi.h
> new file mode 100644
> index 0000000..6487c92
> --- /dev/null
> +++ b/include/uapi/linux/thread_local_abi.h
> @@ -0,0 +1,37 @@
> +#ifndef _UAPI_LINUX_THREAD_LOCAL_ABI_H
> +#define _UAPI_LINUX_THREAD_LOCAL_ABI_H
> +
> +/*
> + * linux/thread_local_abi.h
> + *
> + * thread_local_abi system call API
> + *
> + * Copyright (c) 2015 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a copy
> + * of this software and associated documentation files (the "Software"), to deal
> + * in the Software without restriction, including without limitation the rights
> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
> + * copies of the Software, and to permit persons to whom the Software is
> + * furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
> + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> + * SOFTWARE.
> + */
> +
> +#include <linux/types.h>
> +
> +/* This structure is an ABI that can only be extended. */
> +struct thread_local_abi {
> +	int32_t cpu;
> +};
> +
> +#endif /* _UAPI_LINUX_THREAD_LOCAL_ABI_H */
> diff --git a/init/Kconfig b/init/Kconfig
> index c24b6f7..df29803 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1612,6 +1612,13 @@ config MEMBARRIER
>  	  pairs of memory barriers into pairs consisting of membarrier() and a
>  	  compiler barrier.
>  
> +config THREAD_LOCAL_ABI
> +	bool "Enable thread-local ABI" if EXPERT
> +	default y
> +	help
> +	  Enable the thread-local ABI system call. It provides a user-space
> +	  cache for the current CPU number value.
> +
>  	  If unsure, say Y.
>  
>  config EMBEDDED
> diff --git a/kernel/Makefile b/kernel/Makefile
> index 53abf00..327fbd9 100644
> --- a/kernel/Makefile
> +++ b/kernel/Makefile
> @@ -103,6 +103,7 @@ obj-$(CONFIG_TORTURE_TEST) += torture.o
>  obj-$(CONFIG_MEMBARRIER) += membarrier.o
>  
>  obj-$(CONFIG_HAS_IOMEM) += memremap.o
> +obj-$(CONFIG_THREAD_LOCAL_ABI) += thread_local_abi.o
>  
>  $(obj)/configs.o: $(obj)/config_data.h
>  
> diff --git a/kernel/fork.c b/kernel/fork.c
> index f97f2c4..42dd565 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1612,6 +1612,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
>  	cgroup_post_fork(p, cgrp_ss_priv);
>  	if (clone_flags & CLONE_THREAD)
>  		threadgroup_change_end(current);
> +	if (!(clone_flags & CLONE_THREAD))
> +		thread_local_abi_fork(p);
>  	perf_event_fork(p);
>  
>  	trace_task_newtask(p, clone_flags);
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 4d568ac..b78f92f 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2120,6 +2120,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
>  
>  	p->numa_group = NULL;
>  #endif /* CONFIG_NUMA_BALANCING */
> +#ifdef CONFIG_THREAD_LOCAL_ABI
> +	p->thread_local_abi_len = 0;
> +	p->thread_local_abi = NULL;
> +#endif
>  }
>  
>  DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index efd3bfc..d828b97 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -957,6 +957,8 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
>  {
>  	set_task_rq(p, cpu);
>  #ifdef CONFIG_SMP
> +	if (getcpu_cache_active(p))
> +		set_tsk_thread_flag(p, TIF_NOTIFY_RESUME);
>  	/*
>  	 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
>  	 * successfuly executed on another CPU. We must ensure that updates of
> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> index 0623787..e803824 100644
> --- a/kernel/sys_ni.c
> +++ b/kernel/sys_ni.c
> @@ -249,3 +249,6 @@ cond_syscall(sys_execveat);
>  
>  /* membarrier */
>  cond_syscall(sys_membarrier);
> +
> +/* thread-local ABI */
> +cond_syscall(sys_thread_local_abi);
> diff --git a/kernel/thread_local_abi.c b/kernel/thread_local_abi.c
> new file mode 100644
> index 0000000..f05505a
> --- /dev/null
> +++ b/kernel/thread_local_abi.c
> @@ -0,0 +1,92 @@
> +/*
> + * Copyright (C) 2015 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> + *
> + * thread_local_abi system call
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + */
> +
> +#include <linux/init.h>
> +#include <linux/sched.h>
> +#include <linux/uaccess.h>
> +#include <linux/syscalls.h>
> +
> +static int getcpu_cache_update(struct task_struct *t)
> +{
> +	if (put_user(raw_smp_processor_id(), &t->thread_local_abi->cpu)) {
> +		t->thread_local_abi_len = 0;
> +		t->thread_local_abi = NULL;
> +		return -1;
> +	}
> +	return 0;
> +}
> +
> +/*
> + * This resume handler should always be executed between a migration
> + * triggered by preemption and return to user-space.
> + */
> +void getcpu_cache_handle_notify_resume(struct task_struct *t)
> +{
> +	BUG_ON(!getcpu_cache_active(t));
> +	if (unlikely(t->flags & PF_EXITING))
> +		return;
> +	if (getcpu_cache_update(t))
> +		force_sig(SIGSEGV, t);
> +}
> +
> +/*
> + * If the parent process has a thread-local ABI, the child inherits it.
> + * Only applies when forking a process, not a thread.
> + */
> +void thread_local_abi_fork(struct task_struct *t)
> +{
> +	t->thread_local_abi_len = current->thread_local_abi_len;
> +	t->thread_local_abi = current->thread_local_abi;
> +}
> +
> +void thread_local_abi_execve(struct task_struct *t)
> +{
> +	t->thread_local_abi_len = 0;
> +	t->thread_local_abi = NULL;
> +}
> +
> +/*
> + * sys_thread_local_abi - setup thread-local ABI for caller thread
> + */
> +SYSCALL_DEFINE3(thread_local_abi, struct thread_local_abi __user *, tlap,
> +		size_t, len, int, flags)
> +{
> +	size_t minlen;
> +
> +	if (flags)
> +		return -EINVAL;
> +	if (current->thread_local_abi && tlap)
> +		return -EBUSY;
> +	/* Agree on the intersection of userspace and kernel features */
> +	if (!tlap)
> +		minlen = 0;
> +	else
> +		minlen = min_t(size_t, len, sizeof(struct thread_local_abi));
> +	current->thread_local_abi_len = minlen;
> +	current->thread_local_abi = tlap;
> +	/*
> +	 * Migration checks ->thread_local_abi_len to see if notify_resume
> +	 * flag should be set. Therefore, we need to ensure that
> +	 * the scheduler sees ->thread_local_abi_len before we update
> +	 * the getcpu cache content with the current CPU number.
> +	 */
> +	barrier();	/* Store thread_local_abi_len before update content */
> +	if (getcpu_cache_active(current)) {
> +		if (getcpu_cache_update(current))
> +			return -EFAULT;
> +	}
> +	return minlen;
> +}
> 


-- 
Michael Kerrisk
Linux man-pages maintainer; http://www.kernel.org/doc/man-pages/
Linux/UNIX System Programming Training: http://man7.org/training/

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH 1/2] thread_local_abi system call: caching current CPU number (x86)
  2015-12-11 18:56 ` [RFC PATCH 1/2] thread_local_abi system call: caching current CPU number (x86) Michael Kerrisk (man-pages)
@ 2015-12-12 12:40   ` Mathieu Desnoyers
  2015-12-13  8:04       ` Michael Kerrisk (man-pages)
  0 siblings, 1 reply; 18+ messages in thread
From: Mathieu Desnoyers @ 2015-12-12 12:40 UTC (permalink / raw)
  To: Michael Kerrisk
  Cc: Thomas Gleixner, linux-kernel, Paul Turner, Andrew Hunter,
	Peter Zijlstra, Andy Lutomirski, Andi Kleen, Dave Watson,
	Chris Lameter, Ingo Molnar, Ben Maurer, rostedt,
	Paul E. McKenney, Josh Triplett, Linus Torvalds, Andrew Morton,
	linux-api

----- On Dec 11, 2015, at 1:56 PM, Michael Kerrisk mtk.manpages@gmail.com wrote:

> Hi Mathieu,
> 
> On 12/10/2015 04:39 PM, Mathieu Desnoyers wrote:
>> Expose a new system call allowing threads to register a userspace memory
>> area where to store the current CPU number. Scheduler migration sets the
>> TIF_NOTIFY_RESUME flag on the current thread. Upon return to user-space,
>> a notify-resume handler updates the current CPU value within that
>> user-space memory area.
>> 
>> This getcpu cache is an alternative to the sched_getcpu() vdso which has
>> a few benefits:
>> - It is faster to do a memory read than to call a vDSO,
>> - This cached value can be read from within an inline assembly, which
>>   makes it a useful building block for restartable sequences.
>> 
>> This approach is inspired by Paul Turner and Andrew Hunter's work
>> on percpu atomics, which lets the kernel handle restart of critical
>> sections:
>> Ref.:
>> * https://lkml.org/lkml/2015/6/24/665
>> * https://lwn.net/Articles/650333/
>> *
>> http://www.linuxplumbersconf.org/2013/ocw/system/presentations/1695/original/LPC%20-%20PerCpu%20Atomics.pdf
>> 
>> Benchmarking sched_getcpu() vs tls cache approach. Getting the
>> current CPU number:
> 
> Is there a man page for this system call?

Hi Michael,

Not yet. I first want to check whether the overall technical
approach is deemed acceptable before adding documentation.

Adding a manpage is going to be one of the first steps after
we agree on the syscall interface. Or perhaps you are suggesting
that adding a manpage at this RFC stage could help the
interface discussion?

Thanks,

Mathieu

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH 1/2] thread_local_abi system call: caching current CPU number (x86)
@ 2015-12-13  8:04       ` Michael Kerrisk (man-pages)
  0 siblings, 0 replies; 18+ messages in thread
From: Michael Kerrisk (man-pages) @ 2015-12-13  8:04 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Thomas Gleixner, lkml, Paul Turner, Andrew Hunter,
	Peter Zijlstra, Andy Lutomirski, Andi Kleen, Dave Watson,
	Chris Lameter, Ingo Molnar, Ben Maurer, rostedt,
	Paul E. McKenney, Josh Triplett, Linus Torvalds, Andrew Morton,
	linux-api

Hello Mathieu

On 12 December 2015 at 13:40, Mathieu Desnoyers
<mathieu.desnoyers@efficios.com> wrote:
> ----- On Dec 11, 2015, at 1:56 PM, Michael Kerrisk mtk.manpages@gmail.com wrote:
>
>> Hi Mathieu,
>>
>> On 12/10/2015 04:39 PM, Mathieu Desnoyers wrote:
>>> Expose a new system call allowing threads to register a userspace memory
>>> area where to store the current CPU number. Scheduler migration sets the
>>> TIF_NOTIFY_RESUME flag on the current thread. Upon return to user-space,
>>> a notify-resume handler updates the current CPU value within that
>>> user-space memory area.
>>>
>>> This getcpu cache is an alternative to the sched_getcpu() vdso which has
>>> a few benefits:
>>> - It is faster to do a memory read than to call a vDSO,
>>> - This cached value can be read from within an inline assembly, which
>>>   makes it a useful building block for restartable sequences.
>>>
>>> This approach is inspired by Paul Turner and Andrew Hunter's work
>>> on percpu atomics, which lets the kernel handle restart of critical
>>> sections:
>>> Ref.:
>>> * https://lkml.org/lkml/2015/6/24/665
>>> * https://lwn.net/Articles/650333/
>>> *
>>> http://www.linuxplumbersconf.org/2013/ocw/system/presentations/1695/original/LPC%20-%20PerCpu%20Atomics.pdf
>>>
>>> Benchmarking sched_getcpu() vs tls cache approach. Getting the
>>> current CPU number:
>>
>> Is there a man page for this system call?
>
> Hi Michael,
>
> Not yet. I first want to check whether the overall technical
> approach is deemed acceptable before adding documentation.
>
> Adding a manpage is going to be one of the first steps after
> we agree on the syscall interface. Or perhaps you are suggesting
> that adding a manpage at this RFC stage could help the
> interface discussion?

Yup, that's exactly what I'm suggesting.

Thanks,

Michael

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH 1/2] thread_local_abi system call: caching current CPU number (x86)
  2015-12-10 15:39 ` Mathieu Desnoyers
                   ` (2 preceding siblings ...)
  (?)
@ 2015-12-13 18:15 ` Andi Kleen
  2015-12-13 19:58     ` Mathieu Desnoyers
  -1 siblings, 1 reply; 18+ messages in thread
From: Andi Kleen @ 2015-12-13 18:15 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Thomas Gleixner, linux-kernel, Paul Turner, Andrew Hunter,
	Peter Zijlstra, Andy Lutomirski, Andi Kleen, Dave Watson,
	Chris Lameter, Ingo Molnar, Ben Maurer, Steven Rostedt,
	Paul E. McKenney, Josh Triplett, Linus Torvalds, Andrew Morton,
	linux-api

> This getcpu cache is an alternative to the sched_getcpu() vdso which has
> a few benefits:


Note the first version of getcpu() I proposed had a cache. But it was
rejected.

> - It is faster to do a memory read than to call a vDSO,
> - This cached value can be read from within an inline assembly, which
>   makes it a useful building block for restartable sequences.

On x86 we already have the de-facto ABI of using LSL with the magic
segment directly. While that is a few cycles slower than a memory load,
I question whether the difference is big enough to justify a new system
call, and risk a slow page fault in context switches.
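
For reference, a minimal sketch of the LSL trick referred to here; the
0x7b per-cpu segment selector and the (node << 12) | cpu bit layout
follow the historical x86-64 vgetcpu code and are assumptions on my
part, not something this patch defines:

static inline unsigned int lsl_getcpu(void)
{
        unsigned int p;

        /* Read the limit of the magic per-cpu GDT segment. */
        asm volatile("lsl %1, %0" : "=r" (p) : "r" (0x7bU));
        return p & 0xfff;       /* low 12 bits: current CPU number */
}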

BTW the vdso could also be optimized, I think. For example, glibc today
does some stupid (slow) things with it, like doing double indirect
jumps.

-Andi

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH 1/2] thread_local_abi system call: caching current CPU number (x86)
@ 2015-12-13 19:58     ` Mathieu Desnoyers
  0 siblings, 0 replies; 18+ messages in thread
From: Mathieu Desnoyers @ 2015-12-13 19:58 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Thomas Gleixner, linux-kernel, Paul Turner, Andrew Hunter,
	Peter Zijlstra, Andy Lutomirski, Dave Watson, Chris Lameter,
	Ingo Molnar, Ben Maurer, rostedt, Paul E. McKenney,
	Josh Triplett, Linus Torvalds, Andrew Morton, linux-api

----- On Dec 13, 2015, at 1:15 PM, Andi Kleen andi@firstfloor.org wrote:

>> This getcpu cache is an alternative to the sched_getcpu() vdso which has
>> a few benefits:
> 
> 
> Note the first version of getcpu() I proposed had a cache. But it was
> rejected.
> 
>> - It is faster to do a memory read than to call a vDSO,
>> - This cached value can be read from within an inline assembly, which
>>   makes it a useful building block for restartable sequences.
> 
> On x86 we already have the de-facto ABI of using LSL with the magic
> segment directly. While that is a few cycles slower than a memory load
> I question the difference is big enough to justify a new system call,
> and risk slow page fault in context switches.

In the context of restartable sequences [1] [2], the goal is to turn
atomic operations on per-cpu data into a sequence of simple load/store
operations. Therefore, improving getcpu from 12ns to 0.3ns will have a
significant impact there. These will be used in memory allocators, RCU
read-side in userspace, and tracing fast paths, where we can expect
significant speedups even for those few cycles per call.
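
To illustrate the kind of per-cpu fast path this enables, here is a
minimal user-space sketch; percpu_count, NR_CPUS_MAX and tla are
hypothetical names, and unlike a true restartable sequence (where the
kernel would restart the section) this sketch falls back on a cmpxchg
so it stays correct across migration:

#include <stdint.h>
#include <stdatomic.h>
#include <linux/thread_local_abi.h>

#define NR_CPUS_MAX 4096

static _Atomic uint64_t percpu_count[NR_CPUS_MAX];

/* Registered via the new system call at thread start; see the
 * registration sketch below. */
static __thread struct thread_local_abi tla;

static void percpu_inc(void)
{
        for (;;) {
                int32_t cpu = tla.cpu;  /* one load; kernel keeps it fresh */
                uint64_t old = atomic_load_explicit(&percpu_count[cpu],
                                                    memory_order_relaxed);

                /* The cmpxchg keeps the update correct even if we are
                 * migrated after reading tla.cpu; contention stays
                 * per-cpu in the common case. */
                if (atomic_compare_exchange_weak_explicit(&percpu_count[cpu],
                                &old, old + 1, memory_order_relaxed,
                                memory_order_relaxed))
                        return;
        }
}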

Moreover, AFAIU, restartable sequences cannot do the function call
required by the vdso while within the critical section: those need to
fit entirely within an inline assembly. So this CPU number caching
actually enables restartable sequences, whereas the vdso approach
cannot be used in that context.
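
Concretely, registering the area and reading the cache could look as
follows; this is a sketch against this patch rather than a final API,
tla_register/tla_getcpu are hypothetical helper names, and 326 is the
x86-64 syscall number proposed in the patch's syscall table:

#include <unistd.h>
#include <sys/syscall.h>
#include <linux/thread_local_abi.h>

#ifndef __NR_thread_local_abi
#define __NR_thread_local_abi 326
#endif

static __thread struct thread_local_abi tla;

/* Call once per thread; returns the registered length, or -1 on error. */
static long tla_register(void)
{
        return syscall(__NR_thread_local_abi, &tla, sizeof(tla), 0);
}

static inline int32_t tla_getcpu(void)
{
        int32_t cpu;

        /* A single TLS load; no function call, so it can sit inside a
         * larger inline-asm critical section. */
        asm volatile("movl %1, %0" : "=r" (cpu) : "m" (tla.cpu));
        return cpu;
}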

Regarding your concern about a slow page fault in context switches,
this updated patch takes care of it: the context switch only sets
TIF_NOTIFY_RESUME, which lets the cache value update be performed on
return to userspace.

Finally, even if overall this new system call is not deemed sufficiently
interesting on x86, other popular architectures such as ARM32 don't have
any vDSO for getcpu at the moment, mainly because they don't have similar
segment selector tricks, and I'm not aware of other solutions than caching
the CPU value for those architectures. So we might very well end up having
to implement this system call for other architectures anyway.

> 
> BTW the vdso could be also optimized I think. For example glibc today
> does some stupid (slow) things with it, like doing double iindirect
> jumps.

I suspect that most of the difference between the vDSO approach and
CPU number caching is simply the function call required for the vDSO.
I doubt there is much to be done on this front.

Thanks,

Mathieu

[1] https://lwn.net/Articles/664645/
[2] https://lkml.org/lkml/2015/10/27/1095

> 
> -Andi

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH 1/2] thread_local_abi system call: caching current CPU number (x86)
@ 2015-12-13 20:18       ` Andi Kleen
  0 siblings, 0 replies; 18+ messages in thread
From: Andi Kleen @ 2015-12-13 20:18 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Andi Kleen, Thomas Gleixner, linux-kernel, Paul Turner,
	Andrew Hunter, Peter Zijlstra, Andy Lutomirski, Dave Watson,
	Chris Lameter, Ingo Molnar, Ben Maurer, rostedt,
	Paul E. McKenney, Josh Triplett, Linus Torvalds, Andrew Morton,
	linux-api

> In the context of restartable sequences [1] [2], the goal is to turn
> atomic operations on per-cpu data into a sequence of simple load/store
> operations. Therefore, improving getcpu from 12ns to 0.3ns will have a

I don't think LSL is 12ns. It's a few cycles.

> Moreover, AFAIU, restartable sequences cannot do the function call
> required by the  vdso while within the c.s.: those need to entirely fit
> within an inline assembly. So this CPU number caching actually enables
> restartable sequences, whereas the vdso approach cannot be used in that
> context.

You can use the LSL directly though. In practice people already rely
on it (and it's very cheap on the kernel side), so it's a de facto ABI
and could be documented.

So it's not function call vs load, but LSL vs load.

> 
> Finally, even if overall this new system call is not deemed sufficiently
> interesting on x86, other popular architectures such as ARM32 don't have
> any vDSO for getcpu at the moment, mainly because they don't have similar
> segment selector tricks, and I'm not aware of other solutions than caching

Has that been confirmed by architecture experts? Maybe there is some
trick there too.

> I suspect that most of the difference between the vDSO approach and
> CPU number caching is simply the function call required for the vDSO.
> I doubt there is much to be done on this front.

Not sure about that. Basic function calls are not that expensive. Right
now there is some baggage but that could be optimized. The only
unavoidable overhead would be the ABI register clobbering.

-Andi

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [RFC PATCH 1/2] thread_local_abi system call: caching current CPU number (x86)
@ 2015-12-13 20:26         ` Andy Lutomirski
  0 siblings, 0 replies; 18+ messages in thread
From: Andy Lutomirski @ 2015-12-13 20:26 UTC (permalink / raw)
  To: Andi Kleen
  Cc: Mathieu Desnoyers, Thomas Gleixner, linux-kernel, Paul Turner,
	Andrew Hunter, Peter Zijlstra, Dave Watson, Chris Lameter,
	Ingo Molnar, Ben Maurer, rostedt, Paul E. McKenney,
	Josh Triplett, Linus Torvalds, Andrew Morton, linux-api

On Sun, Dec 13, 2015 at 12:18 PM, Andi Kleen <andi@firstfloor.org> wrote:
>> In the context of restartable sequences [1] [2], the goal is to turn
>> atomic operations on per-cpu data into a sequence of simple load/store
>> operations. Therefore, improving getcpu from 12ns to 0.3ns will have a
>
> I don't think LSL is 12ns. It's a few cycles.

11ns on my Skylake laptop.  (rdtscp is now almost as fast as lsl.)
FWIW, a failed LSL is 55ns.
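
For comparison, a sketch of the rdtscp variant; that Linux initializes
IA32_TSC_AUX to (node << 12) | cpu, mirroring the LSL encoding, is a
kernel convention I am assuming here, not a documented ABI:

static inline unsigned int rdtscp_getcpu(void)
{
        unsigned int lo, hi, aux;

        /* rdtscp returns the TSC in EDX:EAX and IA32_TSC_AUX in ECX. */
        asm volatile("rdtscp" : "=a" (lo), "=d" (hi), "=c" (aux));
        (void)lo;
        (void)hi;       /* TSC value unused here */
        return aux & 0xfff;     /* low 12 bits: current CPU number */
}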

We could play sneaky tricks and use SGDT instead.  Long term on x86, I
think we should be using per-cpu segments, though.

>
>> Moreover, AFAIU, restartable sequences cannot do the function call
>> required by the  vdso while within the c.s.: those need to entirely fit
>> within an inline assembly. So this CPU number caching actually enables
>> restartable sequences, whereas the vdso approach cannot be used in that
>> context.
>
> You can use the LSL directly though. In practice people already rely
> on it (and it's very cheap on the kernel side), so it's a defacto ABI
> and could be documented.
>
> So it's not function call vs load, but LSL vs load.

I do wonder if the function call itself is cheap enough that we should
do this entirely within the vDSO.  Unfortunately, the vDSO can't use
TLS, so that's not so easy without trickery.

--Andy

^ permalink raw reply	[flat|nested] 18+ messages in thread
