* [RFC PATCH v3 1/2] membarrier: Provide register expedited private command
@ 2017-09-19 22:13 Mathieu Desnoyers
  2017-09-19 22:13 ` [RFC PATCH 2/2] membarrier: selftest: Test private expedited cmd Mathieu Desnoyers
                   ` (2 more replies)
  0 siblings, 3 replies; 16+ messages in thread
From: Mathieu Desnoyers @ 2017-09-19 22:13 UTC (permalink / raw)
  To: Paul E . McKenney, Peter Zijlstra
  Cc: linux-kernel, Mathieu Desnoyers, Boqun Feng, Andrew Hunter,
	Maged Michael, gromer, Avi Kivity, Benjamin Herrenschmidt,
	Paul Mackerras, Michael Ellerman, Dave Watson, Alan Stern,
	Will Deacon, Andy Lutomirski, linux-arch

Provide a new command allowing processes to register their intent to use
the private expedited command.

This allows PowerPC to skip the full memory barrier in switch_mm(), and
only issue the barrier when scheduling into a task belonging to a
process that has registered its intent to use the private expedited command.

Processes are now required to register before using
MEMBARRIER_CMD_PRIVATE_EXPEDITED, otherwise that command returns EPERM.
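
To illustrate the intended usage from user-space, here is a minimal
sketch mirroring what the selftest in patch 2/2 exercises (it assumes a
kernel with this patch applied and the updated <linux/membarrier.h>):

#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

static int sys_membarrier(int cmd, int flags)
{
	return syscall(__NR_membarrier, cmd, flags);
}

int main(void)
{
	/* Register once per process; covers current and future threads. */
	if (sys_membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0))
		perror("register private expedited");

	/* Without the registration above, this fails with errno == EPERM. */
	if (sys_membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0))
		perror("private expedited");
	return 0;
}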

Changes since v1:
- Use test_ti_thread_flag(next, ...) instead of test_thread_flag() in
  powerpc membarrier_arch_sched_in(), given that we want to specifically
  check the next thread state.
- Add missing ARCH_HAS_MEMBARRIER_HOOKS in Kconfig.
- Use task_thread_info() to pass thread_info from task to
  *_ti_thread_flag().

Changes since v2:
- Move membarrier_arch_sched_in() call to finish_task_switch().
- Check for NULL t->mm in membarrier_arch_fork().
- Use membarrier_sched_in() in generic code, which invokes the
  arch-specific membarrier_arch_sched_in(). This fixes allnoconfig
  build on PowerPC.
- Move asm/membarrier.h include under CONFIG_MEMBARRIER, fixing
  allnoconfig build on PowerPC.
- Build and runtime tested on PowerPC.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
CC: Boqun Feng <boqun.feng@gmail.com>
CC: Andrew Hunter <ahh@google.com>
CC: Maged Michael <maged.michael@gmail.com>
CC: gromer@google.com
CC: Avi Kivity <avi@scylladb.com>
CC: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: Paul Mackerras <paulus@samba.org>
CC: Michael Ellerman <mpe@ellerman.id.au>
CC: Dave Watson <davejwatson@fb.com>
CC: Alan Stern <stern@rowland.harvard.edu>
CC: Will Deacon <will.deacon@arm.com>
CC: Andy Lutomirski <luto@kernel.org>
CC: linux-arch@vger.kernel.org
---
 MAINTAINERS                            |  2 ++
 arch/powerpc/Kconfig                   |  1 +
 arch/powerpc/include/asm/membarrier.h  | 40 +++++++++++++++++++++++++
 arch/powerpc/include/asm/thread_info.h |  3 ++
 arch/powerpc/kernel/Makefile           |  2 ++
 arch/powerpc/kernel/membarrier.c       | 45 ++++++++++++++++++++++++++++
 fs/exec.c                              |  1 +
 include/linux/mm_types.h               |  3 ++
 include/linux/sched/mm.h               | 55 ++++++++++++++++++++++++++++++++++
 include/uapi/linux/membarrier.h        | 23 +++++++++-----
 init/Kconfig                           |  3 ++
 kernel/fork.c                          |  2 ++
 kernel/sched/core.c                    | 16 ++--------
 kernel/sched/membarrier.c              | 25 +++++++++++++---
 14 files changed, 197 insertions(+), 24 deletions(-)
 create mode 100644 arch/powerpc/include/asm/membarrier.h
 create mode 100644 arch/powerpc/kernel/membarrier.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 2281af4b41b6..fe6d34d8a2e0 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8805,6 +8805,8 @@ L:	linux-kernel@vger.kernel.org
 S:	Supported
 F:	kernel/sched/membarrier.c
 F:	include/uapi/linux/membarrier.h
+F:	arch/powerpc/kernel/membarrier.c
+F:	arch/powerpc/include/asm/membarrier.h
 
 MEMORY MANAGEMENT
 L:	linux-mm@kvack.org
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 809c468edab1..6f44c5f74f71 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -138,6 +138,7 @@ config PPC
 	select ARCH_HAS_ELF_RANDOMIZE
 	select ARCH_HAS_FORTIFY_SOURCE
 	select ARCH_HAS_GCOV_PROFILE_ALL
+	select ARCH_HAS_MEMBARRIER_HOOKS
 	select ARCH_HAS_SCALED_CPUTIME		if VIRT_CPU_ACCOUNTING_NATIVE
 	select ARCH_HAS_SG_CHAIN
 	select ARCH_HAS_TICK_BROADCAST		if GENERIC_CLOCKEVENTS_BROADCAST
diff --git a/arch/powerpc/include/asm/membarrier.h b/arch/powerpc/include/asm/membarrier.h
new file mode 100644
index 000000000000..ee208455674d
--- /dev/null
+++ b/arch/powerpc/include/asm/membarrier.h
@@ -0,0 +1,40 @@
+#ifndef _ASM_POWERPC_MEMBARRIER_H
+#define _ASM_POWERPC_MEMBARRIER_H
+
+static inline void membarrier_arch_sched_in(struct task_struct *prev,
+		struct task_struct *next)
+{
+	/*
+	 * Only need the full barrier when switching between processes.
+	 */
+	if (likely(!test_ti_thread_flag(task_thread_info(next),
+			TIF_MEMBARRIER_PRIVATE_EXPEDITED)
+				|| prev->mm == next->mm))
+		return;
+
+	/*
+	 * The membarrier system call requires a full memory barrier
+	 * after storing to rq->curr, before going back to user-space.
+	 */
+	smp_mb();
+}
+static inline void membarrier_arch_fork(struct task_struct *t,
+		unsigned long clone_flags)
+{
+	/*
+	 * Coherence of TIF_MEMBARRIER_PRIVATE_EXPEDITED against thread
+	 * fork is protected by siglock. membarrier_arch_fork is called
+	 * with siglock held.
+	 */
+	if (t->mm && t->mm->membarrier_private_expedited)
+		set_ti_thread_flag(task_thread_info(t),
+				TIF_MEMBARRIER_PRIVATE_EXPEDITED);
+}
+static inline void membarrier_arch_execve(struct task_struct *t)
+{
+	clear_ti_thread_flag(task_thread_info(t),
+			TIF_MEMBARRIER_PRIVATE_EXPEDITED);
+}
+void membarrier_arch_register_private_expedited(struct task_struct *t);
+
+#endif /* _ASM_POWERPC_MEMBARRIER_H */
diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
index a941cc6fc3e9..2a208487724b 100644
--- a/arch/powerpc/include/asm/thread_info.h
+++ b/arch/powerpc/include/asm/thread_info.h
@@ -100,6 +100,7 @@ static inline struct thread_info *current_thread_info(void)
 #if defined(CONFIG_PPC64)
 #define TIF_ELF2ABI		18	/* function descriptors must die! */
 #endif
+#define TIF_MEMBARRIER_PRIVATE_EXPEDITED	19	/* membarrier */
 
 /* as above, but as bit values */
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
@@ -119,6 +120,8 @@ static inline struct thread_info *current_thread_info(void)
 #define _TIF_SYSCALL_TRACEPOINT	(1<<TIF_SYSCALL_TRACEPOINT)
 #define _TIF_EMULATE_STACK_STORE	(1<<TIF_EMULATE_STACK_STORE)
 #define _TIF_NOHZ		(1<<TIF_NOHZ)
+#define _TIF_MEMBARRIER_PRIVATE_EXPEDITED \
+				(1<<TIF_MEMBARRIER_PRIVATE_EXPEDITED)
 #define _TIF_SYSCALL_DOTRACE	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
 				 _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \
 				 _TIF_NOHZ)
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 91960f83039c..2dd4b9e3313a 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -135,6 +135,8 @@ endif
 obj-$(CONFIG_EPAPR_PARAVIRT)	+= epapr_paravirt.o epapr_hcalls.o
 obj-$(CONFIG_KVM_GUEST)		+= kvm.o kvm_emul.o
 
+obj-$(CONFIG_MEMBARRIER)	+= membarrier.o
+
 # Disable GCOV & sanitizers in odd or sensitive code
 GCOV_PROFILE_prom_init.o := n
 UBSAN_SANITIZE_prom_init.o := n
diff --git a/arch/powerpc/kernel/membarrier.c b/arch/powerpc/kernel/membarrier.c
new file mode 100644
index 000000000000..b0d79a5f5981
--- /dev/null
+++ b/arch/powerpc/kernel/membarrier.c
@@ -0,0 +1,45 @@
+/*
+ * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ *
+ * membarrier system call - PowerPC architecture code
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/thread_info.h>
+#include <linux/spinlock.h>
+#include <linux/rcupdate.h>
+
+void membarrier_arch_register_private_expedited(struct task_struct *p)
+{
+	struct task_struct *t;
+
+	if (get_nr_threads(p) == 1) {
+		set_thread_flag(TIF_MEMBARRIER_PRIVATE_EXPEDITED);
+		return;
+	}
+	/*
+	 * Coherence of TIF_MEMBARRIER_PRIVATE_EXPEDITED against thread
+	 * fork is protected by siglock.
+	 */
+	spin_lock(&p->sighand->siglock);
+	for_each_thread(p, t)
+		set_ti_thread_flag(task_thread_info(t),
+				TIF_MEMBARRIER_PRIVATE_EXPEDITED);
+	spin_unlock(&p->sighand->siglock);
+	/*
+	 * Ensure all future scheduler executions will observe the new
+	 * thread flag state for this process.
+	 */
+	synchronize_sched();
+}
diff --git a/fs/exec.c b/fs/exec.c
index ac34d9724684..b2448f2731b3 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1802,6 +1802,7 @@ static int do_execveat_common(int fd, struct filename *filename,
 	/* execve succeeded */
 	current->fs->in_exec = 0;
 	current->in_execve = 0;
+	membarrier_execve(current);
 	acct_update_integrals(current);
 	task_numa_free(current);
 	free_bprm(bprm);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 46f4ecf5479a..5e0fe8ce053b 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -445,6 +445,9 @@ struct mm_struct {
 	unsigned long flags; /* Must use atomic bitops to access the bits */
 
 	struct core_state *core_state; /* coredumping support */
+#ifdef CONFIG_MEMBARRIER
+	int membarrier_private_expedited;
+#endif
 #ifdef CONFIG_AIO
 	spinlock_t			ioctx_lock;
 	struct kioctx_table __rcu	*ioctx_table;
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 766cc47c4d7c..b88f85c8474e 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -210,4 +210,59 @@ static inline void memalloc_noreclaim_restore(unsigned int flags)
 	current->flags = (current->flags & ~PF_MEMALLOC) | flags;
 }
 
+#ifdef CONFIG_MEMBARRIER
+
+#ifdef CONFIG_ARCH_HAS_MEMBARRIER_HOOKS
+#include <asm/membarrier.h>
+#else
+static inline void membarrier_arch_sched_in(struct task_struct *prev,
+		struct task_struct *next)
+{
+}
+static inline void membarrier_arch_fork(struct task_struct *t,
+		unsigned long clone_flags)
+{
+}
+static inline void membarrier_arch_execve(struct task_struct *t)
+{
+}
+static inline void membarrier_arch_register_private_expedited(
+		struct task_struct *p)
+{
+}
+#endif
+
+static inline void membarrier_sched_in(struct task_struct *prev,
+		struct task_struct *next)
+{
+	membarrier_arch_sched_in(prev, next);
+}
+static inline void membarrier_fork(struct task_struct *t,
+		unsigned long clone_flags)
+{
+	if (!current->mm || !t->mm)
+		return;
+	t->mm->membarrier_private_expedited =
+		current->mm->membarrier_private_expedited;
+	membarrier_arch_fork(t, clone_flags);
+}
+static inline void membarrier_execve(struct task_struct *t)
+{
+	t->mm->membarrier_private_expedited = 0;
+	membarrier_arch_execve(t);
+}
+#else
+static inline void membarrier_sched_in(struct task_struct *prev,
+		struct task_struct *next)
+{
+}
+static inline void membarrier_fork(struct task_struct *t,
+		unsigned long clone_flags)
+{
+}
+static inline void membarrier_execve(struct task_struct *t)
+{
+}
+#endif
+
 #endif /* _LINUX_SCHED_MM_H */
diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h
index 6d47b3249d8a..4e01ad7ffe98 100644
--- a/include/uapi/linux/membarrier.h
+++ b/include/uapi/linux/membarrier.h
@@ -52,21 +52,30 @@
  *                          (non-running threads are de facto in such a
  *                          state). This only covers threads from the
  *                          same processes as the caller thread. This
- *                          command returns 0. The "expedited" commands
- *                          complete faster than the non-expedited ones,
- *                          they never block, but have the downside of
- *                          causing extra overhead.
+ *                          command returns 0 on success. The
+ *                          "expedited" commands complete faster than
+ *                          the non-expedited ones, they never block,
+ *                          but have the downside of causing extra
+ *                          overhead. A process needs to register its
+ *                          intent to use the private expedited command
+ *                          prior to using it, otherwise this command
+ *                          returns -EPERM.
+ * @MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
+ *                          Register the process intent to use
+ *                          MEMBARRIER_CMD_PRIVATE_EXPEDITED. Always
+ *                          returns 0.
  *
  * Command to be passed to the membarrier system call. The commands need to
  * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to
  * the value 0.
  */
 enum membarrier_cmd {
-	MEMBARRIER_CMD_QUERY			= 0,
-	MEMBARRIER_CMD_SHARED			= (1 << 0),
+	MEMBARRIER_CMD_QUERY				= 0,
+	MEMBARRIER_CMD_SHARED				= (1 << 0),
 	/* reserved for MEMBARRIER_CMD_SHARED_EXPEDITED (1 << 1) */
 	/* reserved for MEMBARRIER_CMD_PRIVATE (1 << 2) */
-	MEMBARRIER_CMD_PRIVATE_EXPEDITED	= (1 << 3),
+	MEMBARRIER_CMD_PRIVATE_EXPEDITED		= (1 << 3),
+	MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED	= (1 << 4),
 };
 
 #endif /* _UAPI_LINUX_MEMBARRIER_H */
diff --git a/init/Kconfig b/init/Kconfig
index 78cb2461012e..a3dc6a66f0d1 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1400,6 +1400,9 @@ config MEMBARRIER
 
 	  If unsure, say Y.
 
+config ARCH_HAS_MEMBARRIER_HOOKS
+	bool
+
 config EMBEDDED
 	bool "Embedded system"
 	option allnoconfig_y
diff --git a/kernel/fork.c b/kernel/fork.c
index 10646182440f..bd4a93915e08 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1840,6 +1840,8 @@ static __latent_entropy struct task_struct *copy_process(
 	 */
 	copy_seccomp(p);
 
+	membarrier_fork(p, clone_flags);
+
 	/*
 	 * Process group and session signals need to be delivered to just the
 	 * parent before the fork or both the parent and the child after the
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7977b25acf54..08095bb1cfe6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2643,17 +2643,8 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	 */
 	prev_state = prev->state;
 	vtime_task_switch(prev);
+	membarrier_sched_in(prev, current);
 	perf_event_task_sched_in(prev, current);
-	/*
-	 * The membarrier system call requires a full memory barrier
-	 * after storing to rq->curr, before going back to user-space.
-	 *
-	 * TODO: This smp_mb__after_unlock_lock can go away if PPC end
-	 * up adding a full barrier to switch_mm(), or we should figure
-	 * out if a smp_mb__after_unlock_lock is really the proper API
-	 * to use.
-	 */
-	smp_mb__after_unlock_lock();
 	finish_lock_switch(rq, prev);
 	finish_arch_post_lock_switch();
 
@@ -3363,9 +3354,8 @@ static void __sched notrace __schedule(bool preempt)
 		 * care of this barrier. For weakly ordered machines for
 		 * which spin_unlock() acts as a RELEASE barrier (only
 		 * arm64 and PowerPC), arm64 has a full barrier in
-		 * switch_to(), and PowerPC has
-		 * smp_mb__after_unlock_lock() before
-		 * finish_lock_switch().
+		 * switch_to(), and PowerPC has a full barrier in
+		 * membarrier_arch_sched_in().
 		 */
 		++*switch_count;
 
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index a92fddc22747..00a2618a36ba 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -26,21 +26,25 @@
  * except MEMBARRIER_CMD_QUERY.
  */
 #define MEMBARRIER_CMD_BITMASK	\
-	(MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED)
+	(MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED	\
+	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED)
 
 static void ipi_mb(void *info)
 {
 	smp_mb();	/* IPIs should be serializing but paranoid. */
 }
 
-static void membarrier_private_expedited(void)
+static int membarrier_private_expedited(void)
 {
 	int cpu;
 	bool fallback = false;
 	cpumask_var_t tmpmask;
 
+	if (!current->mm->membarrier_private_expedited)
+		return -EPERM;
+
 	if (num_online_cpus() == 1)
-		return;
+		return 0;
 
 	/*
 	 * Matches memory barriers around rq->curr modification in
@@ -94,6 +98,17 @@ static void membarrier_private_expedited(void)
 	 * rq->curr modification in scheduler.
 	 */
 	smp_mb();	/* exit from system call is not a mb */
+	return 0;
+}
+
+static void membarrier_register_private_expedited(void)
+{
+	struct task_struct *p = current;
+
+	if (READ_ONCE(p->mm->membarrier_private_expedited))
+		return;
+	WRITE_ONCE(p->mm->membarrier_private_expedited, 1);
+	membarrier_arch_register_private_expedited(p);
 }
 
 /**
@@ -144,7 +159,9 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
 			synchronize_sched();
 		return 0;
 	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
-		membarrier_private_expedited();
+		return membarrier_private_expedited();
+	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
+		membarrier_register_private_expedited();
 		return 0;
 	default:
 		return -EINVAL;
-- 
2.11.0


* [RFC PATCH 2/2] membarrier: selftest: Test private expedited cmd
  2017-09-19 22:13 [RFC PATCH v3 1/2] membarrier: Provide register expedited private command Mathieu Desnoyers
@ 2017-09-19 22:13 ` Mathieu Desnoyers
  2017-09-22  3:22 ` [RFC PATCH v3 1/2] membarrier: Provide register expedited private command Boqun Feng
  2017-09-22  8:59 ` Boqun Feng
  2 siblings, 0 replies; 16+ messages in thread
From: Mathieu Desnoyers @ 2017-09-19 22:13 UTC (permalink / raw)
  To: Paul E . McKenney, Peter Zijlstra
  Cc: linux-kernel, Mathieu Desnoyers, Boqun Feng, Andrew Hunter,
	Maged Michael, gromer, Avi Kivity, Benjamin Herrenschmidt,
	Paul Mackerras, Michael Ellerman, Dave Watson, Alan Stern,
	Will Deacon, Andy Lutomirski, linux-arch

Test the new MEMBARRIER_CMD_PRIVATE_EXPEDITED and
MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED commands.

Add checks expecting specific error values on system calls expected to
fail.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
CC: Boqun Feng <boqun.feng@gmail.com>
CC: Andrew Hunter <ahh@google.com>
CC: Maged Michael <maged.michael@gmail.com>
CC: gromer@google.com
CC: Avi Kivity <avi@scylladb.com>
CC: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: Paul Mackerras <paulus@samba.org>
CC: Michael Ellerman <mpe@ellerman.id.au>
CC: Dave Watson <davejwatson@fb.com>
CC: Alan Stern <stern@rowland.harvard.edu>
CC: Will Deacon <will.deacon@arm.com>
CC: Andy Lutomirski <luto@kernel.org>
CC: linux-arch@vger.kernel.org
---
 .../testing/selftests/membarrier/membarrier_test.c | 109 ++++++++++++++++++---
 1 file changed, 94 insertions(+), 15 deletions(-)

diff --git a/tools/testing/selftests/membarrier/membarrier_test.c b/tools/testing/selftests/membarrier/membarrier_test.c
index 21399fcf1a59..f85657374b59 100644
--- a/tools/testing/selftests/membarrier/membarrier_test.c
+++ b/tools/testing/selftests/membarrier/membarrier_test.c
@@ -15,49 +15,119 @@ static int sys_membarrier(int cmd, int flags)
 static int test_membarrier_cmd_fail(void)
 {
 	int cmd = -1, flags = 0;
+	const char *test_name = "sys membarrier invalid command";
 
 	if (sys_membarrier(cmd, flags) != -1) {
 		ksft_exit_fail_msg(
-			"sys membarrier invalid command test: command = %d, flags = %d. Should fail, but passed\n",
-			cmd, flags);
+			"%s test: command = %d, flags = %d. Should fail, but passed\n",
+			test_name, cmd, flags);
+	}
+	if (errno != EINVAL) {
+		ksft_exit_fail_msg(
+			"%s test: flags = %d. Should return (%d: \"%s\"), but returned (%d: \"%s\").\n",
+			test_name, flags, EINVAL, strerror(EINVAL),
+			errno, strerror(errno));
 	}
 
 	ksft_test_result_pass(
-		"sys membarrier invalid command test: command = %d, flags = %d. Failed as expected\n",
-		cmd, flags);
+		"%s test: command = %d, flags = %d, errno = %d. Failed as expected\n",
+		test_name, cmd, flags, errno);
 	return 0;
 }
 
 static int test_membarrier_flags_fail(void)
 {
 	int cmd = MEMBARRIER_CMD_QUERY, flags = 1;
+	const char *test_name = "sys membarrier MEMBARRIER_CMD_QUERY invalid flags";
 
 	if (sys_membarrier(cmd, flags) != -1) {
 		ksft_exit_fail_msg(
-			"sys membarrier MEMBARRIER_CMD_QUERY invalid flags test: flags = %d. Should fail, but passed\n",
-			flags);
+			"%s test: flags = %d. Should fail, but passed\n",
+			test_name, flags);
+	}
+	if (errno != EINVAL) {
+		ksft_exit_fail_msg(
+			"%s test: flags = %d. Should return (%d: \"%s\"), but returned (%d: \"%s\").\n",
+			test_name, flags, EINVAL, strerror(EINVAL),
+			errno, strerror(errno));
 	}
 
 	ksft_test_result_pass(
-		"sys membarrier MEMBARRIER_CMD_QUERY invalid flags test: flags = %d. Failed as expected\n",
-		flags);
+		"%s test: flags = %d, errno = %d. Failed as expected\n",
+		test_name, flags, errno);
 	return 0;
 }
 
-static int test_membarrier_success(void)
+static int test_membarrier_shared_success(void)
 {
 	int cmd = MEMBARRIER_CMD_SHARED, flags = 0;
-	const char *test_name = "sys membarrier MEMBARRIER_CMD_SHARED\n";
+	const char *test_name = "sys membarrier MEMBARRIER_CMD_SHARED";
+
+	if (sys_membarrier(cmd, flags) != 0) {
+		ksft_exit_fail_msg(
+			"%s test: flags = %d, errno = %d\n",
+			test_name, flags, errno);
+	}
+
+	ksft_test_result_pass(
+		"%s test: flags = %d\n", test_name, flags);
+	return 0;
+}
+
+static int test_membarrier_private_expedited_fail(void)
+{
+	int cmd = MEMBARRIER_CMD_PRIVATE_EXPEDITED, flags = 0;
+	const char *test_name = "sys membarrier MEMBARRIER_CMD_PRIVATE_EXPEDITED not registered failure";
+
+	if (sys_membarrier(cmd, flags) != -1) {
+		ksft_exit_fail_msg(
+			"%s test: flags = %d. Should fail, but passed\n",
+			test_name, flags);
+	}
+	if (errno != EPERM) {
+		ksft_exit_fail_msg(
+			"%s test: flags = %d. Should return (%d: \"%s\"), but returned (%d: \"%s\").\n",
+			test_name, flags, EPERM, strerror(EPERM),
+			errno, strerror(errno));
+	}
+
+	ksft_test_result_pass(
+		"%s test: flags = %d, errno = %d\n",
+		test_name, flags, errno);
+	return 0;
+}
+
+static int test_membarrier_register_private_expedited_success(void)
+{
+	int cmd = MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, flags = 0;
+	const char *test_name = "sys membarrier MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED";
 
 	if (sys_membarrier(cmd, flags) != 0) {
 		ksft_exit_fail_msg(
-			"sys membarrier MEMBARRIER_CMD_SHARED test: flags = %d\n",
-			flags);
+			"%s test: flags = %d, errno = %d\n",
+			test_name, flags, errno);
 	}
 
 	ksft_test_result_pass(
-		"sys membarrier MEMBARRIER_CMD_SHARED test: flags = %d\n",
-		flags);
+		"%s test: flags = %d\n",
+		test_name, flags);
+	return 0;
+}
+
+static int test_membarrier_private_expedited_success(void)
+{
+	int cmd = MEMBARRIER_CMD_PRIVATE_EXPEDITED, flags = 0;
+	const char *test_name = "sys membarrier MEMBARRIER_CMD_PRIVATE_EXPEDITED";
+
+	if (sys_membarrier(cmd, flags) != 0) {
+		ksft_exit_fail_msg(
+			"%s test: flags = %d, errno = %d\n",
+			test_name, flags, errno);
+	}
+
+	ksft_test_result_pass(
+		"%s test: flags = %d\n",
+		test_name, flags);
 	return 0;
 }
 
@@ -71,7 +141,16 @@ static int test_membarrier(void)
 	status = test_membarrier_flags_fail();
 	if (status)
 		return status;
-	status = test_membarrier_success();
+	status = test_membarrier_shared_success();
+	if (status)
+		return status;
+	status = test_membarrier_private_expedited_fail();
+	if (status)
+		return status;
+	status = test_membarrier_register_private_expedited_success();
+	if (status)
+		return status;
+	status = test_membarrier_private_expedited_success();
 	if (status)
 		return status;
 	return 0;
-- 
2.11.0


* Re: [RFC PATCH v3 1/2] membarrier: Provide register expedited private command
  2017-09-19 22:13 [RFC PATCH v3 1/2] membarrier: Provide register expedited private command Mathieu Desnoyers
  2017-09-19 22:13 ` [RFC PATCH 2/2] membarrier: selftest: Test private expedited cmd Mathieu Desnoyers
@ 2017-09-22  3:22 ` Boqun Feng
  2017-09-22  3:30   ` Boqun Feng
  2017-09-22  8:24   ` Peter Zijlstra
  2017-09-22  8:59 ` Boqun Feng
  2 siblings, 2 replies; 16+ messages in thread
From: Boqun Feng @ 2017-09-22  3:22 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Paul E . McKenney, Peter Zijlstra, linux-kernel, Andrew Hunter,
	Maged Michael, gromer, Avi Kivity, Benjamin Herrenschmidt,
	Paul Mackerras, Michael Ellerman, Dave Watson, Alan Stern,
	Will Deacon, Andy Lutomirski, linux-arch


Hi Mathieu,

On Tue, Sep 19, 2017 at 06:13:41PM -0400, Mathieu Desnoyers wrote:
> Provide a new command allowing processes to register their intent to use
> the private expedited command.
> 
> This allows PowerPC to skip the full memory barrier in switch_mm(), and
> only issue the barrier when scheduling into a task belonging to a
> process that has registered to use expedited private.
> 
> Processes are now required to register before using
> MEMBARRIER_CMD_PRIVATE_EXPEDITED, otherwise that command returns EPERM.
> 

Sorry I'm late to the party, but I couldn't stop thinking about whether
we could avoid the registration step altogether, because the
registration makes sys_membarrier() more complex (both in its interface
and in its implementation). So how about we trade off a little bit by
taking some (not all) of the rq->locks?

The idea is that in membarrier_private_expedited(), we go through the
->curr task on each CPU and

1)	If it's a userspace task and its ->mm is matched, we send an ipi

2)	If it's a kernel task, we skip

	(Because there will be an smp_mb() implied by mmdrop() when it
	switches to a userspace task.)

3)	If it's a userspace task and its ->mm is not matched, we take
	the corresponding rq->lock and check rq->curr again, if its ->mm
	matched, we send an ipi, otherwise we do nothing.

	(Because if we observe that rq->curr is not matched while holding
	rq->lock, then when a task with a matching ->mm schedules in, the
	rq->lock pairing along with the smp_mb__after_spinlock() will
	guarantee it observes all memory ops before sys_membarrier().)

membarrier_private_expedited() will look like this if we choose this
way:

void membarrier_private_expedited()
{
	int cpu;
	bool fallback = false;
	cpumask_var_t tmpmask;
	struct rq_flags rf;


	if (num_online_cpus() == 1)
		return;

	smp_mb();

	if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
		/* Fallback for OOM. */
		fallback = true;
	}

	cpus_read_lock();
	for_each_online_cpu(cpu) {
		struct task_struct *p;

		if (cpu == raw_smp_processor_id())
			continue;

		rcu_read_lock();
		p = task_rcu_dereference(&cpu_rq(cpu)->curr);

		if (!p) {
			rcu_read_unlock();
			continue;
		}

		if (p->mm == current->mm) {
			if (!fallback)
				__cpumask_set_cpu(cpu, tmpmask);
			else
				smp_call_function_single(cpu, ipi_mb, NULL, 1);
		}

		if (p->mm == current->mm || !p->mm) {
			rcu_read_unlock();
			continue;
		}

		rcu_read_unlock();
		
		/*
		 * This should be arch-specific code, as we don't
		 * need it anywhere other than on archs without
		 * a smp_mb() in switch_mm() (i.e. powerpc).
		 */
		rq_lock_irq(cpu_rq(cpu), &rf);
		if (p->mm == current->mm) {
			if (!fallback)
				__cpumask_set_cpu(cpu, tmpmask);
			else
				smp_call_function_single(cpu, ipi_mb, NULL, 1);
		}
		rq_unlock_irq(cpu_rq(cpu), &rf);
	}
	if (!fallback) {
		smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
		free_cpumask_var(tmpmask);
	}
	cpus_read_unlock();

	smp_mb();
}

Thoughts?

Regards,
Boqun

> [...]



* Re: [RFC PATCH v3 1/2] membarrier: Provide register expedited private command
  2017-09-22  3:22 ` [RFC PATCH v3 1/2] membarrier: Provide register expedited private command Boqun Feng
@ 2017-09-22  3:30   ` Boqun Feng
  2017-09-22  5:22     ` Mathieu Desnoyers
  2017-09-22  8:24   ` Peter Zijlstra
  1 sibling, 1 reply; 16+ messages in thread
From: Boqun Feng @ 2017-09-22  3:30 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Paul E . McKenney, Peter Zijlstra, linux-kernel, Andrew Hunter,
	Maged Michael, gromer, Avi Kivity, Benjamin Herrenschmidt,
	Paul Mackerras, Michael Ellerman, Dave Watson, Alan Stern,
	Will Deacon, Andy Lutomirski, linux-arch


On Fri, Sep 22, 2017 at 11:22:06AM +0800, Boqun Feng wrote:
> Hi Mathieu,
> 
> [...]
> membarrier_private_expedited() will look like this if we choose this
> way:
> 
> void membarrier_private_expedited()
> {
> 	int cpu;
> 	bool fallback = false;
> 	cpumask_var_t tmpmask;
> 	struct rq_flags rf;
> 
> 
> 	if (num_online_cpus() == 1)
> 		return;
> 
> 	smp_mb();
> 
> 	if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
> 		/* Fallback for OOM. */
> 		fallback = true;
> 	}
> 
> 	cpus_read_lock();
> 	for_each_online_cpu(cpu) {
> 		struct task_struct *p;
> 
> 		if (cpu == raw_smp_processor_id())
> 			continue;
> 
> 		rcu_read_lock();
> 		p = task_rcu_dereference(&cpu_rq(cpu)->curr);
> 
> 		if (!p) {
> 			rcu_read_unlock();
> 			continue;
> 		}
> 
> 		if (p->mm == current->mm) {
> 			if (!fallback)
> 				__cpumask_set_cpu(cpu, tmpmask);
> 			else
> 				smp_call_function_single(cpu, ipi_mb, NULL, 1);
> 		}
> 
> 		if (p->mm == current->mm || !p->mm) {
> 			rcu_read_unlock();
> 			continue;
> 		}
> 
> 		rcu_read_unlock();
> 		
> 		/*
> 		 * This should be a arch-specific code, as we don't
> 		 * need it at else place other than some archs without
> 		 * a smp_mb() in switch_mm() (i.e. powerpc)
> 		 */
> 		rq_lock_irq(cpu_rq(cpu), &rf);
> 		if (p->mm == current->mm) {

Oops, this one should be

		if (cpu_curr(cpu)->mm == current->mm)

> 			if (!fallback)
> 				__cpumask_set_cpu(cpu, tmpmask);
> 			else
> 				smp_call_function_single(cpu, ipi_mb, NULL, 1);

, and this had better be moved out of the rq->lock critical section.

Regards,
Boqun

> 		}
> 		rq_unlock_irq(cpu_rq(cpu), &rf);
> 	}
> 	if (!fallback) {
> 		smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
> 		free_cpumask_var(tmpmask);
> 	}
> 	cpus_read_unlock();
> 
> 	smp_mb();
> }
> 
> Thoughts?
> 
> Regards,
> Boqun
> 
[...]



* Re: [RFC PATCH v3 1/2] membarrier: Provide register expedited private command
  2017-09-22  3:30   ` Boqun Feng
@ 2017-09-22  5:22     ` Mathieu Desnoyers
  0 siblings, 0 replies; 16+ messages in thread
From: Mathieu Desnoyers @ 2017-09-22  5:22 UTC (permalink / raw)
  To: Boqun Feng
  Cc: Paul E. McKenney, Peter Zijlstra, linux-kernel, Andrew Hunter,
	maged michael, gromer, Avi Kivity, Benjamin Herrenschmidt,
	Paul Mackerras, Michael Ellerman, Dave Watson, Alan Stern,
	Will Deacon, Andy Lutomirski, linux-arch

----- On Sep 21, 2017, at 11:30 PM, Boqun Feng boqun.feng@gmail.com wrote:

> On Fri, Sep 22, 2017 at 11:22:06AM +0800, Boqun Feng wrote:
>> Hi Mathieu,
>> 
>> On Tue, Sep 19, 2017 at 06:13:41PM -0400, Mathieu Desnoyers wrote:
>> > [...]
>> 
>> Sorry I'm late for the party, but I couldn't stop thinking whether we
>> could avoid the register thing at all, because the registering makes
>> sys_membarrier() more complex(both for the interface and the
>> implementation). So how about we trade-off a little bit by taking
>> some(not all) the rq->locks?
>> 
>> The idea is in membarrier_private_expedited(), we go through all ->curr
>> on each CPU and
>> 
>> 1)	If it's a userspace task and its ->mm is matched, we send an ipi
>> 
>> 2)	If it's a kernel task, we skip
>> 
>> 	(Because there will be a smp_mb() implied by mmdrop(), when it
>> 	switchs to userspace task).
>> 
>> 3)	If it's a userspace task and its ->mm is not matched, we take
>> 	the corresponding rq->lock and check rq->curr again, if its ->mm
>> 	matched, we send an ipi, otherwise we do nothing.
>> 
>> 	(Because if we observe rq->curr is not matched with rq->lock
>> 	held, when a task having matched ->mm schedules in, the rq->lock
>> 	pairing along with the smp_mb__after_spinlock() will guarantee
>> 	it observes all memory ops before sys_membarrir()).
>> 
>> membarrier_private_expedited() will look like this if we choose this
>> way:
>> 
>> [...]
>> 
>> Thoughts?

Hi Boqun,

The main concern Peter has with the runqueue locking approach
is interference with the scheduler by hitting all CPUs' runqueue
locks repeatedly if someone performs membarrier system calls in
a short loop.

Just reading the rq->curr pointer does not generate as much
overhead as grabbing each rq lock.
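
(As an illustration of that concern, and not something from either
patch: an unprivileged task could sit in a loop like the sketch below,
and with the rq->lock-taking scheme each iteration would contend the
runqueue lock of every CPU currently running a task from another
process.)

#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	/* Each call would take remote rq->locks under the proposed scheme. */
	for (;;)
		syscall(__NR_membarrier, MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0);
}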

Thanks,

Mathieu


>> 
>> Regards,
>> Boqun
>> 
> [...]

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com


* Re: [RFC PATCH v3 1/2] membarrier: Provide register expedited private command
  2017-09-22  3:22 ` [RFC PATCH v3 1/2] membarrier: Provide register expedited private command Boqun Feng
  2017-09-22  3:30   ` Boqun Feng
@ 2017-09-22  8:24   ` Peter Zijlstra
  2017-09-22  8:56     ` Boqun Feng
  1 sibling, 1 reply; 16+ messages in thread
From: Peter Zijlstra @ 2017-09-22  8:24 UTC (permalink / raw)
  To: Boqun Feng
  Cc: Mathieu Desnoyers, Paul E . McKenney, linux-kernel,
	Andrew Hunter, Maged Michael, gromer, Avi Kivity,
	Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	Dave Watson, Alan Stern, Will Deacon, Andy Lutomirski,
	linux-arch

On Fri, Sep 22, 2017 at 11:22:06AM +0800, Boqun Feng wrote:

> The idea is in membarrier_private_expedited(), we go through all ->curr
> on each CPU and 
> 
> 1)	If it's a userspace task and its ->mm is matched, we send an ipi
> 
> 2)	If it's a kernel task, we skip
> 
> 	(Because there will be a smp_mb() implied by mmdrop(), when it
> 	switchs to userspace task).
> 
> 3)	If it's a userspace task and its ->mm is not matched, we take
> 	the corresponding rq->lock and check rq->curr again, if its ->mm
> 	matched, we send an ipi, otherwise we do nothing.
> 
> 	(Because if we observe rq->curr is not matched with rq->lock
> 	held, when a task having matched ->mm schedules in, the rq->lock
> 	pairing along with the smp_mb__after_spinlock() will guarantee
> 	it observes all memory ops before sys_membarrir()).

3) is an insta DoS.


* Re: [RFC PATCH v3 1/2] membarrier: Provide register expedited private command
  2017-09-22  8:24   ` Peter Zijlstra
@ 2017-09-22  8:56     ` Boqun Feng
  0 siblings, 0 replies; 16+ messages in thread
From: Boqun Feng @ 2017-09-22  8:56 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Mathieu Desnoyers, Paul E . McKenney, linux-kernel,
	Andrew Hunter, Maged Michael, gromer, Avi Kivity,
	Benjamin Herrenschmidt, Paul Mackerras, Michael Ellerman,
	Dave Watson, Alan Stern, Will Deacon, Andy Lutomirski,
	linux-arch


On Fri, Sep 22, 2017 at 10:24:41AM +0200, Peter Zijlstra wrote:
> On Fri, Sep 22, 2017 at 11:22:06AM +0800, Boqun Feng wrote:
> 
> > The idea is in membarrier_private_expedited(), we go through all ->curr
> > on each CPU and 
> > 
> > 1)	If it's a userspace task and its ->mm is matched, we send an ipi
> > 
> > 2)	If it's a kernel task, we skip
> > 
> > 	(Because there will be a smp_mb() implied by mmdrop(), when it
> > 	switchs to userspace task).
> > 
> > 3)	If it's a userspace task and its ->mm is not matched, we take
> > 	the corresponding rq->lock and check rq->curr again, if its ->mm
> > 	matched, we send an ipi, otherwise we do nothing.
> > 
> > 	(Because if we observe rq->curr is not matched with rq->lock
> > 	held, when a task having matched ->mm schedules in, the rq->lock
> > 	pairing along with the smp_mb__after_spinlock() will guarantee
> > 	it observes all memory ops before sys_membarrier()).
> 
> 3) is an insta DoS.

OK, fair enough.

Regards,
Boqun

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC PATCH v3 1/2] membarrier: Provide register expedited private command
  2017-09-19 22:13 [RFC PATCH v3 1/2] membarrier: Provide register expedited private command Mathieu Desnoyers
  2017-09-19 22:13 ` [RFC PATCH 2/2] membarrier: selftest: Test private expedited cmd Mathieu Desnoyers
  2017-09-22  3:22 ` [RFC PATCH v3 1/2] membarrier: Provide register expedited private command Boqun Feng
@ 2017-09-22  8:59 ` Boqun Feng
  2017-09-22 15:10   ` Mathieu Desnoyers
  2 siblings, 1 reply; 16+ messages in thread
From: Boqun Feng @ 2017-09-22  8:59 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Paul E . McKenney, Peter Zijlstra, linux-kernel, Andrew Hunter,
	Maged Michael, gromer, Avi Kivity, Benjamin Herrenschmidt,
	Paul Mackerras, Michael Ellerman, Dave Watson, Alan Stern,
	Will Deacon, Andy Lutomirski, linux-arch

On Tue, Sep 19, 2017 at 06:13:41PM -0400, Mathieu Desnoyers wrote:
[...]
> +static inline void membarrier_arch_sched_in(struct task_struct *prev,
> +		struct task_struct *next)
> +{
> +	/*
> +	 * Only need the full barrier when switching between processes.
> +	 */
> +	if (likely(!test_ti_thread_flag(task_thread_info(next),
> +			TIF_MEMBARRIER_PRIVATE_EXPEDITED)
> +				|| prev->mm == next->mm))

And we also don't need the smp_mb() if !prev->mm, because switching from
kernel to user will have a smp_mb() implied by mmdrop()?

> +		return;
> +
> +	/*
> +	 * The membarrier system call requires a full memory barrier
> +	 * after storing to rq->curr, before going back to user-space.
> +	 */
> +	smp_mb();
> +}

[...]

> +static inline void membarrier_fork(struct task_struct *t,
> +		unsigned long clone_flags)
> +{
> +	if (!current->mm || !t->mm)
> +		return;
> +	t->mm->membarrier_private_expedited =
> +		current->mm->membarrier_private_expedited;

Have we already done the copy of ->membarrier_private_expedited in
copy_mm()?

Regards,
Boqun

> +	membarrier_arch_fork(t, clone_flags);
> +}
> +static inline void membarrier_execve(struct task_struct *t)
> +{
> +	t->mm->membarrier_private_expedited = 0;
> +	membarrier_arch_execve(t);
> +}
> +#else
> +static inline void membarrier_sched_in(struct task_struct *prev,
> +		struct task_struct *next)
> +{
> +}
> +static inline void membarrier_fork(struct task_struct *t,
> +		unsigned long clone_flags)
> +{
> +}
> +static inline void membarrier_execve(struct task_struct *t)
> +{
> +}
> +#endif
> +
[...]

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC PATCH v3 1/2] membarrier: Provide register expedited private command
  2017-09-22  8:59 ` Boqun Feng
@ 2017-09-22 15:10   ` Mathieu Desnoyers
  2017-09-24 13:30     ` Boqun Feng
  0 siblings, 1 reply; 16+ messages in thread
From: Mathieu Desnoyers @ 2017-09-22 15:10 UTC (permalink / raw)
  To: Boqun Feng
  Cc: Paul E. McKenney, Peter Zijlstra, linux-kernel, Andrew Hunter,
	maged michael, gromer, Avi Kivity, Benjamin Herrenschmidt,
	Paul Mackerras, Michael Ellerman, Dave Watson, Alan Stern,
	Will Deacon, Andy Lutomirski, linux-arch

----- On Sep 22, 2017, at 4:59 AM, Boqun Feng boqun.feng@gmail.com wrote:

> On Tue, Sep 19, 2017 at 06:13:41PM -0400, Mathieu Desnoyers wrote:
> [...]
>> +static inline void membarrier_arch_sched_in(struct task_struct *prev,
>> +		struct task_struct *next)
>> +{
>> +	/*
>> +	 * Only need the full barrier when switching between processes.
>> +	 */
>> +	if (likely(!test_ti_thread_flag(task_thread_info(next),
>> +			TIF_MEMBARRIER_PRIVATE_EXPEDITED)
>> +				|| prev->mm == next->mm))
> 
> And we also don't need the smp_mb() if !prev->mm, because switching from
> kernel to user will have a smp_mb() implied by mmdrop()?

Right. And we also don't need it when switching from userspace to kernel
thread either. Something like this:

static inline void membarrier_arch_sched_in(struct task_struct *prev,
                struct task_struct *next)
{
        /*
         * Only need the full barrier when switching between processes.
         * Barrier when switching from kernel to userspace is not
         * required here, given that it is implied by mmdrop(). Barrier
         * when switching from userspace to kernel is not needed after
         * store to rq->curr.
         */
        if (likely(!test_ti_thread_flag(task_thread_info(next),
                        TIF_MEMBARRIER_PRIVATE_EXPEDITED)
                        || !prev->mm || !next->mm || prev->mm == next->mm))
                return;

        /*
         * The membarrier system call requires a full memory barrier
         * after storing to rq->curr, before going back to user-space.
         */
        smp_mb();
}

> 
>> +		return;
>> +
>> +	/*
>> +	 * The membarrier system call requires a full memory barrier
>> +	 * after storing to rq->curr, before going back to user-space.
>> +	 */
>> +	smp_mb();
>> +}
> 
> [...]
> 
>> +static inline void membarrier_fork(struct task_struct *t,
>> +		unsigned long clone_flags)
>> +{
>> +	if (!current->mm || !t->mm)
>> +		return;
>> +	t->mm->membarrier_private_expedited =
>> +		current->mm->membarrier_private_expedited;
> 
> Have we already done the copy of ->membarrier_private_expedited in
> copy_mm()?

copy_mm() is performed without holding current->sighand->siglock, so
it appears to be racing with concurrent membarrier register cmd.
However, given that it is a single flag updated with WRITE_ONCE()
and read with READ_ONCE(), it might be OK to rely on copy_mm there.
If userspace runs registration concurrently with fork, they should
not expect the child to be specifically registered or unregistered.

So yes, I think you are right about removing this copy and relying on
copy_mm() instead. I also think we can improve membarrier_arch_fork()
on powerpc to test the current thread flag rather than using current->mm.

Which leads to those two changes:

static inline void membarrier_fork(struct task_struct *t,
                unsigned long clone_flags)
{
        /*
         * Prior copy_mm() copies the membarrier_private_expedited field
         * from current->mm to t->mm.
         */
        membarrier_arch_fork(t, clone_flags);
}

And on PowerPC:

static inline void membarrier_arch_fork(struct task_struct *t,
                unsigned long clone_flags)
{
        /*
         * Coherence of TIF_MEMBARRIER_PRIVATE_EXPEDITED against thread
         * fork is protected by siglock. membarrier_arch_fork is called
         * with siglock held.
         */
        if (test_thread_flag(TIF_MEMBARRIER_PRIVATE_EXPEDITED))
                set_ti_thread_flag(task_thread_info(t),
                                TIF_MEMBARRIER_PRIVATE_EXPEDITED);
}

Thanks,

Mathieu


> 
> Regards,
> Boqun
> 
>> +	membarrier_arch_fork(t, clone_flags);
>> +}
>> +static inline void membarrier_execve(struct task_struct *t)
>> +{
>> +	t->mm->membarrier_private_expedited = 0;
>> +	membarrier_arch_execve(t);
>> +}
>> +#else
>> +static inline void membarrier_sched_in(struct task_struct *prev,
>> +		struct task_struct *next)
>> +{
>> +}
>> +static inline void membarrier_fork(struct task_struct *t,
>> +		unsigned long clone_flags)
>> +{
>> +}
>> +static inline void membarrier_execve(struct task_struct *t)
>> +{
>> +}
>> +#endif
>> +
> [...]

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC PATCH v3 1/2] membarrier: Provide register expedited private command
  2017-09-22 15:10   ` Mathieu Desnoyers
@ 2017-09-24 13:30     ` Boqun Feng
  2017-09-24 14:23       ` Mathieu Desnoyers
  0 siblings, 1 reply; 16+ messages in thread
From: Boqun Feng @ 2017-09-24 13:30 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Paul E. McKenney, Peter Zijlstra, linux-kernel, Andrew Hunter,
	maged michael, gromer, Avi Kivity, Benjamin Herrenschmidt,
	Paul Mackerras, Michael Ellerman, Dave Watson, Alan Stern,
	Will Deacon, Andy Lutomirski, linux-arch

On Fri, Sep 22, 2017 at 03:10:10PM +0000, Mathieu Desnoyers wrote:
> ----- On Sep 22, 2017, at 4:59 AM, Boqun Feng boqun.feng@gmail.com wrote:
> 
> > On Tue, Sep 19, 2017 at 06:13:41PM -0400, Mathieu Desnoyers wrote:
> > [...]
> >> +static inline void membarrier_arch_sched_in(struct task_struct *prev,
> >> +		struct task_struct *next)
> >> +{
> >> +	/*
> >> +	 * Only need the full barrier when switching between processes.
> >> +	 */
> >> +	if (likely(!test_ti_thread_flag(task_thread_info(next),
> >> +			TIF_MEMBARRIER_PRIVATE_EXPEDITED)
> >> +				|| prev->mm == next->mm))
> > 
> > And we also don't need the smp_mb() if !prev->mm, because switching from
> > kernel to user will have a smp_mb() implied by mmdrop()?
> 
> Right. And we also don't need it when switching from userspace to kernel

Yep, but this case is covered already, as I think we don't allow kernel
threads to have TIF_MEMBARRIER_PRIVATE_EXPEDITED set, right?

> thread neither. Something like this:
> 
> static inline void membarrier_arch_sched_in(struct task_struct *prev,
>                 struct task_struct *next)
> {
>         /*
>          * Only need the full barrier when switching between processes.
>          * Barrier when switching from kernel to userspace is not
>          * required here, given that it is implied by mmdrop(). Barrier
>          * when switching from userspace to kernel is not needed after
>          * store to rq->curr.
>          */
>         if (likely(!test_ti_thread_flag(task_thread_info(next),
>                         TIF_MEMBARRIER_PRIVATE_EXPEDITED)
>                         || !prev->mm || !next->mm || prev->mm == next->mm))

, so no need to test next->mm here.

>                 return;
> 
>         /*
>          * The membarrier system call requires a full memory barrier
>          * after storing to rq->curr, before going back to user-space.
>          */
>         smp_mb();
> }
> 
> > 
> >> +		return;
> >> +
> >> +	/*
> >> +	 * The membarrier system call requires a full memory barrier
> >> +	 * after storing to rq->curr, before going back to user-space.
> >> +	 */
> >> +	smp_mb();
> >> +}
> > 
> > [...]
> > 
> >> +static inline void membarrier_fork(struct task_struct *t,
> >> +		unsigned long clone_flags)
> >> +{
> >> +	if (!current->mm || !t->mm)
> >> +		return;
> >> +	t->mm->membarrier_private_expedited =
> >> +		current->mm->membarrier_private_expedited;
> > 
> > Have we already done the copy of ->membarrier_private_expedited in
> > copy_mm()?
> 
> copy_mm() is performed without holding current->sighand->siglock, so
> it appears to be racing with concurrent membarrier register cmd.

Speaking of racing, I think we currently have a problem if we do a
register_private_expedited in one thread and do a
membarrier_private_expedited in another thread (sharing the same mm), as
follows:

	{t1,t2,t3 sharing the same ->mm}
	CPU 0				CPU 1				CPU2
	====================		===================		============
	{in thread t1}							
	membarrier_register_private_expedited():
	  ...
	  WRITE_ONCE(->mm->membarrier_private_expedited, 1);
	  membarrier_arch_register_private_expedited():
	    ...
	    <haven't set the TIF for t3 yet>

	    				{in thread t2}
					membarrier_private_expedited():
					  READ_ONCE(->mm->membarrier_private_expedited); // == 1
					  ...
					  for_each_online_cpu()
					    ...
					    <p is cpu_rq(CPU2)->curr>
					    if (p && p->mm == current->mm) // false
					    <so no ipi sent to CPU2>
					    				
									{about to switch to t3}
									rq->curr = t3;
									....
									context_switch():
									  ...
									  finish_task_swtich():
									    membarrier_sched_in():
									    <TIF is not set>
									    // no smp_mb() here.

, and we will miss the smp_mb() on CPU2, right? And this could even
happen if t2 has a membarrier_register_private_expedited() preceding the
membarrier_private_expedited().
					
Am I missing something subtle here?

Regards,
Boqun


> However, given that it is a single flag updated with WRITE_ONCE()
> and read with READ_ONCE(), it might be OK to rely on copy_mm there.
> If userspace runs registration concurrently with fork, they should
> not expect the child to be specifically registered or unregistered.
> 
> So yes, I think you are right about removing this copy and relying on
> copy_mm() instead. I also think we can improve membarrier_arch_fork()
> on powerpc to test the current thread flag rather than using current->mm.
> 
> Which leads to those two changes:
> 
> static inline void membarrier_fork(struct task_struct *t,
>                 unsigned long clone_flags)
> {
>         /*
>          * Prior copy_mm() copies the membarrier_private_expedited field
>          * from current->mm to t->mm.
>          */
>         membarrier_arch_fork(t, clone_flags);
> }
> 
> And on PowerPC:
> 
> static inline void membarrier_arch_fork(struct task_struct *t,
>                 unsigned long clone_flags)
> {
>         /*
>          * Coherence of TIF_MEMBARRIER_PRIVATE_EXPEDITED against thread
>          * fork is protected by siglock. membarrier_arch_fork is called
>          * with siglock held.
>          */
>         if (test_thread_flag(TIF_MEMBARRIER_PRIVATE_EXPEDITED))
>                 set_ti_thread_flag(task_thread_info(t),
>                                 TIF_MEMBARRIER_PRIVATE_EXPEDITED);
> }
> 
> Thanks,
> 
> Mathieu
> 
> 
> > 
> > Regards,
> > Boqun
> > 
> >> +	membarrier_arch_fork(t, clone_flags);
> >> +}
> >> +static inline void membarrier_execve(struct task_struct *t)
> >> +{
> >> +	t->mm->membarrier_private_expedited = 0;
> >> +	membarrier_arch_execve(t);
> >> +}
> >> +#else
> >> +static inline void membarrier_sched_in(struct task_struct *prev,
> >> +		struct task_struct *next)
> >> +{
> >> +}
> >> +static inline void membarrier_fork(struct task_struct *t,
> >> +		unsigned long clone_flags)
> >> +{
> >> +}
> >> +static inline void membarrier_execve(struct task_struct *t)
> >> +{
> >> +}
> >> +#endif
> >> +
> > [...]
> 
> -- 
> Mathieu Desnoyers
> EfficiOS Inc.
> http://www.efficios.com

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC PATCH v3 1/2] membarrier: Provide register expedited private command
  2017-09-24 13:30     ` Boqun Feng
@ 2017-09-24 14:23       ` Mathieu Desnoyers
  2017-09-25 12:10         ` Boqun Feng
  0 siblings, 1 reply; 16+ messages in thread
From: Mathieu Desnoyers @ 2017-09-24 14:23 UTC (permalink / raw)
  To: Boqun Feng
  Cc: Paul E. McKenney, Peter Zijlstra, linux-kernel, Andrew Hunter,
	maged michael, gromer, Avi Kivity, Benjamin Herrenschmidt,
	Paul Mackerras, Michael Ellerman, Dave Watson, Alan Stern,
	Will Deacon, Andy Lutomirski, linux-arch

----- On Sep 24, 2017, at 9:30 AM, Boqun Feng boqun.feng@gmail.com wrote:

> On Fri, Sep 22, 2017 at 03:10:10PM +0000, Mathieu Desnoyers wrote:
>> ----- On Sep 22, 2017, at 4:59 AM, Boqun Feng boqun.feng@gmail.com wrote:
>> 
>> > On Tue, Sep 19, 2017 at 06:13:41PM -0400, Mathieu Desnoyers wrote:
>> > [...]
>> >> +static inline void membarrier_arch_sched_in(struct task_struct *prev,
>> >> +		struct task_struct *next)
>> >> +{
>> >> +	/*
>> >> +	 * Only need the full barrier when switching between processes.
>> >> +	 */
>> >> +	if (likely(!test_ti_thread_flag(task_thread_info(next),
>> >> +			TIF_MEMBARRIER_PRIVATE_EXPEDITED)
>> >> +				|| prev->mm == next->mm))
>> > 
>> > And we also don't need the smp_mb() if !prev->mm, because switching from
>> > kernel to user will have a smp_mb() implied by mmdrop()?
>> 
>> Right. And we also don't need it when switching from userspace to kernel
> 
> Yep, but this case is covered already, as I think we don't allow kernel
> threads to have TIF_MEMBARRIER_PRIVATE_EXPEDITED set, right?

Good point.

> 
>> thread neither. Something like this:
>> 
>> static inline void membarrier_arch_sched_in(struct task_struct *prev,
>>                 struct task_struct *next)
>> {
>>         /*
>>          * Only need the full barrier when switching between processes.
>>          * Barrier when switching from kernel to userspace is not
>>          * required here, given that it is implied by mmdrop(). Barrier
>>          * when switching from userspace to kernel is not needed after
>>          * store to rq->curr.
>>          */
>>         if (likely(!test_ti_thread_flag(task_thread_info(next),
>>                         TIF_MEMBARRIER_PRIVATE_EXPEDITED)
>>                         || !prev->mm || !next->mm || prev->mm == next->mm))
> 
> , so no need to test next->mm here.
> 

Right, it's redundant wrt testing the thread flag.

>>                 return;
>> 
>>         /*
>>          * The membarrier system call requires a full memory barrier
>>          * after storing to rq->curr, before going back to user-space.
>>          */
>>         smp_mb();
>> }
>> 
>> > 
>> >> +		return;
>> >> +
>> >> +	/*
>> >> +	 * The membarrier system call requires a full memory barrier
>> >> +	 * after storing to rq->curr, before going back to user-space.
>> >> +	 */
>> >> +	smp_mb();
>> >> +}
>> > 
>> > [...]
>> > 
>> >> +static inline void membarrier_fork(struct task_struct *t,
>> >> +		unsigned long clone_flags)
>> >> +{
>> >> +	if (!current->mm || !t->mm)
>> >> +		return;
>> >> +	t->mm->membarrier_private_expedited =
>> >> +		current->mm->membarrier_private_expedited;
>> > 
>> > Have we already done the copy of ->membarrier_private_expedited in
>> > copy_mm()?
>> 
>> copy_mm() is performed without holding current->sighand->siglock, so
>> it appears to be racing with concurrent membarrier register cmd.
> 
> Speaking of racing, I think we currently have a problem if we do a
> register_private_expedited in one thread and do a
> membarrier_private_expedited in another thread (sharing the same mm), as
> follows:
> 
>	{t1,t2,t3 sharing the same ->mm}
>	CPU 0				CPU 1				CPU2
>	====================		===================		============
>	{in thread t1}
>	membarrier_register_private_expedited():
>	  ...
>	  WRITE_ONCE(->mm->membarrier_private_expedited, 1);
>	  membarrier_arch_register_private_expedited():
>	    ...
>	    <haven't set the TIF for t3 yet>
> 
>	    				{in thread t2}
>					membarrier_private_expedited():
>					  READ_ONCE(->mm->membarrier_private_expedited); // == 1
>					  ...
>					  for_each_online_cpu()
>					    ...
>					    <p is cpu_rq(CPU2)->curr>
>					    if (p && p->mm == current->mm) // false
>					    <so no ipi sent to CPU2>
>					    				
>									{about to switch to t3}
>									rq->curr = t3;
>									....
>									context_switch():
>									  ...
>									  finish_task_swtich():
>									    membarrier_sched_in():
>									    <TIF is not set>
>									    // no smp_mb() here.
> 
> , and we will miss the smp_mb() on CPU2, right? And this could even
> happen if t2 has a membarrier_register_private_expedited() preceding the
> membarrier_private_expedited().
>					
> Am I missing something subtle here?

I think the problem sits in this function:

static void membarrier_register_private_expedited(void)
{
        struct task_struct *p = current;

        if (READ_ONCE(p->mm->membarrier_private_expedited))
                return;
        WRITE_ONCE(p->mm->membarrier_private_expedited, 1);
        membarrier_arch_register_private_expedited(p);
}

I need to change the order between WRITE_ONCE() and invoking
membarrier_arch_register_private_expedited. If I issue the
WRITE_ONCE after the arch code (which sets the TIF flags),
then concurrent membarrier priv exped commands will simply
return an -EPERM error:

static void membarrier_register_private_expedited(void)
{
        struct task_struct *p = current;

        if (READ_ONCE(p->mm->membarrier_private_expedited))
                return;
        membarrier_arch_register_private_expedited(p);
        WRITE_ONCE(p->mm->membarrier_private_expedited, 1);
}

Do you agree that this would fix the race you identified?

Thanks,

Mathieu


> 
> Regards,
> Boqun
> 
> 
>> However, given that it is a single flag updated with WRITE_ONCE()
>> and read with READ_ONCE(), it might be OK to rely on copy_mm there.
>> If userspace runs registration concurrently with fork, they should
>> not expect the child to be specifically registered or unregistered.
>> 
>> So yes, I think you are right about removing this copy and relying on
>> copy_mm() instead. I also think we can improve membarrier_arch_fork()
>> on powerpc to test the current thread flag rather than using current->mm.
>> 
>> Which leads to those two changes:
>> 
>> static inline void membarrier_fork(struct task_struct *t,
>>                 unsigned long clone_flags)
>> {
>>         /*
>>          * Prior copy_mm() copies the membarrier_private_expedited field
>>          * from current->mm to t->mm.
>>          */
>>         membarrier_arch_fork(t, clone_flags);
>> }
>> 
>> And on PowerPC:
>> 
>> static inline void membarrier_arch_fork(struct task_struct *t,
>>                 unsigned long clone_flags)
>> {
>>         /*
>>          * Coherence of TIF_MEMBARRIER_PRIVATE_EXPEDITED against thread
>>          * fork is protected by siglock. membarrier_arch_fork is called
>>          * with siglock held.
>>          */
>>         if (test_thread_flag(TIF_MEMBARRIER_PRIVATE_EXPEDITED))
>>                 set_ti_thread_flag(task_thread_info(t),
>>                                 TIF_MEMBARRIER_PRIVATE_EXPEDITED);
>> }
>> 
>> Thanks,
>> 
>> Mathieu
>> 
>> 
>> > 
>> > Regards,
>> > Boqun
>> > 
>> >> +	membarrier_arch_fork(t, clone_flags);
>> >> +}
>> >> +static inline void membarrier_execve(struct task_struct *t)
>> >> +{
>> >> +	t->mm->membarrier_private_expedited = 0;
>> >> +	membarrier_arch_execve(t);
>> >> +}
>> >> +#else
>> >> +static inline void membarrier_sched_in(struct task_struct *prev,
>> >> +		struct task_struct *next)
>> >> +{
>> >> +}
>> >> +static inline void membarrier_fork(struct task_struct *t,
>> >> +		unsigned long clone_flags)
>> >> +{
>> >> +}
>> >> +static inline void membarrier_execve(struct task_struct *t)
>> >> +{
>> >> +}
>> >> +#endif
>> >> +
>> > [...]
>> 
>> --
>> Mathieu Desnoyers
>> EfficiOS Inc.
> > http://www.efficios.com

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC PATCH v3 1/2] membarrier: Provide register expedited private command
  2017-09-24 14:23       ` Mathieu Desnoyers
@ 2017-09-25 12:10         ` Boqun Feng
  2017-09-25 12:25           ` Peter Zijlstra
  0 siblings, 1 reply; 16+ messages in thread
From: Boqun Feng @ 2017-09-25 12:10 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Paul E. McKenney, Peter Zijlstra, linux-kernel, Andrew Hunter,
	maged michael, gromer, Avi Kivity, Benjamin Herrenschmidt,
	Paul Mackerras, Michael Ellerman, Dave Watson, Alan Stern,
	Will Deacon, Andy Lutomirski, linux-arch

On Sun, Sep 24, 2017 at 02:23:04PM +0000, Mathieu Desnoyers wrote:
[...]
> >> 
> >> copy_mm() is performed without holding current->sighand->siglock, so
> >> it appears to be racing with concurrent membarrier register cmd.
> > 
> > Speaking of racing, I think we currently have a problem if we do a
> > register_private_expedited in one thread and do a
> > membarrier_private_expedited in another thread (sharing the same mm), as
> > follows:
> > 
> >	{t1,t2,t3 sharing the same ->mm}
> >	CPU 0				CPU 1				CPU2
> >	====================		===================		============
> >	{in thread t1}
> >	membarrier_register_private_expedited():
> >	  ...
> >	  WRITE_ONCE(->mm->membarrier_private_expedited, 1);
> >	  membarrier_arch_register_private_expedited():
> >	    ...
> >	    <haven't set the TIF for t3 yet>
> > 
> >	    				{in thread t2}
> >					membarrier_private_expedited():
> >					  READ_ONCE(->mm->membarrier_private_expedited); // == 1
> >					  ...
> >					  for_each_online_cpu()
> >					    ...
> >					    <p is cpu_rq(CPU2)->curr>
> >					    if (p && p->mm == current->mm) // false
> >					    <so no ipi sent to CPU2>
> >					    				
> >									{about to switch to t3}
> >									rq->curr = t3;
> >									....
> >									context_switch():
> >									  ...
> >									  finish_task_swtich():
> >									    membarrier_sched_in():
> >									    <TIF is not set>
> >									    // no smp_mb() here.
> > 
> > , and we will miss the smp_mb() on CPU2, right? And this could even
> > happen if t2 has a membarrier_register_private_expedited() preceding the
> > membarrier_private_expedited().
> >					
> > Am I missing something subtle here?
> 
> I think the problem sits in this function:
> 
> static void membarrier_register_private_expedited(void)
> {
>         struct task_struct *p = current;
> 
>         if (READ_ONCE(p->mm->membarrier_private_expedited))
>                 return;
>         WRITE_ONCE(p->mm->membarrier_private_expedited, 1);
>         membarrier_arch_register_private_expedited(p);
> }
> 
> I need to change the order between WRITE_ONCE() and invoking
> membarrier_arch_register_private_expedited. If I issue the
> WRITE_ONCE after the arch code (which sets the TIF flags),
> then concurrent membarrier priv exped commands will simply
> return an -EPERM error:
> 
> static void membarrier_register_private_expedited(void)
> {
>         struct task_struct *p = current;
> 
>         if (READ_ONCE(p->mm->membarrier_private_expedited))
>                 return;
>         membarrier_arch_register_private_expedited(p);
>         WRITE_ONCE(p->mm->membarrier_private_expedited, 1);
> }
> 
> Do you agree that this would fix the race you identified?
> 

Yep, that will do the trick ;-)

Regards,
Boqun

> Thanks,
> 
> Mathieu
> 

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC PATCH v3 1/2] membarrier: Provide register expedited private command
  2017-09-25 12:10         ` Boqun Feng
@ 2017-09-25 12:25           ` Peter Zijlstra
  2017-09-25 12:42             ` Mathieu Desnoyers
  0 siblings, 1 reply; 16+ messages in thread
From: Peter Zijlstra @ 2017-09-25 12:25 UTC (permalink / raw)
  To: Boqun Feng
  Cc: Mathieu Desnoyers, Paul E. McKenney, linux-kernel, Andrew Hunter,
	maged michael, gromer, Avi Kivity, Benjamin Herrenschmidt,
	Paul Mackerras, Michael Ellerman, Dave Watson, Alan Stern,
	Will Deacon, Andy Lutomirski, linux-arch

On Mon, Sep 25, 2017 at 08:10:54PM +0800, Boqun Feng wrote:
> > static void membarrier_register_private_expedited(void)
> > {
> >         struct task_struct *p = current;
> > 
> >         if (READ_ONCE(p->mm->membarrier_private_expedited))
> >                 return;
> >         membarrier_arch_register_private_expedited(p);

Should we not then also do:

	    barrier();

> >         WRITE_ONCE(p->mm->membarrier_private_expedited, 1);
> > }

to avoid the compiler lifting that store?

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [RFC PATCH v3 1/2] membarrier: Provide register expedited private command
  2017-09-25 12:25           ` Peter Zijlstra
@ 2017-09-25 12:42             ` Mathieu Desnoyers
  0 siblings, 0 replies; 16+ messages in thread
From: Mathieu Desnoyers @ 2017-09-25 12:42 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Boqun Feng, Paul E. McKenney, linux-kernel, Andrew Hunter,
	maged michael, gromer, Avi Kivity, Benjamin Herrenschmidt,
	Paul Mackerras, Michael Ellerman, Dave Watson, Alan Stern,
	Will Deacon, Andy Lutomirski, linux-arch

----- On Sep 25, 2017, at 8:25 AM, Peter Zijlstra peterz@infradead.org wrote:

> On Mon, Sep 25, 2017 at 08:10:54PM +0800, Boqun Feng wrote:
>> > static void membarrier_register_private_expedited(void)
>> > {
>> >         struct task_struct *p = current;
>> > 
>> >         if (READ_ONCE(p->mm->membarrier_private_expedited))
>> >                 return;
>> >         membarrier_arch_register_private_expedited(p);
> 
> Should we not then also do:
> 
>	    barrier();
> 
>> >         WRITE_ONCE(p->mm->membarrier_private_expedited, 1);
>> > }
> 
> to avoid the compiler lifting that store?

membarrier_arch_register_private_expedited() being a function call, I
recall compilers cannot move loads/stores across it. Moreover,
synchronize_sched() is needed at the end of that function to ensure the
scheduler will observe the thread flags before it returns; if the
function were ever inlined, that call would also act as a compiler
barrier.

So do you think we should still add the barrier() as documentation, or is
having synchronize_sched() in the callee enough?
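
For reference, here is a sketch of the shape that arch hook could take
on powerpc. It is only an illustration assembled from the points made
in this thread (set the thread flag on every thread of the process
under siglock, then synchronize_sched()), not necessarily the exact
patch; the locking detail in particular is an assumption.

void membarrier_arch_register_private_expedited(struct task_struct *p)
{
        struct task_struct *t;

        /*
         * Coherence of TIF_MEMBARRIER_PRIVATE_EXPEDITED against thread
         * fork is protected by siglock, as for membarrier_arch_fork().
         */
        spin_lock_irq(&p->sighand->siglock);
        for_each_thread(p, t)
                set_ti_thread_flag(task_thread_info(t),
                                TIF_MEMBARRIER_PRIVATE_EXPEDITED);
        spin_unlock_irq(&p->sighand->siglock);
        /*
         * Ensure all future scheduler executions observe the new
         * thread flags before registration is published. The call
         * also acts as a compiler barrier at the call site.
         */
        synchronize_sched();
}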

By the way, I think I should add a READ_ONCE() in membarrier_private_expedited
to pair with the WRITE_ONCE() in registration, such as:

        if (!READ_ONCE(current->mm->membarrier_private_expedited))
                return -EPERM;

Thanks!

Mathieu



-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [RFC PATCH 2/2] membarrier: selftest: Test private expedited cmd
  2017-09-18 22:36 [RFC PATCH v2 " Mathieu Desnoyers
@ 2017-09-18 22:36 ` Mathieu Desnoyers
  0 siblings, 0 replies; 16+ messages in thread
From: Mathieu Desnoyers @ 2017-09-18 22:36 UTC (permalink / raw)
  To: Peter Zijlstra, Paul E . McKenney
  Cc: linux-kernel, Mathieu Desnoyers, Boqun Feng, Andrew Hunter,
	Maged Michael, gromer, Avi Kivity, Benjamin Herrenschmidt,
	Paul Mackerras, Michael Ellerman, Dave Watson, Alan Stern,
	Will Deacon, Andy Lutomirski, linux-arch

Test the new MEMBARRIER_CMD_PRIVATE_EXPEDITED and
MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED commands.

Add checks expecting specific error values on system calls expected to
fail.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
CC: Boqun Feng <boqun.feng@gmail.com>
CC: Andrew Hunter <ahh@google.com>
CC: Maged Michael <maged.michael@gmail.com>
CC: gromer@google.com
CC: Avi Kivity <avi@scylladb.com>
CC: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: Paul Mackerras <paulus@samba.org>
CC: Michael Ellerman <mpe@ellerman.id.au>
CC: Dave Watson <davejwatson@fb.com>
CC: Alan Stern <stern@rowland.harvard.edu>
CC: Will Deacon <will.deacon@arm.com>
CC: Andy Lutomirski <luto@kernel.org>
CC: linux-arch@vger.kernel.org
---
 .../testing/selftests/membarrier/membarrier_test.c | 109 ++++++++++++++++++---
 1 file changed, 94 insertions(+), 15 deletions(-)

diff --git a/tools/testing/selftests/membarrier/membarrier_test.c b/tools/testing/selftests/membarrier/membarrier_test.c
index 21399fcf1a59..f85657374b59 100644
--- a/tools/testing/selftests/membarrier/membarrier_test.c
+++ b/tools/testing/selftests/membarrier/membarrier_test.c
@@ -15,49 +15,119 @@ static int sys_membarrier(int cmd, int flags)
 static int test_membarrier_cmd_fail(void)
 {
 	int cmd = -1, flags = 0;
+	const char *test_name = "sys membarrier invalid command";
 
 	if (sys_membarrier(cmd, flags) != -1) {
 		ksft_exit_fail_msg(
-			"sys membarrier invalid command test: command = %d, flags = %d. Should fail, but passed\n",
-			cmd, flags);
+			"%s test: command = %d, flags = %d. Should fail, but passed\n",
+			test_name, cmd, flags);
+	}
+	if (errno != EINVAL) {
+		ksft_exit_fail_msg(
+			"%s test: flags = %d. Should return (%d: \"%s\"), but returned (%d: \"%s\").\n",
+			test_name, flags, EINVAL, strerror(EINVAL),
+			errno, strerror(errno));
 	}
 
 	ksft_test_result_pass(
-		"sys membarrier invalid command test: command = %d, flags = %d. Failed as expected\n",
-		cmd, flags);
+		"%s test: command = %d, flags = %d, errno = %d. Failed as expected\n",
+		test_name, cmd, flags, errno);
 	return 0;
 }
 
 static int test_membarrier_flags_fail(void)
 {
 	int cmd = MEMBARRIER_CMD_QUERY, flags = 1;
+	const char *test_name = "sys membarrier MEMBARRIER_CMD_QUERY invalid flags";
 
 	if (sys_membarrier(cmd, flags) != -1) {
 		ksft_exit_fail_msg(
-			"sys membarrier MEMBARRIER_CMD_QUERY invalid flags test: flags = %d. Should fail, but passed\n",
-			flags);
+			"%s test: flags = %d. Should fail, but passed\n",
+			test_name, flags);
+	}
+	if (errno != EINVAL) {
+		ksft_exit_fail_msg(
+			"%s test: flags = %d. Should return (%d: \"%s\"), but returned (%d: \"%s\").\n",
+			test_name, flags, EINVAL, strerror(EINVAL),
+			errno, strerror(errno));
 	}
 
 	ksft_test_result_pass(
-		"sys membarrier MEMBARRIER_CMD_QUERY invalid flags test: flags = %d. Failed as expected\n",
-		flags);
+		"%s test: flags = %d, errno = %d. Failed as expected\n",
+		test_name, flags, errno);
 	return 0;
 }
 
-static int test_membarrier_success(void)
+static int test_membarrier_shared_success(void)
 {
 	int cmd = MEMBARRIER_CMD_SHARED, flags = 0;
-	const char *test_name = "sys membarrier MEMBARRIER_CMD_SHARED\n";
+	const char *test_name = "sys membarrier MEMBARRIER_CMD_SHARED";
+
+	if (sys_membarrier(cmd, flags) != 0) {
+		ksft_exit_fail_msg(
+			"%s test: flags = %d, errno = %d\n",
+			test_name, flags, errno);
+	}
+
+	ksft_test_result_pass(
+		"%s test: flags = %d\n", test_name, flags);
+	return 0;
+}
+
+static int test_membarrier_private_expedited_fail(void)
+{
+	int cmd = MEMBARRIER_CMD_PRIVATE_EXPEDITED, flags = 0;
+	const char *test_name = "sys membarrier MEMBARRIER_CMD_PRIVATE_EXPEDITED not registered failure";
+
+	if (sys_membarrier(cmd, flags) != -1) {
+		ksft_exit_fail_msg(
+			"%s test: flags = %d. Should fail, but passed\n",
+			test_name, flags);
+	}
+	if (errno != EPERM) {
+		ksft_exit_fail_msg(
+			"%s test: flags = %d. Should return (%d: \"%s\"), but returned (%d: \"%s\").\n",
+			test_name, flags, EPERM, strerror(EPERM),
+			errno, strerror(errno));
+	}
+
+	ksft_test_result_pass(
+		"%s test: flags = %d, errno = %d\n",
+		test_name, flags, errno);
+	return 0;
+}
+
+static int test_membarrier_register_private_expedited_success(void)
+{
+	int cmd = MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, flags = 0;
+	const char *test_name = "sys membarrier MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED";
 
 	if (sys_membarrier(cmd, flags) != 0) {
 		ksft_exit_fail_msg(
-			"sys membarrier MEMBARRIER_CMD_SHARED test: flags = %d\n",
-			flags);
+			"%s test: flags = %d, errno = %d\n",
+			test_name, flags, errno);
 	}
 
 	ksft_test_result_pass(
-		"sys membarrier MEMBARRIER_CMD_SHARED test: flags = %d\n",
-		flags);
+		"%s test: flags = %d\n",
+		test_name, flags);
+	return 0;
+}
+
+static int test_membarrier_private_expedited_success(void)
+{
+	int cmd = MEMBARRIER_CMD_PRIVATE_EXPEDITED, flags = 0;
+	const char *test_name = "sys membarrier MEMBARRIER_CMD_PRIVATE_EXPEDITED";
+
+	if (sys_membarrier(cmd, flags) != 0) {
+		ksft_exit_fail_msg(
+			"%s test: flags = %d, errno = %d\n",
+			test_name, flags, errno);
+	}
+
+	ksft_test_result_pass(
+		"%s test: flags = %d\n",
+		test_name, flags);
 	return 0;
 }
 
@@ -71,7 +141,16 @@ static int test_membarrier(void)
 	status = test_membarrier_flags_fail();
 	if (status)
 		return status;
-	status = test_membarrier_success();
+	status = test_membarrier_shared_success();
+	if (status)
+		return status;
+	status = test_membarrier_private_expedited_fail();
+	if (status)
+		return status;
+	status = test_membarrier_register_private_expedited_success();
+	if (status)
+		return status;
+	status = test_membarrier_private_expedited_success();
 	if (status)
 		return status;
 	return 0;
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [RFC PATCH 2/2] membarrier: selftest: Test private expedited cmd
  2017-09-18 16:52 [RFC PATCH 1/2] membarrier: Provide register expedited private command Mathieu Desnoyers
@ 2017-09-18 16:52 ` Mathieu Desnoyers
  0 siblings, 0 replies; 16+ messages in thread
From: Mathieu Desnoyers @ 2017-09-18 16:52 UTC (permalink / raw)
  To: Peter Zijlstra, Paul E . McKenney
  Cc: linux-kernel, Mathieu Desnoyers, Boqun Feng, Andrew Hunter,
	Maged Michael, gromer, Avi Kivity, Benjamin Herrenschmidt,
	Paul Mackerras, Michael Ellerman, Dave Watson, Alan Stern,
	Will Deacon, Andy Lutomirski, linux-arch

Test the new MEMBARRIER_CMD_PRIVATE_EXPEDITED and
MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED commands.

Add checks expecting specific error values on system calls expected to
fail.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
CC: Peter Zijlstra <peterz@infradead.org>
CC: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
CC: Boqun Feng <boqun.feng@gmail.com>
CC: Andrew Hunter <ahh@google.com>
CC: Maged Michael <maged.michael@gmail.com>
CC: gromer@google.com
CC: Avi Kivity <avi@scylladb.com>
CC: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: Paul Mackerras <paulus@samba.org>
CC: Michael Ellerman <mpe@ellerman.id.au>
CC: Dave Watson <davejwatson@fb.com>
CC: Alan Stern <stern@rowland.harvard.edu>
CC: Will Deacon <will.deacon@arm.com>
CC: Andy Lutomirski <luto@kernel.org>
CC: linux-arch@vger.kernel.org
---
 .../testing/selftests/membarrier/membarrier_test.c | 109 ++++++++++++++++++---
 1 file changed, 94 insertions(+), 15 deletions(-)

diff --git a/tools/testing/selftests/membarrier/membarrier_test.c b/tools/testing/selftests/membarrier/membarrier_test.c
index 21399fcf1a59..f85657374b59 100644
--- a/tools/testing/selftests/membarrier/membarrier_test.c
+++ b/tools/testing/selftests/membarrier/membarrier_test.c
@@ -15,49 +15,119 @@ static int sys_membarrier(int cmd, int flags)
 static int test_membarrier_cmd_fail(void)
 {
 	int cmd = -1, flags = 0;
+	const char *test_name = "sys membarrier invalid command";
 
 	if (sys_membarrier(cmd, flags) != -1) {
 		ksft_exit_fail_msg(
-			"sys membarrier invalid command test: command = %d, flags = %d. Should fail, but passed\n",
-			cmd, flags);
+			"%s test: command = %d, flags = %d. Should fail, but passed\n",
+			test_name, cmd, flags);
+	}
+	if (errno != EINVAL) {
+		ksft_exit_fail_msg(
+			"%s test: flags = %d. Should return (%d: \"%s\"), but returned (%d: \"%s\").\n",
+			test_name, flags, EINVAL, strerror(EINVAL),
+			errno, strerror(errno));
 	}
 
 	ksft_test_result_pass(
-		"sys membarrier invalid command test: command = %d, flags = %d. Failed as expected\n",
-		cmd, flags);
+		"%s test: command = %d, flags = %d, errno = %d. Failed as expected\n",
+		test_name, cmd, flags, errno);
 	return 0;
 }
 
 static int test_membarrier_flags_fail(void)
 {
 	int cmd = MEMBARRIER_CMD_QUERY, flags = 1;
+	const char *test_name = "sys membarrier MEMBARRIER_CMD_QUERY invalid flags";
 
 	if (sys_membarrier(cmd, flags) != -1) {
 		ksft_exit_fail_msg(
-			"sys membarrier MEMBARRIER_CMD_QUERY invalid flags test: flags = %d. Should fail, but passed\n",
-			flags);
+			"%s test: flags = %d. Should fail, but passed\n",
+			test_name, flags);
+	}
+	if (errno != EINVAL) {
+		ksft_exit_fail_msg(
+			"%s test: flags = %d. Should return (%d: \"%s\"), but returned (%d: \"%s\").\n",
+			test_name, flags, EINVAL, strerror(EINVAL),
+			errno, strerror(errno));
 	}
 
 	ksft_test_result_pass(
-		"sys membarrier MEMBARRIER_CMD_QUERY invalid flags test: flags = %d. Failed as expected\n",
-		flags);
+		"%s test: flags = %d, errno = %d. Failed as expected\n",
+		test_name, flags, errno);
 	return 0;
 }
 
-static int test_membarrier_success(void)
+static int test_membarrier_shared_success(void)
 {
 	int cmd = MEMBARRIER_CMD_SHARED, flags = 0;
-	const char *test_name = "sys membarrier MEMBARRIER_CMD_SHARED\n";
+	const char *test_name = "sys membarrier MEMBARRIER_CMD_SHARED";
+
+	if (sys_membarrier(cmd, flags) != 0) {
+		ksft_exit_fail_msg(
+			"%s test: flags = %d, errno = %d\n",
+			test_name, flags, errno);
+	}
+
+	ksft_test_result_pass(
+		"%s test: flags = %d\n", test_name, flags);
+	return 0;
+}
+
+static int test_membarrier_private_expedited_fail(void)
+{
+	int cmd = MEMBARRIER_CMD_PRIVATE_EXPEDITED, flags = 0;
+	const char *test_name = "sys membarrier MEMBARRIER_CMD_PRIVATE_EXPEDITED not registered failure";
+
+	if (sys_membarrier(cmd, flags) != -1) {
+		ksft_exit_fail_msg(
+			"%s test: flags = %d. Should fail, but passed\n",
+			test_name, flags);
+	}
+	if (errno != EPERM) {
+		ksft_exit_fail_msg(
+			"%s test: flags = %d. Should return (%d: \"%s\"), but returned (%d: \"%s\").\n",
+			test_name, flags, EPERM, strerror(EPERM),
+			errno, strerror(errno));
+	}
+
+	ksft_test_result_pass(
+		"%s test: flags = %d, errno = %d\n",
+		test_name, flags, errno);
+	return 0;
+}
+
+static int test_membarrier_register_private_expedited_success(void)
+{
+	int cmd = MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, flags = 0;
+	const char *test_name = "sys membarrier MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED";
 
 	if (sys_membarrier(cmd, flags) != 0) {
 		ksft_exit_fail_msg(
-			"sys membarrier MEMBARRIER_CMD_SHARED test: flags = %d\n",
-			flags);
+			"%s test: flags = %d, errno = %d\n",
+			test_name, flags, errno);
 	}
 
 	ksft_test_result_pass(
-		"sys membarrier MEMBARRIER_CMD_SHARED test: flags = %d\n",
-		flags);
+		"%s test: flags = %d\n",
+		test_name, flags);
+	return 0;
+}
+
+static int test_membarrier_private_expedited_success(void)
+{
+	int cmd = MEMBARRIER_CMD_PRIVATE_EXPEDITED, flags = 0;
+	const char *test_name = "sys membarrier MEMBARRIER_CMD_PRIVATE_EXPEDITED";
+
+	if (sys_membarrier(cmd, flags) != 0) {
+		ksft_exit_fail_msg(
+			"%s test: flags = %d, errno = %d\n",
+			test_name, flags, errno);
+	}
+
+	ksft_test_result_pass(
+		"%s test: flags = %d\n",
+		test_name, flags);
 	return 0;
 }
 
@@ -71,7 +141,16 @@ static int test_membarrier(void)
 	status = test_membarrier_flags_fail();
 	if (status)
 		return status;
-	status = test_membarrier_success();
+	status = test_membarrier_shared_success();
+	if (status)
+		return status;
+	status = test_membarrier_private_expedited_fail();
+	if (status)
+		return status;
+	status = test_membarrier_register_private_expedited_success();
+	if (status)
+		return status;
+	status = test_membarrier_private_expedited_success();
 	if (status)
 		return status;
 	return 0;
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2017-09-25 12:41 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-09-19 22:13 [RFC PATCH v3 1/2] membarrier: Provide register expedited private command Mathieu Desnoyers
2017-09-19 22:13 ` [RFC PATCH 2/2] membarrier: selftest: Test private expedited cmd Mathieu Desnoyers
2017-09-22  3:22 ` [RFC PATCH v3 1/2] membarrier: Provide register expedited private command Boqun Feng
2017-09-22  3:30   ` Boqun Feng
2017-09-22  5:22     ` Mathieu Desnoyers
2017-09-22  8:24   ` Peter Zijlstra
2017-09-22  8:56     ` Boqun Feng
2017-09-22  8:59 ` Boqun Feng
2017-09-22 15:10   ` Mathieu Desnoyers
2017-09-24 13:30     ` Boqun Feng
2017-09-24 14:23       ` Mathieu Desnoyers
2017-09-25 12:10         ` Boqun Feng
2017-09-25 12:25           ` Peter Zijlstra
2017-09-25 12:42             ` Mathieu Desnoyers
  -- strict thread matches above, loose matches on Subject: below --
2017-09-18 22:36 [RFC PATCH v2 " Mathieu Desnoyers
2017-09-18 22:36 ` [RFC PATCH 2/2] membarrier: selftest: Test private expedited cmd Mathieu Desnoyers
2017-09-18 16:52 [RFC PATCH 1/2] membarrier: Provide register expedited private command Mathieu Desnoyers
2017-09-18 16:52 ` [RFC PATCH 2/2] membarrier: selftest: Test private expedited cmd Mathieu Desnoyers
