[RFC PATCH] membarrier: handle nohz_full with expedited thread registration

* [RFC PATCH] membarrier: handle nohz_full with expedited thread registration
@ 2017-01-16 19:51 Mathieu Desnoyers
  2017-01-16 20:15 ` Linus Torvalds
  0 siblings, 1 reply; 11+ messages in thread
From: Mathieu Desnoyers @ 2017-01-16 19:51 UTC (permalink / raw)
  To: Paul E . McKenney
  Cc: linux-kernel, Mathieu Desnoyers, Josh Triplett, KOSAKI Motohiro,
	Steven Rostedt, Nicholas Miell, Ingo Molnar, Alan Cox,
	Lai Jiangshan, Stephen Hemminger, Thomas Gleixner,
	Peter Zijlstra, David Howells, Pranith Kumar, Michael Kerrisk,
	Shuah Khan, Andrew Morton, Linus Torvalds

Threads running on nohz_full CPUs are not considered by
synchronize_sched, but they should be covered by a membarrier system
call with MEMBARRIER_CMD_SHARED command.

Introduce two new commands to membarrier:
MEMBARRIER_CMD_REGISTER_EXPEDITED and
MEMBARRIER_CMD_UNREGISTER_EXPEDITED.

No-hz full threads requiring to receive interrupts to ensure correct
memory ordering pairing compiler barriers with membarrier system call
should register as "expedited" threads.

[ This RFC patch lacks documentation. I mainly want feedback to see if
  everyone is OK with the general approach. ]

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Nicholas Miell <nmiell@comcast.net>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Alan Cox <gnomes@lxorguk.ukuu.org.uk>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Pranith Kumar <bobby.prani@gmail.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c                       |  1 +
 include/linux/sched.h           | 27 +++++++++++++++
 include/uapi/linux/membarrier.h |  6 ++++
 kernel/fork.c                   |  2 ++
 kernel/membarrier.c             | 77 +++++++++++++++++++++++++++++++++++++++--
 5 files changed, 111 insertions(+), 2 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index e579466..2cf1f87 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1771,6 +1771,7 @@ static int do_execveat_common(int fd, struct filename *filename,
 	/* execve succeeded */
 	current->fs->in_exec = 0;
 	current->in_execve = 0;
+	membarrier_execve(current);
 	acct_update_integrals(current);
 	task_numa_free(current);
 	free_bprm(bprm);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ad3ec9e..1242eb9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1998,6 +1998,9 @@ struct task_struct {
 	/* A live task holds one reference. */
 	atomic_t stack_refcount;
 #endif
+#ifdef CONFIG_MEMBARRIER
+	unsigned int membarrier_expedited;
+#endif
 /* CPU-specific state of this task */
 	struct thread_struct thread;
 /*
@@ -3671,4 +3674,28 @@ void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data,
 void cpufreq_remove_update_util_hook(int cpu);
 #endif /* CONFIG_CPU_FREQ */
 
+#ifdef CONFIG_MEMBARRIER
+static inline void membarrier_fork(struct task_struct *t,
+		unsigned long clone_flags)
+{
+	if (clone_flags & CLONE_THREAD)
+		t->membarrier_expedited = 0;
+	else
+		t->membarrier_expedited = current->membarrier_expedited;
+}
+
+static inline void membarrier_execve(struct task_struct *t)
+{
+	t->membarrier_expedited = 0;
+}
+#else
+static inline void membarrier_fork(struct task_struct *t,
+		unsigned long clone_flags)
+{
+}
+static inline void membarrier_execve(struct task_struct *t)
+{
+}
+#endif
+
 #endif
diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h
index e0b108b..4b78f07 100644
--- a/include/uapi/linux/membarrier.h
+++ b/include/uapi/linux/membarrier.h
@@ -40,6 +40,10 @@
  *                          (non-running threads are de facto in such a
  *                          state). This covers threads from all processes
  *                          running on the system. This command returns 0.
+ * @MEMBARRIER_CMD_REGISTER_EXPEDITED:
+ *                          TODO
+ * @MEMBARRIER_CMD_UNREGISTER_EXPEDITED:
+ *                          TODO
  *
  * Command to be passed to the membarrier system call. The commands need to
  * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to
@@ -48,6 +52,8 @@
 enum membarrier_cmd {
 	MEMBARRIER_CMD_QUERY = 0,
 	MEMBARRIER_CMD_SHARED = (1 << 0),
+	MEMBARRIER_CMD_REGISTER_EXPEDITED = (1 << 1),
+	MEMBARRIER_CMD_UNREGISTER_EXPEDITED = (1 << 2),
 };
 
 #endif /* _UAPI_LINUX_MEMBARRIER_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index 11c5c8a..cec23e0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1769,6 +1769,8 @@ static __latent_entropy struct task_struct *copy_process(
 	 */
 	copy_seccomp(p);
 
+	membarrier_fork(p, clone_flags);
+
 	/*
 	 * Process group and session signals need to be delivered to just the
 	 * parent before the fork or both the parent and the child after the
diff --git a/kernel/membarrier.c b/kernel/membarrier.c
index 536c727..65a6fbf 100644
--- a/kernel/membarrier.c
+++ b/kernel/membarrier.c
@@ -16,12 +16,79 @@
 
 #include <linux/syscalls.h>
 #include <linux/membarrier.h>
+#include <linux/tick.h>
+#include <linux/smp.h>
+#include <linux/sched.h>
+
+/*
+ * TODO: private sched.h is needed for runqueue. Should we move the
+ * sched code under kernel/sched/ ?
+ */
+#include "sched/sched.h"
 
 /*
  * Bitmask made from a "or" of all commands within enum membarrier_cmd,
  * except MEMBARRIER_CMD_QUERY.
  */
-#define MEMBARRIER_CMD_BITMASK	(MEMBARRIER_CMD_SHARED)
+#define MEMBARRIER_CMD_BITMASK	\
+	(MEMBARRIER_CMD_SHARED \
+	| MEMBARRIER_CMD_REGISTER_EXPEDITED \
+	| MEMBARRIER_CMD_UNREGISTER_EXPEDITED)
+
+static int membarrier_register_expedited(struct task_struct *t)
+{
+	struct rq *rq;
+
+	if (t->membarrier_expedited == UINT_MAX)
+		return -EOVERFLOW;
+	rq = this_rq();
+	raw_spin_lock(&rq->lock);
+	t->membarrier_expedited++;
+	raw_spin_unlock(&rq->lock);
+	return 0;
+}
+
+static int membarrier_unregister_expedited(struct task_struct *t)
+{
+	struct rq *rq;
+
+	if (!t->membarrier_expedited)
+		return -ENOENT;
+	rq = this_rq();
+	raw_spin_lock(&rq->lock);
+	t->membarrier_expedited--;
+	raw_spin_unlock(&rq->lock);
+	return 0;
+}
+
+static void memory_barrier(void *info)
+{
+	smp_mb();
+}
+
+static void membarrier_nohz_full_expedited(void)
+{
+	int cpu;
+
+	if (!tick_nohz_full_enabled())
+		return;
+	for_each_cpu(cpu, tick_nohz_full_mask) {
+		struct rq *rq;
+		struct task_struct *t;
+
+		rq = cpu_rq(cpu);
+		raw_spin_lock(&rq->lock);
+		t = rq->curr;
+		if (t->membarrier_expedited) {
+			int ret;
+
+			ret = smp_call_function_single(cpu, memory_barrier,
+					NULL, 1);
+			WARN_ON_ONCE(ret);
+		}
+		raw_spin_unlock(&rq->lock);
+	}
+}
 
 /**
  * sys_membarrier - issue memory barriers on a set of threads
@@ -57,9 +124,15 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
 	case MEMBARRIER_CMD_QUERY:
 		return MEMBARRIER_CMD_BITMASK;
 	case MEMBARRIER_CMD_SHARED:
-		if (num_online_cpus() > 1)
+		if (num_online_cpus() > 1) {
 			synchronize_sched();
+			membarrier_nohz_full_expedited();
+		}
 		return 0;
+	case MEMBARRIER_CMD_REGISTER_EXPEDITED:
+		return membarrier_register_expedited(current);
+	case MEMBARRIER_CMD_UNREGISTER_EXPEDITED:
+		return membarrier_unregister_expedited(current);
 	default:
 		return -EINVAL;
 	}
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 11+ messages in thread