linux-kernel.vger.kernel.org archive mirror
* [PATCH tip/core/rcu 0/5] Related non-RCU updates
@ 2017-07-24 21:57 Paul E. McKenney
  2017-07-24 21:58 ` [PATCH tip/core/rcu 1/5] module: Fix pr_fmt() bug for header use of printk Paul E. McKenney
                   ` (5 more replies)
  0 siblings, 6 replies; 68+ messages in thread
From: Paul E. McKenney @ 2017-07-24 21:57 UTC (permalink / raw)
  To: linux-kernel
  Cc: mingo, jiangshanlai, dipankar, akpm, mathieu.desnoyers, josh,
	tglx, peterz, rostedt, dhowells, edumazet, fweisbec, oleg

Hello!

This series contains changes outside of RCU proper, including fixes
for bugs triggered by RCU commits.

1.	Fix pr_fmt() bug for header use of printk, courtesy of Joe Perches.

2.	Remove redundant INIT_TASK_RCU_TREE_PREEMPT() macro.

3.	Allow migrating kthreads into online but inactive CPUs, courtesy
	of Tejun Heo.

4.	Add expedited option to sys_membarrier().

5.	Fix use of smp_processor_id() in preemptible context in cputime,
	courtesy of Wanpeng Li.

							Thanx, Paul

------------------------------------------------------------------------

 arch/blackfin/kernel/module.c   |   39 +++++++++++++++++++++------------------
 include/linux/init_task.h       |    8 +-------
 include/uapi/linux/membarrier.h |   11 +++++++++++
 kernel/membarrier.c             |    7 ++++++-
 kernel/sched/core.c             |    9 +++++++--
 kernel/sched/cputime.c          |    6 +++---
 6 files changed, 49 insertions(+), 31 deletions(-)


* [PATCH tip/core/rcu 1/5] module: Fix pr_fmt() bug for header use of printk
  2017-07-24 21:57 [PATCH tip/core/rcu 0/5] Related non-RCU updates Paul E. McKenney
@ 2017-07-24 21:58 ` Paul E. McKenney
  2017-07-24 21:58 ` [PATCH tip/core/rcu 2/5] init_task: Remove redundant INIT_TASK_RCU_TREE_PREEMPT() macro Paul E. McKenney
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 68+ messages in thread
From: Paul E. McKenney @ 2017-07-24 21:58 UTC (permalink / raw)
  To: linux-kernel
  Cc: mingo, jiangshanlai, dipankar, akpm, mathieu.desnoyers, josh,
	tglx, peterz, rostedt, dhowells, edumazet, fweisbec, oleg,
	Joe Perches, Paul E. McKenney

From: Joe Perches <joe@perches.com>

This commit removes the pr_fmt() macro, replacing it with mod_err() and
mod_debug() macros to avoid errors when using printk() from header files.
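
As a rough illustration (a user-space analogue, not taken from the
patch itself), the problem with a file-wide pr_fmt() that references a
local variable is that any printk() pulled in from a header would
expand to use that variable out of scope; per-call helpers keep the
module pointer explicit:

#include <stdio.h>

struct module { const char *name; };

/* Analogue of the new helpers: the module is an explicit argument. */
#define mod_err(mod, fmt, ...) \
	fprintf(stderr, "module %s: " fmt, (mod)->name, ##__VA_ARGS__)

int main(void)
{
	struct module m = { .name = "example" };

	mod_err(&m, "L1 inst memory allocation failed\n");
	return 0;
}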

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 arch/blackfin/kernel/module.c | 39 +++++++++++++++++++++------------------
 1 file changed, 21 insertions(+), 18 deletions(-)

diff --git a/arch/blackfin/kernel/module.c b/arch/blackfin/kernel/module.c
index 0188c933b155..15af5768c403 100644
--- a/arch/blackfin/kernel/module.c
+++ b/arch/blackfin/kernel/module.c
@@ -4,8 +4,6 @@
  * Licensed under the GPL-2 or later
  */
 
-#define pr_fmt(fmt) "module %s: " fmt, mod->name
-
 #include <linux/moduleloader.h>
 #include <linux/elf.h>
 #include <linux/vmalloc.h>
@@ -16,6 +14,11 @@
 #include <asm/cacheflush.h>
 #include <linux/uaccess.h>
 
+#define mod_err(mod, fmt, ...)						\
+	pr_err("module %s: " fmt, (mod)->name, ##__VA_ARGS__)
+#define mod_debug(mod, fmt, ...)					\
+	pr_debug("module %s: " fmt, (mod)->name, ##__VA_ARGS__)
+
 /* Transfer the section to the L1 memory */
 int
 module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
@@ -44,7 +47,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
 			dest = l1_inst_sram_alloc(s->sh_size);
 			mod->arch.text_l1 = dest;
 			if (dest == NULL) {
-				pr_err("L1 inst memory allocation failed\n");
+				mod_err(mod, "L1 inst memory allocation failed\n");
 				return -1;
 			}
 			dma_memcpy(dest, (void *)s->sh_addr, s->sh_size);
@@ -56,7 +59,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
 			dest = l1_data_sram_alloc(s->sh_size);
 			mod->arch.data_a_l1 = dest;
 			if (dest == NULL) {
-				pr_err("L1 data memory allocation failed\n");
+				mod_err(mod, "L1 data memory allocation failed\n");
 				return -1;
 			}
 			memcpy(dest, (void *)s->sh_addr, s->sh_size);
@@ -68,7 +71,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
 			dest = l1_data_sram_zalloc(s->sh_size);
 			mod->arch.bss_a_l1 = dest;
 			if (dest == NULL) {
-				pr_err("L1 data memory allocation failed\n");
+				mod_err(mod, "L1 data memory allocation failed\n");
 				return -1;
 			}
 
@@ -77,7 +80,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
 			dest = l1_data_B_sram_alloc(s->sh_size);
 			mod->arch.data_b_l1 = dest;
 			if (dest == NULL) {
-				pr_err("L1 data memory allocation failed\n");
+				mod_err(mod, "L1 data memory allocation failed\n");
 				return -1;
 			}
 			memcpy(dest, (void *)s->sh_addr, s->sh_size);
@@ -87,7 +90,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
 			dest = l1_data_B_sram_alloc(s->sh_size);
 			mod->arch.bss_b_l1 = dest;
 			if (dest == NULL) {
-				pr_err("L1 data memory allocation failed\n");
+				mod_err(mod, "L1 data memory allocation failed\n");
 				return -1;
 			}
 			memset(dest, 0, s->sh_size);
@@ -99,7 +102,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
 			dest = l2_sram_alloc(s->sh_size);
 			mod->arch.text_l2 = dest;
 			if (dest == NULL) {
-				pr_err("L2 SRAM allocation failed\n");
+				mod_err(mod, "L2 SRAM allocation failed\n");
 				return -1;
 			}
 			memcpy(dest, (void *)s->sh_addr, s->sh_size);
@@ -111,7 +114,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
 			dest = l2_sram_alloc(s->sh_size);
 			mod->arch.data_l2 = dest;
 			if (dest == NULL) {
-				pr_err("L2 SRAM allocation failed\n");
+				mod_err(mod, "L2 SRAM allocation failed\n");
 				return -1;
 			}
 			memcpy(dest, (void *)s->sh_addr, s->sh_size);
@@ -123,7 +126,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
 			dest = l2_sram_zalloc(s->sh_size);
 			mod->arch.bss_l2 = dest;
 			if (dest == NULL) {
-				pr_err("L2 SRAM allocation failed\n");
+				mod_err(mod, "L2 SRAM allocation failed\n");
 				return -1;
 			}
 
@@ -157,8 +160,8 @@ apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
 	Elf32_Sym *sym;
 	unsigned long location, value, size;
 
-	pr_debug("applying relocate section %u to %u\n",
-		relsec, sechdrs[relsec].sh_info);
+	mod_debug(mod, "applying relocate section %u to %u\n",
+		  relsec, sechdrs[relsec].sh_info);
 
 	for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
 		/* This is where to make the change */
@@ -174,14 +177,14 @@ apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
 
 #ifdef CONFIG_SMP
 		if (location >= COREB_L1_DATA_A_START) {
-			pr_err("cannot relocate in L1: %u (SMP kernel)\n",
+			mod_err(mod, "cannot relocate in L1: %u (SMP kernel)\n",
 				ELF32_R_TYPE(rel[i].r_info));
 			return -ENOEXEC;
 		}
 #endif
 
-		pr_debug("location is %lx, value is %lx type is %d\n",
-			location, value, ELF32_R_TYPE(rel[i].r_info));
+		mod_debug(mod, "location is %lx, value is %lx type is %d\n",
+			  location, value, ELF32_R_TYPE(rel[i].r_info));
 
 		switch (ELF32_R_TYPE(rel[i].r_info)) {
 
@@ -200,12 +203,12 @@ apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
 		case R_BFIN_PCREL12_JUMP:
 		case R_BFIN_PCREL12_JUMP_S:
 		case R_BFIN_PCREL10:
-			pr_err("unsupported relocation: %u (no -mlong-calls?)\n",
+			mod_err(mod, "unsupported relocation: %u (no -mlong-calls?)\n",
 				ELF32_R_TYPE(rel[i].r_info));
 			return -ENOEXEC;
 
 		default:
-			pr_err("unknown relocation: %u\n",
+			mod_err(mod, "unknown relocation: %u\n",
 				ELF32_R_TYPE(rel[i].r_info));
 			return -ENOEXEC;
 		}
@@ -222,7 +225,7 @@ apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
 			isram_memcpy((void *)location, &value, size);
 			break;
 		default:
-			pr_err("invalid relocation for %#lx\n", location);
+			mod_err(mod, "invalid relocation for %#lx\n", location);
 			return -ENOEXEC;
 		}
 	}
-- 
2.5.2


* [PATCH tip/core/rcu 2/5] init_task: Remove redundant INIT_TASK_RCU_TREE_PREEMPT() macro
  2017-07-24 21:57 [PATCH tip/core/rcu 0/5] Related non-RCU updates Paul E. McKenney
  2017-07-24 21:58 ` [PATCH tip/core/rcu 1/5] module: Fix pr_fmt() bug for header use of printk Paul E. McKenney
@ 2017-07-24 21:58 ` Paul E. McKenney
  2017-07-24 21:58 ` [PATCH tip/core/rcu 3/5] EXPERIMENTAL sched: Allow migrating kthreads into online but inactive CPUs Paul E. McKenney
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 68+ messages in thread
From: Paul E. McKenney @ 2017-07-24 21:58 UTC (permalink / raw)
  To: linux-kernel
  Cc: mingo, jiangshanlai, dipankar, akpm, mathieu.desnoyers, josh,
	tglx, peterz, rostedt, dhowells, edumazet, fweisbec, oleg,
	Paul E. McKenney

Back in the dim distant past, the task_struct structure's RCU-related
fields optionally included those needed for CONFIG_RCU_BOOST, even in
CONFIG_PREEMPT_RCU builds.  The INIT_TASK_RCU_TREE_PREEMPT() macro was
used to provide initializers for those optional CONFIG_RCU_BOOST fields.
However, the CONFIG_RCU_BOOST fields are now included unconditionally
in CONFIG_PREEMPT_RCU builds, so there is no longer any need for the
INIT_TASK_RCU_TREE_PREEMPT() macro.  This commit therefore removes it
in favor of initializing the ->rcu_blocked_node field directly in the
INIT_TASK_RCU_PREEMPT() macro.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/init_task.h | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index a2f6707e9fc0..0e849715e5be 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -126,17 +126,11 @@ extern struct group_info init_groups;
 #endif
 
 #ifdef CONFIG_PREEMPT_RCU
-#define INIT_TASK_RCU_TREE_PREEMPT()					\
-	.rcu_blocked_node = NULL,
-#else
-#define INIT_TASK_RCU_TREE_PREEMPT(tsk)
-#endif
-#ifdef CONFIG_PREEMPT_RCU
 #define INIT_TASK_RCU_PREEMPT(tsk)					\
 	.rcu_read_lock_nesting = 0,					\
 	.rcu_read_unlock_special.s = 0,					\
 	.rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry),		\
-	INIT_TASK_RCU_TREE_PREEMPT()
+	.rcu_blocked_node = NULL,
 #else
 #define INIT_TASK_RCU_PREEMPT(tsk)
 #endif
-- 
2.5.2


* [PATCH tip/core/rcu 3/5] EXPERIMENTAL sched: Allow migrating kthreads into online but inactive CPUs
  2017-07-24 21:57 [PATCH tip/core/rcu 0/5] Related non-RCU updates Paul E. McKenney
  2017-07-24 21:58 ` [PATCH tip/core/rcu 1/5] module: Fix pr_fmt() bug for header use of printk Paul E. McKenney
  2017-07-24 21:58 ` [PATCH tip/core/rcu 2/5] init_task: Remove redundant INIT_TASK_RCU_TREE_PREEMPT() macro Paul E. McKenney
@ 2017-07-24 21:58 ` Paul E. McKenney
  2017-07-24 21:58 ` [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option Paul E. McKenney
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 68+ messages in thread
From: Paul E. McKenney @ 2017-07-24 21:58 UTC (permalink / raw)
  To: linux-kernel
  Cc: mingo, jiangshanlai, dipankar, akpm, mathieu.desnoyers, josh,
	tglx, peterz, rostedt, dhowells, edumazet, fweisbec, oleg,
	Tejun Heo, Paul E. McKenney

From: Tejun Heo <tj@kernel.org>

Per-cpu workqueues have been tripping CPU affinity sanity checks while
a CPU is being offlined.  A per-cpu kworker ends up running on a CPU
which isn't its target CPU while the CPU is online but inactive.

While the scheduler allows kthreads to wake up on an online but
inactive CPU, it doesn't allow a running kthread to be migrated to
such a CPU, which leads to an odd situation where setting affinity on
a sleeping kthread and on a running one yields different results.

Each mem-reclaim workqueue has one rescuer which guarantees forward
progress and the rescuer needs to bind itself to the CPU which needs
help in making forward progress; however, due to the above issue,
while set_cpus_allowed_ptr() succeeds, the rescuer doesn't end up on
the correct CPU if the CPU is in the process of going offline,
tripping the sanity check and executing the work item on the wrong
CPU.

This patch updates __migrate_task() so that kthreads can be migrated
into an inactive but online CPU.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Reported-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/sched/core.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 17c667b427b4..bfee6ea7db49 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -951,8 +951,13 @@ struct migration_arg {
 static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
 				 struct task_struct *p, int dest_cpu)
 {
-	if (unlikely(!cpu_active(dest_cpu)))
-		return rq;
+	if (p->flags & PF_KTHREAD) {
+		if (unlikely(!cpu_online(dest_cpu)))
+			return rq;
+	} else {
+		if (unlikely(!cpu_active(dest_cpu)))
+			return rq;
+	}
 
 	/* Affinity changed (again). */
 	if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
-- 
2.5.2


* [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-24 21:57 [PATCH tip/core/rcu 0/5] Related non-RCU updates Paul E. McKenney
                   ` (2 preceding siblings ...)
  2017-07-24 21:58 ` [PATCH tip/core/rcu 3/5] EXPERIMENTAL sched: Allow migrating kthreads into online but inactive CPUs Paul E. McKenney
@ 2017-07-24 21:58 ` Paul E. McKenney
  2017-07-25  4:27   ` Boqun Feng
                     ` (2 more replies)
  2017-07-24 21:58 ` [PATCH tip/core/rcu 5/5] EXP: sched/cputime: Fix using smp_processor_id() in preemptible Paul E. McKenney
  2017-07-31 22:51 ` [PATCH tip/core/rcu 0/5] Related non-RCU updates Paul E. McKenney
  5 siblings, 3 replies; 68+ messages in thread
From: Paul E. McKenney @ 2017-07-24 21:58 UTC (permalink / raw)
  To: linux-kernel
  Cc: mingo, jiangshanlai, dipankar, akpm, mathieu.desnoyers, josh,
	tglx, peterz, rostedt, dhowells, edumazet, fweisbec, oleg,
	Paul E. McKenney

The sys_membarrier() system call has proven too slow for some use
cases, which has prompted users to instead rely on TLB shootdown.
Although TLB shootdown is much faster, it has the slight disadvantage
of not working at all on arm and arm64.  This commit therefore adds
an expedited option to the sys_membarrier() system call.
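
For reference, a minimal user-space sketch of how a caller might invoke
the new command (assumptions: a libc that exposes __NR_membarrier, and
the command value proposed in this patch, which may change before the
ABI is finalized):

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Same value as this patch's (2 << 0); hypothetical until merged. */
#ifndef MEMBARRIER_CMD_SHARED_EXPEDITED
#define MEMBARRIER_CMD_SHARED_EXPEDITED (1 << 1)
#endif

int main(void)
{
	/* Ask the kernel for an expedited system-wide memory barrier. */
	if (syscall(__NR_membarrier, MEMBARRIER_CMD_SHARED_EXPEDITED, 0)) {
		perror("membarrier");
		return 1;
	}
	return 0;
}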

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/uapi/linux/membarrier.h | 11 +++++++++++
 kernel/membarrier.c             |  7 ++++++-
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h
index e0b108bd2624..ba36d8a6be61 100644
--- a/include/uapi/linux/membarrier.h
+++ b/include/uapi/linux/membarrier.h
@@ -40,6 +40,16 @@
  *                          (non-running threads are de facto in such a
  *                          state). This covers threads from all processes
  *                          running on the system. This command returns 0.
+ * @MEMBARRIER_CMD_SHARED_EXPEDITED:  Execute a memory barrier on all
+ *			    running threads, but in an expedited fashion.
+ *                          Upon return from system call, the caller thread
+ *                          is ensured that all running threads have passed
+ *                          through a state where all memory accesses to
+ *                          user-space addresses match program order between
+ *                          entry to and return from the system call
+ *                          (non-running threads are de facto in such a
+ *                          state). This covers threads from all processes
+ *                          running on the system. This command returns 0.
  *
  * Command to be passed to the membarrier system call. The commands need to
  * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to
@@ -48,6 +58,7 @@
 enum membarrier_cmd {
 	MEMBARRIER_CMD_QUERY = 0,
 	MEMBARRIER_CMD_SHARED = (1 << 0),
+	MEMBARRIER_CMD_SHARED_EXPEDITED = (2 << 0),
 };
 
 #endif /* _UAPI_LINUX_MEMBARRIER_H */
diff --git a/kernel/membarrier.c b/kernel/membarrier.c
index 9f9284f37f8d..b749c39bb219 100644
--- a/kernel/membarrier.c
+++ b/kernel/membarrier.c
@@ -22,7 +22,8 @@
  * Bitmask made from a "or" of all commands within enum membarrier_cmd,
  * except MEMBARRIER_CMD_QUERY.
  */
-#define MEMBARRIER_CMD_BITMASK	(MEMBARRIER_CMD_SHARED)
+#define MEMBARRIER_CMD_BITMASK	(MEMBARRIER_CMD_SHARED |		\
+				 MEMBARRIER_CMD_SHARED_EXPEDITED)
 
 /**
  * sys_membarrier - issue memory barriers on a set of threads
@@ -64,6 +65,10 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
 		if (num_online_cpus() > 1)
 			synchronize_sched();
 		return 0;
+	case MEMBARRIER_CMD_SHARED_EXPEDITED:
+		if (num_online_cpus() > 1)
+			synchronize_sched_expedited();
+		return 0;
 	default:
 		return -EINVAL;
 	}
-- 
2.5.2


* [PATCH tip/core/rcu 5/5] EXP: sched/cputime: Fix using smp_processor_id() in preemptible
  2017-07-24 21:57 [PATCH tip/core/rcu 0/5] Related non-RCU updates Paul E. McKenney
                   ` (3 preceding siblings ...)
  2017-07-24 21:58 ` [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option Paul E. McKenney
@ 2017-07-24 21:58 ` Paul E. McKenney
  2017-07-24 22:01   ` Wanpeng Li
  2017-07-31 22:51 ` [PATCH tip/core/rcu 0/5] Related non-RCU updates Paul E. McKenney
  5 siblings, 1 reply; 68+ messages in thread
From: Paul E. McKenney @ 2017-07-24 21:58 UTC (permalink / raw)
  To: linux-kernel
  Cc: mingo, jiangshanlai, dipankar, akpm, mathieu.desnoyers, josh,
	tglx, peterz, rostedt, dhowells, edumazet, fweisbec, oleg,
	Wanpeng Li, Luiz Capitulino, Rik van Riel, Paul E. McKenney

From: Wanpeng Li <wanpeng.li@hotmail.com>

 BUG: using smp_processor_id() in preemptible [00000000] code: 99-trinity/181
 caller is debug_smp_processor_id+0x17/0x19
 CPU: 0 PID: 181 Comm: 99-trinity Not tainted 4.12.0-01059-g2a42eb9 #1
 Call Trace:
  dump_stack+0x82/0xb8
  check_preemption_disabled+0xd1/0xe3
  debug_smp_processor_id+0x17/0x19
  vtime_delta+0xd/0x2c
  task_cputime+0x89/0xdb
  thread_group_cputime+0x11b/0x1ed
  thread_group_cputime_adjusted+0x1f/0x47
  wait_consider_task+0x2a9/0xaf9
  ? lock_acquire+0x97/0xa4
  do_wait+0xdf/0x1f4
  SYSC_wait4+0x8e/0xb5
  ? list_add+0x34/0x34
  SyS_wait4+0x9/0xb
  do_syscall_64+0x70/0x82
  entry_SYSCALL64_slow_path+0x25/0x25

As Frederic pointed out:

| Although those sched_clock_cpu() things seem to only matter when the
| sched_clock() is unstable. And that stability is a condition for nohz_full
| to work anyway. So probably sched_clock() alone would be enough.

This patch fixes it by replacing sched_clock_cpu() in vtime_delta() with
sched_clock(), which avoids calling smp_processor_id() in preemptible context.

Reported-by: Xiaolong Ye <xiaolong.ye@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Wanpeng Li <wanpeng.li@hotmail.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/sched/cputime.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 6e3ea4ac1bda..14d2dbf97c53 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -683,7 +683,7 @@ static u64 vtime_delta(struct vtime *vtime)
 {
 	unsigned long long clock;
 
-	clock = sched_clock_cpu(smp_processor_id());
+	clock = sched_clock();
 	if (clock < vtime->starttime)
 		return 0;
 
@@ -814,7 +814,7 @@ void arch_vtime_task_switch(struct task_struct *prev)
 
 	write_seqcount_begin(&vtime->seqcount);
 	vtime->state = VTIME_SYS;
-	vtime->starttime = sched_clock_cpu(smp_processor_id());
+	vtime->starttime = sched_clock();
 	write_seqcount_end(&vtime->seqcount);
 }
 
@@ -826,7 +826,7 @@ void vtime_init_idle(struct task_struct *t, int cpu)
 	local_irq_save(flags);
 	write_seqcount_begin(&vtime->seqcount);
 	vtime->state = VTIME_SYS;
-	vtime->starttime = sched_clock_cpu(cpu);
+	vtime->starttime = sched_clock();
 	write_seqcount_end(&vtime->seqcount);
 	local_irq_restore(flags);
 }
-- 
2.5.2


* Re: [PATCH tip/core/rcu 5/5] EXP: sched/cputime: Fix using smp_processor_id() in preemptible
  2017-07-24 21:58 ` [PATCH tip/core/rcu 5/5] EXP: sched/cputime: Fix using smp_processor_id() in preemptible Paul E. McKenney
@ 2017-07-24 22:01   ` Wanpeng Li
  2017-07-24 22:29     ` Paul E. McKenney
  0 siblings, 1 reply; 68+ messages in thread
From: Wanpeng Li @ 2017-07-24 22:01 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: linux-kernel, Ingo Molnar, Lai Jiangshan, dipankar,
	Andrew Morton, Mathieu Desnoyers, Josh Triplett, Thomas Gleixner,
	Peter Zijlstra, Steven Rostedt, dhowells, Eric Dumazet,
	Frederic Weisbecker, Oleg Nesterov, Wanpeng Li, Luiz Capitulino,
	Rik van Riel

2017-07-25 5:58 GMT+08:00 Paul E. McKenney <paulmck@linux.vnet.ibm.com>:
> From: Wanpeng Li <wanpeng.li@hotmail.com>
>
>  BUG: using smp_processor_id() in preemptible [00000000] code: 99-trinity/181

What's the meaning of EXP? Btw, the patch is in Linus's tree currently.

Regards,
Wanpeng Li


* Re: [PATCH tip/core/rcu 5/5] EXP: sched/cputime: Fix using smp_processor_id() in preemptible
  2017-07-24 22:01   ` Wanpeng Li
@ 2017-07-24 22:29     ` Paul E. McKenney
  0 siblings, 0 replies; 68+ messages in thread
From: Paul E. McKenney @ 2017-07-24 22:29 UTC (permalink / raw)
  To: Wanpeng Li
  Cc: linux-kernel, Ingo Molnar, Lai Jiangshan, dipankar,
	Andrew Morton, Mathieu Desnoyers, Josh Triplett, Thomas Gleixner,
	Peter Zijlstra, Steven Rostedt, dhowells, Eric Dumazet,
	Frederic Weisbecker, Oleg Nesterov, Wanpeng Li, Luiz Capitulino,
	Rik van Riel

On Tue, Jul 25, 2017 at 06:01:55AM +0800, Wanpeng Li wrote:
> 2017-07-25 5:58 GMT+08:00 Paul E. McKenney <paulmck@linux.vnet.ibm.com>:
> > From: Wanpeng Li <wanpeng.li@hotmail.com>
> >
> >  BUG: using smp_processor_id() in preemptible [00000000] code: 99-trinity/181
> 
> What's the meaning of EXP? Btw, the patch is in Linus's tree currently.

The meaning of "EXP" is that I am not sure whether I am pushing it.
Which, since it is already in Linus's tree, I am not.  I will drop it
when I rebase onto 4.13-rc2, but I will keep it until then given that
I get rcutorture failures without it.

							Thanx, Paul


* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-24 21:58 ` [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option Paul E. McKenney
@ 2017-07-25  4:27   ` Boqun Feng
  2017-07-25 16:24     ` Paul E. McKenney
  2017-07-25 13:21   ` Mathieu Desnoyers
  2017-07-25 16:33   ` Peter Zijlstra
  2 siblings, 1 reply; 68+ messages in thread
From: Boqun Feng @ 2017-07-25  4:27 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: linux-kernel, mingo, jiangshanlai, dipankar, akpm,
	mathieu.desnoyers, josh, tglx, peterz, rostedt, dhowells,
	edumazet, fweisbec, oleg

On Mon, Jul 24, 2017 at 02:58:16PM -0700, Paul E. McKenney wrote:
> The sys_membarrier() system call has proven too slow for some use
> cases, which has prompted users to instead rely on TLB shootdown.
> Although TLB shootdown is much faster, it has the slight disadvantage
> of not working at all on arm and arm64.  This commit therefore adds
> an expedited option to the sys_membarrier() system call.
> 
> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> ---
>  include/uapi/linux/membarrier.h | 11 +++++++++++
>  kernel/membarrier.c             |  7 ++++++-
>  2 files changed, 17 insertions(+), 1 deletion(-)
> 
> diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h
> index e0b108bd2624..ba36d8a6be61 100644
> --- a/include/uapi/linux/membarrier.h
> +++ b/include/uapi/linux/membarrier.h
> @@ -40,6 +40,16 @@
>   *                          (non-running threads are de facto in such a
>   *                          state). This covers threads from all processes
>   *                          running on the system. This command returns 0.
> + * @MEMBARRIER_CMD_SHARED_EXPEDITED:  Execute a memory barrier on all
> + *			    running threads, but in an expedited fashion.
> + *                          Upon return from system call, the caller thread
> + *                          is ensured that all running threads have passed
> + *                          through a state where all memory accesses to
> + *                          user-space addresses match program order between
> + *                          entry to and return from the system call
> + *                          (non-running threads are de facto in such a
> + *                          state). This covers threads from all processes
> + *                          running on the system. This command returns 0.
>   *
>   * Command to be passed to the membarrier system call. The commands need to
>   * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to
> @@ -48,6 +58,7 @@
>  enum membarrier_cmd {
>  	MEMBARRIER_CMD_QUERY = 0,
>  	MEMBARRIER_CMD_SHARED = (1 << 0),
> +	MEMBARRIER_CMD_SHARED_EXPEDITED = (2 << 0),

Should this better be "(1 << 1)" ;-)

Regards,
Boqun

>  };
>  
>  #endif /* _UAPI_LINUX_MEMBARRIER_H */
> diff --git a/kernel/membarrier.c b/kernel/membarrier.c
> index 9f9284f37f8d..b749c39bb219 100644
> --- a/kernel/membarrier.c
> +++ b/kernel/membarrier.c
> @@ -22,7 +22,8 @@
>   * Bitmask made from a "or" of all commands within enum membarrier_cmd,
>   * except MEMBARRIER_CMD_QUERY.
>   */
> -#define MEMBARRIER_CMD_BITMASK	(MEMBARRIER_CMD_SHARED)
> +#define MEMBARRIER_CMD_BITMASK	(MEMBARRIER_CMD_SHARED |		\
> +				 MEMBARRIER_CMD_SHARED_EXPEDITED)
>  
>  /**
>   * sys_membarrier - issue memory barriers on a set of threads
> @@ -64,6 +65,10 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
>  		if (num_online_cpus() > 1)
>  			synchronize_sched();
>  		return 0;
> +	case MEMBARRIER_CMD_SHARED_EXPEDITED:
> +		if (num_online_cpus() > 1)
> +			synchronize_sched_expedited();
> +		return 0;
>  	default:
>  		return -EINVAL;
>  	}
> -- 
> 2.5.2
> 


* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-24 21:58 ` [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option Paul E. McKenney
  2017-07-25  4:27   ` Boqun Feng
@ 2017-07-25 13:21   ` Mathieu Desnoyers
  2017-07-25 16:48     ` Paul E. McKenney
  2017-07-25 16:33   ` Peter Zijlstra
  2 siblings, 1 reply; 68+ messages in thread
From: Mathieu Desnoyers @ 2017-07-25 13:21 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: linux-kernel, Ingo Molnar, Lai Jiangshan, dipankar,
	Andrew Morton, Josh Triplett, Thomas Gleixner, Peter Zijlstra,
	rostedt, David Howells, Eric Dumazet, fweisbec, Oleg Nesterov

----- On Jul 24, 2017, at 5:58 PM, Paul E. McKenney paulmck@linux.vnet.ibm.com wrote:

> The sys_membarrier() system call has proven too slow for some use
> cases, which has prompted users to instead rely on TLB shootdown.
> Although TLB shootdown is much faster, it has the slight disadvantage
> of not working at all on arm and arm64.  This commit therefore adds
> an expedited option to the sys_membarrier() system call.

Is this now possible because the synchronize_sched_expedited()
implementation does not require sending IPIs to all CPUs? I
suspect that using tree srcu now solves this somehow, but can
you tell us a bit more about why it is now OK to expose this
to user-space?

The commit message here does not explain why it is OK real-time
wise to expose this feature as a system call.

Thanks,

Mathieu


> 
> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> ---
> include/uapi/linux/membarrier.h | 11 +++++++++++
> kernel/membarrier.c             |  7 ++++++-
> 2 files changed, 17 insertions(+), 1 deletion(-)
> 
> diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h
> index e0b108bd2624..ba36d8a6be61 100644
> --- a/include/uapi/linux/membarrier.h
> +++ b/include/uapi/linux/membarrier.h
> @@ -40,6 +40,16 @@
>  *                          (non-running threads are de facto in such a
>  *                          state). This covers threads from all processes
>  *                          running on the system. This command returns 0.
> + * @MEMBARRIER_CMD_SHARED_EXPEDITED:  Execute a memory barrier on all
> + *			    running threads, but in an expedited fashion.
> + *                          Upon return from system call, the caller thread
> + *                          is ensured that all running threads have passed
> + *                          through a state where all memory accesses to
> + *                          user-space addresses match program order between
> + *                          entry to and return from the system call
> + *                          (non-running threads are de facto in such a
> + *                          state). This covers threads from all processes
> + *                          running on the system. This command returns 0.
>  *
>  * Command to be passed to the membarrier system call. The commands need to
>  * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to
> @@ -48,6 +58,7 @@
> enum membarrier_cmd {
> 	MEMBARRIER_CMD_QUERY = 0,
> 	MEMBARRIER_CMD_SHARED = (1 << 0),
> +	MEMBARRIER_CMD_SHARED_EXPEDITED = (2 << 0),
> };
> 
> #endif /* _UAPI_LINUX_MEMBARRIER_H */
> diff --git a/kernel/membarrier.c b/kernel/membarrier.c
> index 9f9284f37f8d..b749c39bb219 100644
> --- a/kernel/membarrier.c
> +++ b/kernel/membarrier.c
> @@ -22,7 +22,8 @@
>  * Bitmask made from a "or" of all commands within enum membarrier_cmd,
>  * except MEMBARRIER_CMD_QUERY.
>  */
> -#define MEMBARRIER_CMD_BITMASK	(MEMBARRIER_CMD_SHARED)
> +#define MEMBARRIER_CMD_BITMASK	(MEMBARRIER_CMD_SHARED |		\
> +				 MEMBARRIER_CMD_SHARED_EXPEDITED)
> 
> /**
>  * sys_membarrier - issue memory barriers on a set of threads
> @@ -64,6 +65,10 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
> 		if (num_online_cpus() > 1)
> 			synchronize_sched();
> 		return 0;
> +	case MEMBARRIER_CMD_SHARED_EXPEDITED:
> +		if (num_online_cpus() > 1)
> +			synchronize_sched_expedited();
> +		return 0;
> 	default:
> 		return -EINVAL;
> 	}
> --
> 2.5.2

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com


* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-25  4:27   ` Boqun Feng
@ 2017-07-25 16:24     ` Paul E. McKenney
  0 siblings, 0 replies; 68+ messages in thread
From: Paul E. McKenney @ 2017-07-25 16:24 UTC (permalink / raw)
  To: Boqun Feng
  Cc: linux-kernel, mingo, jiangshanlai, dipankar, akpm,
	mathieu.desnoyers, josh, tglx, peterz, rostedt, dhowells,
	edumazet, fweisbec, oleg

On Tue, Jul 25, 2017 at 12:27:01PM +0800, Boqun Feng wrote:
> On Mon, Jul 24, 2017 at 02:58:16PM -0700, Paul E. McKenney wrote:
> > The sys_membarrier() system call has proven too slow for some use
> > cases, which has prompted users to instead rely on TLB shootdown.
> > Although TLB shootdown is much faster, it has the slight disadvantage
> > of not working at all on arm and arm64.  This commit therefore adds
> > an expedited option to the sys_membarrier() system call.
> > 
> > Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> > ---
> >  include/uapi/linux/membarrier.h | 11 +++++++++++
> >  kernel/membarrier.c             |  7 ++++++-
> >  2 files changed, 17 insertions(+), 1 deletion(-)
> > 
> > diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h
> > index e0b108bd2624..ba36d8a6be61 100644
> > --- a/include/uapi/linux/membarrier.h
> > +++ b/include/uapi/linux/membarrier.h
> > @@ -40,6 +40,16 @@
> >   *                          (non-running threads are de facto in such a
> >   *                          state). This covers threads from all processes
> >   *                          running on the system. This command returns 0.
> > + * @MEMBARRIER_CMD_SHARED_EXPEDITED:  Execute a memory barrier on all
> > + *			    running threads, but in an expedited fashion.
> > + *                          Upon return from system call, the caller thread
> > + *                          is ensured that all running threads have passed
> > + *                          through a state where all memory accesses to
> > + *                          user-space addresses match program order between
> > + *                          entry to and return from the system call
> > + *                          (non-running threads are de facto in such a
> > + *                          state). This covers threads from all processes
> > + *                          running on the system. This command returns 0.
> >   *
> >   * Command to be passed to the membarrier system call. The commands need to
> >   * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to
> > @@ -48,6 +58,7 @@
> >  enum membarrier_cmd {
> >  	MEMBARRIER_CMD_QUERY = 0,
> >  	MEMBARRIER_CMD_SHARED = (1 << 0),
> > +	MEMBARRIER_CMD_SHARED_EXPEDITED = (2 << 0),
> 
> Should this better be "(1 << 1)" ;-)

Same value, but yes, much more aligned with the intent.  Good catch,
thank you, fixed!

							Thanx, Paul

> Regards,
> Boqun
> 
> >  };
> >  
> >  #endif /* _UAPI_LINUX_MEMBARRIER_H */
> > diff --git a/kernel/membarrier.c b/kernel/membarrier.c
> > index 9f9284f37f8d..b749c39bb219 100644
> > --- a/kernel/membarrier.c
> > +++ b/kernel/membarrier.c
> > @@ -22,7 +22,8 @@
> >   * Bitmask made from a "or" of all commands within enum membarrier_cmd,
> >   * except MEMBARRIER_CMD_QUERY.
> >   */
> > -#define MEMBARRIER_CMD_BITMASK	(MEMBARRIER_CMD_SHARED)
> > +#define MEMBARRIER_CMD_BITMASK	(MEMBARRIER_CMD_SHARED |		\
> > +				 MEMBARRIER_CMD_SHARED_EXPEDITED)
> >  
> >  /**
> >   * sys_membarrier - issue memory barriers on a set of threads
> > @@ -64,6 +65,10 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
> >  		if (num_online_cpus() > 1)
> >  			synchronize_sched();
> >  		return 0;
> > +	case MEMBARRIER_CMD_SHARED_EXPEDITED:
> > +		if (num_online_cpus() > 1)
> > +			synchronize_sched_expedited();
> > +		return 0;
> >  	default:
> >  		return -EINVAL;
> >  	}
> > -- 
> > 2.5.2
> > 


* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-24 21:58 ` [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option Paul E. McKenney
  2017-07-25  4:27   ` Boqun Feng
  2017-07-25 13:21   ` Mathieu Desnoyers
@ 2017-07-25 16:33   ` Peter Zijlstra
  2017-07-25 16:49     ` Paul E. McKenney
  2 siblings, 1 reply; 68+ messages in thread
From: Peter Zijlstra @ 2017-07-25 16:33 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: linux-kernel, mingo, jiangshanlai, dipankar, akpm,
	mathieu.desnoyers, josh, tglx, rostedt, dhowells, edumazet,
	fweisbec, oleg

On Mon, Jul 24, 2017 at 02:58:16PM -0700, Paul E. McKenney wrote:
> The sys_membarrier() system call has proven too slow for some use
> cases, which has prompted users to instead rely on TLB shootdown.
> Although TLB shootdown is much faster, it has the slight disadvantage
> of not working at all on arm and arm64.  This commit therefore adds
> an expedited option to the sys_membarrier() system call.

> @@ -64,6 +65,10 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
>  		if (num_online_cpus() > 1)
>  			synchronize_sched();
>  		return 0;
> +	case MEMBARRIER_CMD_SHARED_EXPEDITED:
> +		if (num_online_cpus() > 1)
> +			synchronize_sched_expedited();
> +		return 0;

So you now give unprivileged userspace the means to IPI the entire
machine?

So what do we do when someone goes and does:

	for (;;)
		sys_membarrier(MEMBARRIER_CMD_SHARED_EXPEDITED, 0);

on us?


* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-25 13:21   ` Mathieu Desnoyers
@ 2017-07-25 16:48     ` Paul E. McKenney
  0 siblings, 0 replies; 68+ messages in thread
From: Paul E. McKenney @ 2017-07-25 16:48 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: linux-kernel, Ingo Molnar, Lai Jiangshan, dipankar,
	Andrew Morton, Josh Triplett, Thomas Gleixner, Peter Zijlstra,
	rostedt, David Howells, Eric Dumazet, fweisbec, Oleg Nesterov

On Tue, Jul 25, 2017 at 01:21:08PM +0000, Mathieu Desnoyers wrote:
> ----- On Jul 24, 2017, at 5:58 PM, Paul E. McKenney paulmck@linux.vnet.ibm.com wrote:
> 
> > The sys_membarrier() system call has proven too slow for some use
> > cases, which has prompted users to instead rely on TLB shootdown.
> > Although TLB shootdown is much faster, it has the slight disadvantage
> > of not working at all on arm and arm64.  This commit therefore adds
> > an expedited option to the sys_membarrier() system call.
> 
> Is this now possible because the synchronize_sched_expedited()
> implementation does not require sending IPIs to all CPUs? I
> suspect that using tree srcu now solves this somehow, but can
> you tell us a bit more about why it is now OK to expose this
> to user-space?

I have gotten complaints from several users that sys_membarrier() is too
slow to be useful for them.  So they are hacking around this problem by
unmapping a region of memory, thus getting the IPIs and memory barriers
on all CPUs, but with additional mm overhead.  Plus this is non-portable,
and fragile with respect to reasonable optimizations, as was discussed
on LKML some time back:

	https://marc.info/?l=linux-kernel&m=142619683526482

So we really need to make sys_membarrier() work for these users.
If we don't, we certainly will look quite silly criticizing their
practice of invoking TLB shootdown via unmapping, now won't we?

Now back in 2015, expedited grace periods were horribly slow, but
I have optimized them to the point that it should be no worse than
TLB shootdown IPIs.  Plus it is portable, and not subject to death
by optimization.

> The commit message here does not explain why it is OK real-time
> wise to expose this feature as a system call.

I figure that kernels providing that level of real-time response
will disable this, perhaps in a manner similar to that for NO_HZ_FULL.

Plus I intend to add your earlier IPI-all-threads-in-this-process
option, which will allow the people asking for this to do reasonable
testing.

Obviously, unless there are good test results and some level of user
enthusiasm, this patch goes nowhere.

Seem reasonable?

							Thanx, Paul

> Thanks,
> 
> Mathieu
> 
> 
> > 
> > Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> > ---
> > include/uapi/linux/membarrier.h | 11 +++++++++++
> > kernel/membarrier.c             |  7 ++++++-
> > 2 files changed, 17 insertions(+), 1 deletion(-)
> > 
> > diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h
> > index e0b108bd2624..ba36d8a6be61 100644
> > --- a/include/uapi/linux/membarrier.h
> > +++ b/include/uapi/linux/membarrier.h
> > @@ -40,6 +40,16 @@
> >  *                          (non-running threads are de facto in such a
> >  *                          state). This covers threads from all processes
> >  *                          running on the system. This command returns 0.
> > + * @MEMBARRIER_CMD_SHARED_EXPEDITED:  Execute a memory barrier on all
> > + *			    running threads, but in an expedited fashion.
> > + *                          Upon return from system call, the caller thread
> > + *                          is ensured that all running threads have passed
> > + *                          through a state where all memory accesses to
> > + *                          user-space addresses match program order between
> > + *                          entry to and return from the system call
> > + *                          (non-running threads are de facto in such a
> > + *                          state). This covers threads from all processes
> > + *                          running on the system. This command returns 0.
> >  *
> >  * Command to be passed to the membarrier system call. The commands need to
> >  * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to
> > @@ -48,6 +58,7 @@
> > enum membarrier_cmd {
> > 	MEMBARRIER_CMD_QUERY = 0,
> > 	MEMBARRIER_CMD_SHARED = (1 << 0),
> > +	MEMBARRIER_CMD_SHARED_EXPEDITED = (2 << 0),
> > };
> > 
> > #endif /* _UAPI_LINUX_MEMBARRIER_H */
> > diff --git a/kernel/membarrier.c b/kernel/membarrier.c
> > index 9f9284f37f8d..b749c39bb219 100644
> > --- a/kernel/membarrier.c
> > +++ b/kernel/membarrier.c
> > @@ -22,7 +22,8 @@
> >  * Bitmask made from a "or" of all commands within enum membarrier_cmd,
> >  * except MEMBARRIER_CMD_QUERY.
> >  */
> > -#define MEMBARRIER_CMD_BITMASK	(MEMBARRIER_CMD_SHARED)
> > +#define MEMBARRIER_CMD_BITMASK	(MEMBARRIER_CMD_SHARED |		\
> > +				 MEMBARRIER_CMD_SHARED_EXPEDITED)
> > 
> > /**
> >  * sys_membarrier - issue memory barriers on a set of threads
> > @@ -64,6 +65,10 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
> > 		if (num_online_cpus() > 1)
> > 			synchronize_sched();
> > 		return 0;
> > +	case MEMBARRIER_CMD_SHARED_EXPEDITED:
> > +		if (num_online_cpus() > 1)
> > +			synchronize_sched_expedited();
> > +		return 0;
> > 	default:
> > 		return -EINVAL;
> > 	}
> > --
> > 2.5.2
> 
> -- 
> Mathieu Desnoyers
> EfficiOS Inc.
> http://www.efficios.com
> 


* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-25 16:33   ` Peter Zijlstra
@ 2017-07-25 16:49     ` Paul E. McKenney
  2017-07-25 16:59       ` Peter Zijlstra
  0 siblings, 1 reply; 68+ messages in thread
From: Paul E. McKenney @ 2017-07-25 16:49 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, mingo, jiangshanlai, dipankar, akpm,
	mathieu.desnoyers, josh, tglx, rostedt, dhowells, edumazet,
	fweisbec, oleg

On Tue, Jul 25, 2017 at 06:33:18PM +0200, Peter Zijlstra wrote:
> On Mon, Jul 24, 2017 at 02:58:16PM -0700, Paul E. McKenney wrote:
> > The sys_membarrier() system call has proven too slow for some use
> > cases, which has prompted users to instead rely on TLB shootdown.
> > Although TLB shootdown is much faster, it has the slight disadvantage
> > of not working at all on arm and arm64.  This commit therefore adds
> > an expedited option to the sys_membarrier() system call.
> 
> > @@ -64,6 +65,10 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
> >  		if (num_online_cpus() > 1)
> >  			synchronize_sched();
> >  		return 0;
> > +	case MEMBARRIER_CMD_SHARED_EXPEDITED:
> > +		if (num_online_cpus() > 1)
> > +			synchronize_sched_expedited();
> > +		return 0;
> 
> So you now give unprivileged userspace the means to IPI the entire
> machine?
> 
> So what do we do when someone goes and does:
> 
> 	for (;;)
> 		sys_membarrier(MEMBARRIER_CMD_SHARED_EXPEDITED, 0);
> 
> on us?

The same thing that happens when they call munmap().

							Thanx, Paul


* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-25 16:49     ` Paul E. McKenney
@ 2017-07-25 16:59       ` Peter Zijlstra
  2017-07-25 17:17         ` Paul E. McKenney
  0 siblings, 1 reply; 68+ messages in thread
From: Peter Zijlstra @ 2017-07-25 16:59 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: linux-kernel, mingo, jiangshanlai, dipankar, akpm,
	mathieu.desnoyers, josh, tglx, rostedt, dhowells, edumazet,
	fweisbec, oleg

On Tue, Jul 25, 2017 at 09:49:00AM -0700, Paul E. McKenney wrote:
> On Tue, Jul 25, 2017 at 06:33:18PM +0200, Peter Zijlstra wrote:
> > On Mon, Jul 24, 2017 at 02:58:16PM -0700, Paul E. McKenney wrote:
> > > The sys_membarrier() system call has proven too slow for some use
> > > cases, which has prompted users to instead rely on TLB shootdown.
> > > Although TLB shootdown is much faster, it has the slight disadvantage
> > > of not working at all on arm and arm64.  This commit therefore adds
> > > an expedited option to the sys_membarrier() system call.
> > 
> > > @@ -64,6 +65,10 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
> > >  		if (num_online_cpus() > 1)
> > >  			synchronize_sched();
> > >  		return 0;
> > > +	case MEMBARRIER_CMD_SHARED_EXPEDITED:
> > > +		if (num_online_cpus() > 1)
> > > +			synchronize_sched_expedited();
> > > +		return 0;
> > 
> > So you now give unprivileged userspace the means to IPI the entire
> > machine?
> > 
> > So what do we do when someone goes and does:
> > 
> > 	for (;;)
> > 		sys_membarrier(MEMBARRIER_CMD_SHARED_EXPEDITED, 0);
> > 
> > on us?
> 
> The same thing that happens when they call munmap().

munmap() TLB invalidate is limited to those CPUs that actually ran
threads of their process, while this is machine wide.


* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-25 16:59       ` Peter Zijlstra
@ 2017-07-25 17:17         ` Paul E. McKenney
  2017-07-25 18:53           ` Peter Zijlstra
  0 siblings, 1 reply; 68+ messages in thread
From: Paul E. McKenney @ 2017-07-25 17:17 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, mingo, jiangshanlai, dipankar, akpm,
	mathieu.desnoyers, josh, tglx, rostedt, dhowells, edumazet,
	fweisbec, oleg

On Tue, Jul 25, 2017 at 06:59:57PM +0200, Peter Zijlstra wrote:
> On Tue, Jul 25, 2017 at 09:49:00AM -0700, Paul E. McKenney wrote:
> > On Tue, Jul 25, 2017 at 06:33:18PM +0200, Peter Zijlstra wrote:
> > > On Mon, Jul 24, 2017 at 02:58:16PM -0700, Paul E. McKenney wrote:
> > > > The sys_membarrier() system call has proven too slow for some use
> > > > cases, which has prompted users to instead rely on TLB shootdown.
> > > > Although TLB shootdown is much faster, it has the slight disadvantage
> > > > of not working at all on arm and arm64.  This commit therefore adds
> > > > an expedited option to the sys_membarrier() system call.
> > > 
> > > > @@ -64,6 +65,10 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
> > > >  		if (num_online_cpus() > 1)
> > > >  			synchronize_sched();
> > > >  		return 0;
> > > > +	case MEMBARRIER_CMD_SHARED_EXPEDITED:
> > > > +		if (num_online_cpus() > 1)
> > > > +			synchronize_sched_expedited();
> > > > +		return 0;
> > > 
> > > So you now give unprivileged userspace the means to IPI the entire
> > > machine?
> > > 
> > > So what do we do when someone goes and does:
> > > 
> > > 	for (;;)
> > > 		sys_membarrier(MEMBARRIER_CMD_SHARED_EXPEDITED, 0);
> > > 
> > > on us?
> > 
> > The same thing that happens when they call munmap().
> 
> munmap() TLB invalidate is limited to those CPUs that actually ran
> threads of their process, while this is machine wide.

Or those CPUs running threads of any process mapping the underlying file
or whatever.  And in either case, this can span the whole machine.  Plus
there are a number of other ways for users to do on-demand full-system
IPIs, including any number of ways to wake up large numbers of CPUs,
including from unrelated processes.

But I do plan to add another alternative that is limited to threads of
the running process.  I will be carrying both versions to enable those
who have been bugging me about this to do testing.

							Thanx, Paul


* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-25 17:17         ` Paul E. McKenney
@ 2017-07-25 18:53           ` Peter Zijlstra
  2017-07-25 19:36             ` Paul E. McKenney
  0 siblings, 1 reply; 68+ messages in thread
From: Peter Zijlstra @ 2017-07-25 18:53 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: linux-kernel, mingo, jiangshanlai, dipankar, akpm,
	mathieu.desnoyers, josh, tglx, rostedt, dhowells, edumazet,
	fweisbec, oleg

On Tue, Jul 25, 2017 at 10:17:01AM -0700, Paul E. McKenney wrote:

> > munmap() TLB invalidate is limited to those CPUs that actually ran
> > threads of their process, while this is machine wide.
> 
> Or those CPUs running threads of any process mapping the underlying file
> or whatever.

That doesn't sound right. munmap() of a shared file only invalidates
this process's map of it.

Swapping a file page otoh will indeed touch the union of cpumasks over
all processes mapping that page.

> And in either case, this can span the whole machine.  Plus
> there are a number of other ways for users to do on-demand full-system
> IPIs, including any number of ways to wake up large numbers of CPUs,
> including from unrelated processes.

Which are those? I thought we significantly reduced those with the nohz
full work. Most IPI uses now first check if a CPU actually needs the IPI
before sending it IIRC.

> But I do plan to add another alternative that is limited to threads of
> the running process.  I will be carrying both versions to enable those
> who have been bugging me about this to do testing.

Sending IPIs to mm_cpumask() might be better than expedited, but I'm
still hesitant. Just because people want it doesn't mean it's a good
idea. We need to weigh this against the potential for abuse.

People want userspace preempt disable, no matter how hard they want it,
they're not getting it because it's a completely crap idea.


* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-25 18:53           ` Peter Zijlstra
@ 2017-07-25 19:36             ` Paul E. McKenney
  2017-07-25 20:24               ` Peter Zijlstra
  2017-07-27 10:14               ` Peter Zijlstra
  0 siblings, 2 replies; 68+ messages in thread
From: Paul E. McKenney @ 2017-07-25 19:36 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, mingo, jiangshanlai, dipankar, akpm,
	mathieu.desnoyers, josh, tglx, rostedt, dhowells, edumazet,
	fweisbec, oleg

On Tue, Jul 25, 2017 at 08:53:20PM +0200, Peter Zijlstra wrote:
> On Tue, Jul 25, 2017 at 10:17:01AM -0700, Paul E. McKenney wrote:
> 
> > > munmap() TLB invalidate is limited to those CPUs that actually ran
> > > threads of their process, while this is machine wide.
> > 
> > Or those CPUs running threads of any process mapping the underlying file
> > or whatever.
> 
> That doesn't sound right. munmap() of a shared file only invalidates
> this process's map of it.
> 
> Swapping a file page otoh will indeed touch the union of cpumasks over
> all processes mapping that page.

There are a lot of variations, to be sure.  For whatever it is worth,
the original patch that started this uses mprotect():

https://github.com/msullivan/userspace-rcu/commit/04656b468d418efbc5d934ab07954eb8395a7ab0

> > And in either case, this can span the whole machine.  Plus
> > there are a number of other ways for users to do on-demand full-system
> > IPIs, including any number of ways to wake up large numbers of CPUs,
> > including from unrelated processes.
> 
> Which are those? I thought we significantly reduced those with the nohz
> full work. Most IPI uses now first check if a CPU actually needs the IPI
> before sending it IIRC.

If the task being awakened is higher priority than the task currently
running on a given CPU, that CPU still gets an IPI, right?  Or am I
completely confused?

> > But I do plan to add another alternative that is limited to threads of
> > the running process.  I will be carrying both versions to enable those
> > who have been bugging me about this to do testing.
> 
> Sending IPIs to mm_cpumask() might be better than expedited, but I'm
> still hesitant. Just because people want it doesn't mean it's a good
> idea. We need to weigh this against the potential for abuse.
> 
> People want userspace preempt disable, no matter how hard they want it,
> they're not getting it because it's a completely crap idea.

Unlike userspace preempt disable, in this case we get the abuse anyway
via existing mechanisms, as in they are already being abused.  If we
provide a mechanism for this purpose, we at least have the potential
for handling the abuse, for example:

o	"Defanging" sys_membarrier() on systems that are sensitive to
	latency.  For example, this patch can be defanged by booting
	with the rcupdate.rcu_normal=1 kernel boot parameter, which
	causes requests for expedited grace periods to instead use
	normal grace periods.

o	Detecting and responding to abuse.  For example, perhaps if there
	are more than (say) 50 expedited sys_membarrier()s within a given
	jiffy, the excess sys_membarrier()s are non-expedited.

o	Batching optimizations allow large number of concurrent requests
	to be handled with fewer grace periods -- and both normal and
	expedited grace periods already do exactly this.

This horse is already out, so trying to shut the gate won't be effective.

							Thanx, Paul


* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-25 19:36             ` Paul E. McKenney
@ 2017-07-25 20:24               ` Peter Zijlstra
  2017-07-25 21:19                 ` Paul E. McKenney
  2017-07-27 10:14               ` Peter Zijlstra
  1 sibling, 1 reply; 68+ messages in thread
From: Peter Zijlstra @ 2017-07-25 20:24 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: linux-kernel, mingo, jiangshanlai, dipankar, akpm,
	mathieu.desnoyers, josh, tglx, rostedt, dhowells, edumazet,
	fweisbec, oleg

On Tue, Jul 25, 2017 at 12:36:12PM -0700, Paul E. McKenney wrote:

> There are a lot of variations, to be sure.  For whatever it is worth,
> the original patch that started this uses mprotect():
> 
> https://github.com/msullivan/userspace-rcu/commit/04656b468d418efbc5d934ab07954eb8395a7ab0

FWIW that will not work on s390 (and maybe others); they don't in fact
require IPIs for remote TLB invalidation.

> > Which are those? I thought we significantly reduced those with the nohz
> > full work. Most IPI uses now first check if a CPU actually needs the IPI
> > before sending it IIRC.
> 
> If the task being awakened is higher priority than the task currently
> running on a given CPU, that CPU still gets an IPI, right?  Or am I
> completely confused?

I was thinking of things like on_each_cpu_cond().

> Unlike userspace preempt disable, in this case we get the abuse anyway
> via existing mechanisms, as in they are already being abused.  If we
> provide a mechanism for this purpose, we at least have the potential
> for handling the abuse, for example:
> 
> o	"Defanging" sys_membarrier() on systems that are sensitive to
> 	latency.  For example, this patch can be defanged by booting
> 	with the rcupdate.rcu_normal=1 kernel boot parameter, which
> 	causes requests for expedited grace periods to instead use
> 	normal grace periods.
> 
> o	Detecting and responding to abuse.  For example, perhaps if there
> 	are more than (say) 50 expedited sys_membarrier()s within a given
> 	jiffy, the excess sys_membarrier()s are non-expedited.
> 
> o	Batching optimizations allow large number of concurrent requests
> 	to be handled with fewer grace periods -- and both normal and
> 	expedited grace periods already do exactly this.
> 
> This horse is already out, so trying to shut the gate won't be effective.

Well, I'm not sure there is an easy means of doing machine wide IPIs for
!root out there. This would be a first.

Something along the lines of:

void dummy(void *arg)
{
	/* IPIs are assumed to be serializing */
}

void ipi_mm(struct mm_struct *mm)
{
	cpumask_var_t cpus;
	int cpu;

	zalloc_cpumask_var(&cpus, GFP_KERNEL);

	for_each_cpu(cpu, mm_cpumask(mm)) {
		struct task_struct *p;

		/*
		 * If the current task of @cpu isn't of this @mm, then
		 * it needs a context switch to become one, which will
		 * provide the ordering we require.
		 */
		rcu_read_lock();
		p = task_rcu_dereference(&cpu_curr(cpu));
		if (p && p->mm == mm)
			__cpumask_set_cpu(cpu, cpus);
		rcu_read_unlock();
	}

	on_each_cpu_mask(cpus, dummy, NULL, 1);
}

Would appear to be minimally invasive and only shoot at CPUs we're
currently running our process on, which greatly reduces the impact.


* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-25 20:24               ` Peter Zijlstra
@ 2017-07-25 21:19                 ` Paul E. McKenney
  2017-07-25 21:55                   ` Peter Zijlstra
  2017-07-26  9:36                   ` Will Deacon
  0 siblings, 2 replies; 68+ messages in thread
From: Paul E. McKenney @ 2017-07-25 21:19 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, mingo, jiangshanlai, dipankar, akpm,
	mathieu.desnoyers, josh, tglx, rostedt, dhowells, edumazet,
	fweisbec, oleg, will.deacon

On Tue, Jul 25, 2017 at 10:24:51PM +0200, Peter Zijlstra wrote:
> On Tue, Jul 25, 2017 at 12:36:12PM -0700, Paul E. McKenney wrote:
> 
> > There are a lot of variations, to be sure.  For whatever it is worth,
> > the original patch that started this uses mprotect():
> > 
> > https://github.com/msullivan/userspace-rcu/commit/04656b468d418efbc5d934ab07954eb8395a7ab0
> 
> FWIW that will not work on s390 (and maybe others), they don't in fact
> require IPIs for remote TLB invalidation.

Nor will it for ARM.  Nor (I think) for PowerPC.  But that is in fact
what people are doing right now in real life.  Hence my renewed interest
in sys_membarrier().

> > > Which are those? I thought we significantly reduced those with the nohz
> > > full work. Most IPI uses now first check if a CPU actually needs the IPI
> > > before sending it IIRC.
> > 
> > If the task being awakened is higher priority than the task currently
> > running on a given CPU, that CPU still gets an IPI, right?  Or am I
> > completely confused?
> 
> I was thinking of things like on_each_cpu_cond().

Fair enough.

But it would not be hard for userspace code to force IPIs by repeatedly
awakening higher-priority threads that sleep immediately after being
awakened, right?

> > Unlike userspace preempt disable, in this case we get the abuse anyway
> > via existing mechanisms, as in they are already being abused.  If we
> > provide a mechanism for this purpose, we at least have the potential
> > for handling the abuse, for example:
> > 
> > o	"Defanging" sys_membarrier() on systems that are sensitive to
> > 	latency.  For example, this patch can be defanged by booting
> > 	with the rcupdate.rcu_normal=1 kernel boot parameter, which
> > 	causes requests for expedited grace periods to instead use
> > 	normal grace periods.
> > 
> > o	Detecting and responding to abuse.  For example, perhaps if there
> > 	are more than (say) 50 expedited sys_membarrier()s within a given
> > 	jiffy, the excess sys_membarrier()s are non-expedited.
> > 
> > o	Batching optimizations allow large number of concurrent requests
> > 	to be handled with fewer grace periods -- and both normal and
> > 	expedited grace periods already do exactly this.
> > 
> > This horse is already out, so trying to shut the gate won't be effective.
> 
> Well, I'm not sure there is an easy means of doing machine wide IPIs for
> !root out there. This would be a first.
> 
> Something along the lines of:
> 
> void dummy(void *arg)
> {
> 	/* IPIs are assumed to be serializing */
> }
> 
> void ipi_mm(struct mm_struct *mm)
> {
> 	cpumask_var_t cpus;
> 	int cpu;
> 
> 	zalloc_cpumask_var(&cpus, GFP_KERNEL);
> 
> 	for_each_cpu(cpu, mm_cpumask(mm)) {
> 		struct task_struct *p;
> 
> 		/*
> 		 * If the current task of @cpu isn't of this @mm, then
> 		 * it needs a context switch to become one, which will
> 		 * provide the ordering we require.
> 		 */
> 		rcu_read_lock();
> 		p = task_rcu_dereference(&cpu_curr(cpu));
> 		if (p && p->mm == mm)
> 			__cpumask_set_cpu(cpu, cpus);
> 		rcu_read_unlock();
> 	}
> 
> 	on_each_cpu_mask(cpus, dummy, NULL, 1);
> }
> 
> Would appear to be minimally invasive and only shoot at CPUs we're
> currently running our process on, which greatly reduces the impact.

I am good with this approach as well, and I do very much like that it
avoids IPIing CPUs that aren't running our process (at least in the
common case).  But don't we also need added memory ordering?  It is
sort of OK to IPI a CPU that just now switched away from our process,
but not so good to miss IPIing a CPU that switched to our process just
a little before sys_membarrier().

I was intending to base this on the last few versions of a 2010 patch,
but maybe things have changed:

	https://marc.info/?l=linux-kernel&m=126358017229620&w=2
	https://marc.info/?l=linux-kernel&m=126436996014016&w=2
	https://marc.info/?l=linux-kernel&m=126601479802978&w=2
	https://marc.info/?l=linux-kernel&m=126970692903302&w=2

Discussion here:

	https://marc.info/?l=linux-kernel&m=126349766324224&w=2

The discussion led to acquiring the runqueue locks, as there was
otherwise a need to add code to the scheduler fastpaths.

There was a desire to make this work automatically among multiple
processes sharing some memory, but I believe that in this case
the user is going to have to track the multiple processes anyway,
and so can simply do sys_membarrier from each:

	https://marc.info/?l=linux-arch&m=126686393820832&w=2
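
For what it is worth, the per-process call is tiny.  A minimal userspace
sketch using the existing MEMBARRIER_CMD_SHARED command (the expedited
command name is still being settled in this thread) might look like:

#include <linux/membarrier.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

static int membarrier(int cmd, int flags)
{
	return syscall(__NR_membarrier, cmd, flags);
}

/* Each process sharing the data structure issues its own barrier. */
static int shared_membarrier(void)
{
	if (membarrier(MEMBARRIER_CMD_SHARED, 0) < 0) {
		perror("membarrier");
		return -1;
	}
	return 0;
}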

Some architectures are less precise than others in tracking which
CPUs are running a given process due to ASIDs, though this is
thought to be a non-problem:

	https://marc.info/?l=linux-arch&m=126716090413065&w=2
	https://marc.info/?l=linux-arch&m=126716262815202&w=2

Thoughts?

							Thanx, Paul

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-25 21:19                 ` Paul E. McKenney
@ 2017-07-25 21:55                   ` Peter Zijlstra
  2017-07-25 22:39                     ` Mathieu Desnoyers
                                       ` (2 more replies)
  2017-07-26  9:36                   ` Will Deacon
  1 sibling, 3 replies; 68+ messages in thread
From: Peter Zijlstra @ 2017-07-25 21:55 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: linux-kernel, mingo, jiangshanlai, dipankar, akpm,
	mathieu.desnoyers, josh, tglx, rostedt, dhowells, edumazet,
	fweisbec, oleg, will.deacon

On Tue, Jul 25, 2017 at 02:19:26PM -0700, Paul E. McKenney wrote:
> On Tue, Jul 25, 2017 at 10:24:51PM +0200, Peter Zijlstra wrote:
> > On Tue, Jul 25, 2017 at 12:36:12PM -0700, Paul E. McKenney wrote:
> > 
> > > There are a lot of variations, to be sure.  For whatever it is worth,
> > > the original patch that started this uses mprotect():
> > > 
> > > https://github.com/msullivan/userspace-rcu/commit/04656b468d418efbc5d934ab07954eb8395a7ab0
> > 
> > FWIW that will not work on s390 (and maybe others), they don't in fact
> > require IPIs for remote TLB invalidation.
> 
> Nor will it for ARM.  Nor (I think) for PowerPC.  But that is in fact
> what people are doing right now in real life.  Hence my renewed interest
> in sys_membarrier().

People always do crazy stuff, but what surprised me is that such a patch
got merged in urcu even though it's known broken for a number of
architectures.

> But it would not be hard for userspace code to force IPIs by repeatedly
> awakening higher-priority threads that sleep immediately after being
> awakened, right?

RT tasks are not readily available to !root, and the user might have
been constrained to a subset of available CPUs.

> > Well, I'm not sure there is an easy means of doing machine wide IPIs for
> > !root out there. This would be a first.
> > 
> > Something along the lines of:
> > 
> > void dummy(void *arg)
> > {
> > 	/* IPIs are assumed to be serializing */
> > }
> > 
> > void ipi_mm(struct mm_struct *mm)
> > {
> > 	cpumask_var_t cpus;
> > 	int cpu;
> > 
> > 	zalloc_cpumask_var(&cpus, GFP_KERNEL);
> > 
> > 	for_each_cpu(cpu, mm_cpumask(mm)) {
> > 		struct task_struct *p;
> > 
> > 		/*
> > 		 * If the current task of @cpu isn't of this @mm, then
> > 		 * it needs a context switch to become one, which will
> > 		 * provide the ordering we require.
> > 		 */
> > 		rcu_read_lock();
> > 		p = task_rcu_dereference(&cpu_curr(cpu));
> > 		if (p && p->mm == mm)
> > 			__cpumask_set_cpu(cpu, cpus);
> > 		rcu_read_unlock();
> > 	}
> > 
> > 	on_each_cpu_mask(cpus, dummy, NULL, 1);
> > }
> > 
> > Would appear to be minimally invasive and only shoot at CPUs we're
> > currently running our process on, which greatly reduces the impact.
> 
> I am good with this approach as well, and I do very much like that it
> avoids IPIing CPUs that aren't running our process (at least in the
> common case).  But don't we also need added memory ordering?  It is
> sort of OK to IPI a CPU that just now switched away from our process,
> but not so good to miss IPIing a CPU that switched to our process just
> a little before sys_membarrier().

My thinking was that if we observe '!= mm' that CPU will have to do a
context switch in order to make it true. That context switch will
provide the ordering we're after so all is well.

Quite possible there's a hole in it, but since I'm running on fumes someone
needs to spell it out for me :-)

> I was intending to base this on the last few versions of a 2010 patch,
> but maybe things have changed:
> 
> 	https://marc.info/?l=linux-kernel&m=126358017229620&w=2
> 	https://marc.info/?l=linux-kernel&m=126436996014016&w=2
> 	https://marc.info/?l=linux-kernel&m=126601479802978&w=2
> 	https://marc.info/?l=linux-kernel&m=126970692903302&w=2
> 
> Discussion here:
> 
> 	https://marc.info/?l=linux-kernel&m=126349766324224&w=2
> 
> The discussion led to acquiring the runqueue locks, as there was
> otherwise a need to add code to the scheduler fastpaths.

TL;DR..  that's far too much to trawl through.

> Some architectures are less precise than others in tracking which
> CPUs are running a given process due to ASIDs, though this is
> thought to be a non-problem:
> 
> 	https://marc.info/?l=linux-arch&m=126716090413065&w=2
> 	https://marc.info/?l=linux-arch&m=126716262815202&w=2
> 
> Thoughts?

Yes, there are architectures that only accumulate bits in mm_cpumask(),
but with the additional check to see if the remote task belongs to our MM
this should be a non-issue.

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-25 21:55                   ` Peter Zijlstra
@ 2017-07-25 22:39                     ` Mathieu Desnoyers
  2017-07-25 22:50                     ` Mathieu Desnoyers
  2017-07-25 23:59                     ` Paul E. McKenney
  2 siblings, 0 replies; 68+ messages in thread
From: Mathieu Desnoyers @ 2017-07-25 22:39 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Paul E. McKenney, linux-kernel, Ingo Molnar, Lai Jiangshan,
	dipankar, Andrew Morton, Josh Triplett, Thomas Gleixner, rostedt,
	David Howells, Eric Dumazet, fweisbec, Oleg Nesterov,
	Will Deacon

----- On Jul 25, 2017, at 5:55 PM, Peter Zijlstra peterz@infradead.org wrote:

> On Tue, Jul 25, 2017 at 02:19:26PM -0700, Paul E. McKenney wrote:
>> On Tue, Jul 25, 2017 at 10:24:51PM +0200, Peter Zijlstra wrote:
>> > On Tue, Jul 25, 2017 at 12:36:12PM -0700, Paul E. McKenney wrote:
>> > 
>> > > There are a lot of variations, to be sure.  For whatever it is worth,
>> > > the original patch that started this uses mprotect():
>> > > 
>> > > https://github.com/msullivan/userspace-rcu/commit/04656b468d418efbc5d934ab07954eb8395a7ab0
>> > 
>> > FWIW that will not work on s390 (and maybe others), they don't in fact
>> > require IPIs for remote TLB invalidation.
>> 
>> Nor will it for ARM.  Nor (I think) for PowerPC.  But that is in fact
>> what people are doing right now in real life.  Hence my renewed interest
>> in sys_membarrier().
> 
> People always do crazy stuff, but what surprised me is that such a patch
> got merged in urcu even though it's known broken for a number of
> architectures.

As maintainer of liburcu, I can certainly say that this patch never made it into
liburcu master branch (official repo at git://git.liburcu.org/userspace-rcu.git).
Paul is referring to a liburcu fork by a github user "msullivan", not the official
tree.

Thanks,

Mathieu


-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-25 21:55                   ` Peter Zijlstra
  2017-07-25 22:39                     ` Mathieu Desnoyers
@ 2017-07-25 22:50                     ` Mathieu Desnoyers
  2017-07-26  0:01                       ` Paul E. McKenney
  2017-07-26  7:46                       ` Peter Zijlstra
  2017-07-25 23:59                     ` Paul E. McKenney
  2 siblings, 2 replies; 68+ messages in thread
From: Mathieu Desnoyers @ 2017-07-25 22:50 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Paul E. McKenney, linux-kernel, Ingo Molnar, Lai Jiangshan,
	dipankar, Andrew Morton, Josh Triplett, Thomas Gleixner, rostedt,
	David Howells, Eric Dumazet, fweisbec, Oleg Nesterov,
	Will Deacon

----- On Jul 25, 2017, at 5:55 PM, Peter Zijlstra peterz@infradead.org wrote:

> On Tue, Jul 25, 2017 at 02:19:26PM -0700, Paul E. McKenney wrote:
>> On Tue, Jul 25, 2017 at 10:24:51PM +0200, Peter Zijlstra wrote:
>> > On Tue, Jul 25, 2017 at 12:36:12PM -0700, Paul E. McKenney wrote:
[...]
> 
>> But it would not be hard for userspace code to force IPIs by repeatedly
>> awakening higher-priority threads that sleep immediately after being
>> awakened, right?
> 
> RT tasks are not readily available to !root, and the user might have
> been constrained to a subset of available CPUs.
> 
>> > Well, I'm not sure there is an easy means of doing machine wide IPIs for
>> > !root out there. This would be a first.
>> > 
>> > Something along the lines of:
>> > 
>> > void dummy(void *arg)
>> > {
>> > 	/* IPIs are assumed to be serializing */
>> > }
>> > 
>> > void ipi_mm(struct mm_struct *mm)
>> > {
>> > 	cpumask_var_t cpus;
>> > 	int cpu;
>> > 
>> > 	zalloc_cpumask_var(&cpus, GFP_KERNEL);
>> > 
>> > 	for_each_cpu(cpu, mm_cpumask(mm)) {
>> > 		struct task_struct *p;
>> > 
>> > 		/*
>> > 		 * If the current task of @cpu isn't of this @mm, then
>> > 		 * it needs a context switch to become one, which will
>> > 		 * provide the ordering we require.
>> > 		 */
>> > 		rcu_read_lock();
>> > 		p = task_rcu_dereference(&cpu_curr(cpu));
>> > 		if (p && p->mm == mm)
>> > 			__cpumask_set_cpu(cpu, cpus);
>> > 		rcu_read_unlock();
>> > 	}
>> > 
>> > 	on_each_cpu_mask(cpus, dummy, NULL, 1);
>> > }
>> > 
>> > Would appear to be minimally invasive and only shoot at CPUs we're
>> > currently running our process on, which greatly reduces the impact.
>> 
>> I am good with this approach as well, and I do very much like that it
>> avoids IPIing CPUs that aren't running our process (at least in the
>> common case).  But don't we also need added memory ordering?  It is
>> sort of OK to IPI a CPU that just now switched away from our process,
>> but not so good to miss IPIing a CPU that switched to our process just
>> a little before sys_membarrier().
> 
> My thinking was that if we observe '!= mm' that CPU will have to do a
> context switch in order to make it true. That context switch will
> provide the ordering we're after so all is well.
> 
> Quite possible there's a hole in it, but since I'm running on fumes someone
> needs to spell it out for me :-)
> 
>> I was intending to base this on the last few versions of a 2010 patch,
>> but maybe things have changed:
>> 
>> 	https://marc.info/?l=linux-kernel&m=126358017229620&w=2
>> 	https://marc.info/?l=linux-kernel&m=126436996014016&w=2
>> 	https://marc.info/?l=linux-kernel&m=126601479802978&w=2
>> 	https://marc.info/?l=linux-kernel&m=126970692903302&w=2
>> 
>> Discussion here:
>> 
>> 	https://marc.info/?l=linux-kernel&m=126349766324224&w=2
>> 
>> The discussion led to acquiring the runqueue locks, as there was
>> otherwise a need to add code to the scheduler fastpaths.
> 
> TL;DR..  that's far too much to trawl through.
> 
>> Some architectures are less precise than others in tracking which
>> CPUs are running a given process due to ASIDs, though this is
>> thought to be a non-problem:
>> 
>> 	https://marc.info/?l=linux-arch&m=126716090413065&w=2
>> 	https://marc.info/?l=linux-arch&m=126716262815202&w=2
>> 
>> Thoughts?
> 
> Yes, there are architectures that only accumulate bits in mm_cpumask(),
> but with the additional check to see if the remote task belongs to our MM
> this should be a non-issue.

This would implement a MEMBARRIER_CMD_PRIVATE_EXPEDITED (or such) flag
for expedited process-local effect. This differs from the "SHARED" flag,
since the SHARED flag affects threads accessing memory mappings shared
across processes as well.

I wonder if we could create a MEMBARRIER_CMD_SHARED_EXPEDITED behavior
by iterating on all memory mappings mapped into the current process,
and build a cpumask based on the union of all mm masks encountered ?
Then we could send the IPI to all cpus belonging to that cpumask. Or
am I missing something obvious ?
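
Roughly, the union-building step being described might look like the
sketch below; the helper name is made up, error handling is omitted,
and only the obvious locking is shown:

static void membarrier_shared_expedited_mask(struct cpumask *cpus)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;

	cpumask_copy(cpus, mm_cpumask(mm));	/* start from our own mm */

	down_read(&mm->mmap_sem);
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		struct address_space *mapping;
		struct vm_area_struct *other;

		if (!(vma->vm_flags & VM_SHARED) || !vma->vm_file)
			continue;
		mapping = vma->vm_file->f_mapping;

		/* every other mm mapping this file contributes its mask */
		i_mmap_lock_read(mapping);
		vma_interval_tree_foreach(other, &mapping->i_mmap,
					  0, ULONG_MAX)
			cpumask_or(cpus, cpus, mm_cpumask(other->vm_mm));
		i_mmap_unlock_read(mapping);
	}
	up_read(&mm->mmap_sem);
}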

Thanks,

Mathieu


-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-25 21:55                   ` Peter Zijlstra
  2017-07-25 22:39                     ` Mathieu Desnoyers
  2017-07-25 22:50                     ` Mathieu Desnoyers
@ 2017-07-25 23:59                     ` Paul E. McKenney
  2017-07-26  7:41                       ` Peter Zijlstra
  2 siblings, 1 reply; 68+ messages in thread
From: Paul E. McKenney @ 2017-07-25 23:59 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, mingo, jiangshanlai, dipankar, akpm,
	mathieu.desnoyers, josh, tglx, rostedt, dhowells, edumazet,
	fweisbec, oleg, will.deacon

On Tue, Jul 25, 2017 at 11:55:10PM +0200, Peter Zijlstra wrote:
> On Tue, Jul 25, 2017 at 02:19:26PM -0700, Paul E. McKenney wrote:
> > On Tue, Jul 25, 2017 at 10:24:51PM +0200, Peter Zijlstra wrote:
> > > On Tue, Jul 25, 2017 at 12:36:12PM -0700, Paul E. McKenney wrote:
> > > 
> > > > There are a lot of variations, to be sure.  For whatever it is worth,
> > > > the original patch that started this uses mprotect():
> > > > 
> > > > https://github.com/msullivan/userspace-rcu/commit/04656b468d418efbc5d934ab07954eb8395a7ab0
> > > 
> > > FWIW that will not work on s390 (and maybe others), they don't in fact
> > > require IPIs for remote TLB invalidation.
> > 
> > Nor will it for ARM.  Nor (I think) for PowerPC.  But that is in fact
> > what people are doing right now in real life.  Hence my renewed interest
> > in sys_membarrier().
> 
> People always do crazy stuff, but what surprised me is that such a patch
> got merged in urcu even though it's known broken for a number of
> architectures.

It did not get merged into urcu.  It is instead used directly by a
number of people for a number of concurrent algorithms.

> > But it would not be hard for userspace code to force IPIs by repeatedly
> > awakening higher-priority threads that sleep immediately after being
> > awakened, right?
> 
> RT tasks are not readily available to !root, and the user might have
> been constrained to a subset of available CPUs.

So non-idle non-nohz CPUs never get IPIed for wakeups of SCHED_OTHER
threads?

> > > Well, I'm not sure there is an easy means of doing machine wide IPIs for
> > > !root out there. This would be a first.
> > > 
> > > Something along the lines of:
> > > 
> > > void dummy(void *arg)
> > > {
> > > 	/* IPIs are assumed to be serializing */
> > > }
> > > 
> > > void ipi_mm(struct mm_struct *mm)
> > > {
> > > 	cpumask_var_t cpus;
> > > 	int cpu;
> > > 
> > > 	zalloc_cpumask_var(&cpus, GFP_KERNEL);
> > > 
> > > 	for_each_cpu(cpu, mm_cpumask(mm)) {
> > > 		struct task_struct *p;
> > > 
> > > 		/*
> > > 		 * If the current task of @cpu isn't of this @mm, then
> > > 		 * it needs a context switch to become one, which will
> > > 		 * provide the ordering we require.
> > > 		 */
> > > 		rcu_read_lock();
> > > 		p = task_rcu_dereference(&cpu_curr(cpu));
> > > 		if (p && p->mm == mm)
> > > 			__cpumask_set_cpu(cpu, cpus);
> > > 		rcu_read_unlock();
> > > 	}
> > > 
> > > 	on_each_cpu_mask(cpus, dummy, NULL, 1);
> > > }
> > > 
> > > Would appear to be minimally invasive and only shoot at CPUs we're
> > > currently running our process on, which greatly reduces the impact.
> > 
> > I am good with this approach as well, and I do very much like that it
> > avoids IPIing CPUs that aren't running our process (at least in the
> > common case).  But don't we also need added memory ordering?  It is
> > sort of OK to IPI a CPU that just now switched away from our process,
> > but not so good to miss IPIing a CPU that switched to our process just
> > a little before sys_membarrier().
> 
> My thinking was that if we observe '!= mm' that CPU will have to do a
> context switch in order to make it true. That context switch will
> provide the ordering we're after so all is well.
> 
> Quite possible there's a hole in it, but since I'm running on fumes someone
> needs to spell it out for me :-)

This would be the https://marc.info/?l=linux-kernel&m=126349766324224&w=2
URL below.

Which might or might not still be applicable.

> > I was intending to base this on the last few versions of a 2010 patch,
> > but maybe things have changed:
> > 
> > 	https://marc.info/?l=linux-kernel&m=126358017229620&w=2
> > 	https://marc.info/?l=linux-kernel&m=126436996014016&w=2
> > 	https://marc.info/?l=linux-kernel&m=126601479802978&w=2
> > 	https://marc.info/?l=linux-kernel&m=126970692903302&w=2
> > 
> > Discussion here:
> > 
> > 	https://marc.info/?l=linux-kernel&m=126349766324224&w=2
> > 
> > The discussion led to acquiring the runqueue locks, as there was
> > otherwise a need to add code to the scheduler fastpaths.
> 
> TL;DR..  that's far too much to trawl through.

So we re-derive it from first principles instead?  ;-)

> > Some architectures are less precise than others in tracking which
> > CPUs are running a given process due to ASIDs, though this is
> > thought to be a non-problem:
> > 
> > 	https://marc.info/?l=linux-arch&m=126716090413065&w=2
> > 	https://marc.info/?l=linux-arch&m=126716262815202&w=2
> > 
> > Thoughts?
> 
> Yes, there are architectures that only accumulate bits in mm_cpumask(),
> but with the additional check to see if the remote task belongs to our MM
> this should be a non-issue.

Makes sense.

								Thanx, Paul

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-25 22:50                     ` Mathieu Desnoyers
@ 2017-07-26  0:01                       ` Paul E. McKenney
  2017-07-26  7:46                       ` Peter Zijlstra
  1 sibling, 0 replies; 68+ messages in thread
From: Paul E. McKenney @ 2017-07-26  0:01 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Peter Zijlstra, linux-kernel, Ingo Molnar, Lai Jiangshan,
	dipankar, Andrew Morton, Josh Triplett, Thomas Gleixner, rostedt,
	David Howells, Eric Dumazet, fweisbec, Oleg Nesterov,
	Will Deacon

On Tue, Jul 25, 2017 at 10:50:13PM +0000, Mathieu Desnoyers wrote:
> ----- On Jul 25, 2017, at 5:55 PM, Peter Zijlstra peterz@infradead.org wrote:
> 
> > On Tue, Jul 25, 2017 at 02:19:26PM -0700, Paul E. McKenney wrote:
> >> On Tue, Jul 25, 2017 at 10:24:51PM +0200, Peter Zijlstra wrote:
> >> > On Tue, Jul 25, 2017 at 12:36:12PM -0700, Paul E. McKenney wrote:
> [...]
> > 
> >> But it would not be hard for userspace code to force IPIs by repeatedly
> >> awakening higher-priority threads that sleep immediately after being
> >> awakened, right?
> > 
> > RT tasks are not readily available to !root, and the user might have
> > been constrained to a subset of available CPUs.
> > 
> >> > Well, I'm not sure there is an easy means of doing machine wide IPIs for
> >> > !root out there. This would be a first.
> >> > 
> >> > Something along the lines of:
> >> > 
> >> > void dummy(void *arg)
> >> > {
> >> > 	/* IPIs are assumed to be serializing */
> >> > }
> >> > 
> >> > void ipi_mm(struct mm_struct *mm)
> >> > {
> >> > 	cpumask_var_t cpus;
> >> > 	int cpu;
> >> > 
> >> > 	zalloc_cpumask_var(&cpus, GFP_KERNEL);
> >> > 
> >> > 	for_each_cpu(cpu, mm_cpumask(mm)) {
> >> > 		struct task_struct *p;
> >> > 
> >> > 		/*
> >> > 		 * If the current task of @cpu isn't of this @mm, then
> >> > 		 * it needs a context switch to become one, which will
> >> > 		 * provide the ordering we require.
> >> > 		 */
> >> > 		rcu_read_lock();
> >> > 		p = task_rcu_dereference(&cpu_curr(cpu));
> >> > 		if (p && p->mm == mm)
> >> > 			__cpumask_set_cpu(cpu, cpus);
> >> > 		rcu_read_unlock();
> >> > 	}
> >> > 
> >> > 	on_each_cpu_mask(cpus, dummy, NULL, 1);
> >> > }
> >> > 
> >> > Would appear to be minimally invasive and only shoot at CPUs we're
> >> > currently running our process on, which greatly reduces the impact.
> >> 
> >> I am good with this approach as well, and I do very much like that it
> >> avoids IPIing CPUs that aren't running our process (at least in the
> >> common case).  But don't we also need added memory ordering?  It is
> >> sort of OK to IPI a CPU that just now switched away from our process,
> >> but not so good to miss IPIing a CPU that switched to our process just
> >> a little before sys_membarrier().
> > 
> > My thinking was that if we observe '!= mm' that CPU will have to do a
> > context switch in order to make it true. That context switch will
> > provide the ordering we're after so all is well.
> > 
> > Quite possible there's a hole in it, but since I'm running on fumes someone
> > needs to spell it out for me :-)
> > 
> >> I was intending to base this on the last few versions of a 2010 patch,
> >> but maybe things have changed:
> >> 
> >> 	https://marc.info/?l=linux-kernel&m=126358017229620&w=2
> >> 	https://marc.info/?l=linux-kernel&m=126436996014016&w=2
> >> 	https://marc.info/?l=linux-kernel&m=126601479802978&w=2
> >> 	https://marc.info/?l=linux-kernel&m=126970692903302&w=2
> >> 
> >> Discussion here:
> >> 
> >> 	https://marc.info/?l=linux-kernel&m=126349766324224&w=2
> >> 
> >> The discussion led to acquiring the runqueue locks, as there was
> >> otherwise a need to add code to the scheduler fastpaths.
> > 
> > TL;DR..  that's far too much to trawl through.
> > 
> >> Some architectures are less precise than others in tracking which
> >> CPUs are running a given process due to ASIDs, though this is
> >> thought to be a non-problem:
> >> 
> >> 	https://marc.info/?l=linux-arch&m=126716090413065&w=2
> >> 	https://marc.info/?l=linux-arch&m=126716262815202&w=2
> >> 
> >> Thoughts?
> > 
> > Yes, there are architectures that only accumulate bits in mm_cpumask(),
> > but with the additional check to see if the remote task belongs to our MM
> > this should be a non-issue.
> 
> This would implement a MEMBARRIER_CMD_PRIVATE_EXPEDITED (or such) flag
> for expedited process-local effect. This differs from the "SHARED" flag,
> since the SHARED flag affects threads accessing memory mappings shared
> across processes as well.
> 
> I wonder if we could create a MEMBARRIER_CMD_SHARED_EXPEDITED behavior
> by iterating on all memory mappings mapped into the current process,
> and build a cpumask based on the union of all mm masks encountered ?
> Then we could send the IPI to all cpus belonging to that cpumask. Or
> am I missing something obvious ?

I suspect that something like this would work, but I agree with your 2010
self, who argued that this should be follow-on functionality.  After all,
the user probably needs to be aware of who is sharing for other reasons,
and can then make each process do sys_membarrier().

							Thanx, Paul

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-25 23:59                     ` Paul E. McKenney
@ 2017-07-26  7:41                       ` Peter Zijlstra
  2017-07-26 15:41                         ` Paul E. McKenney
  0 siblings, 1 reply; 68+ messages in thread
From: Peter Zijlstra @ 2017-07-26  7:41 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: linux-kernel, mingo, jiangshanlai, dipankar, akpm,
	mathieu.desnoyers, josh, tglx, rostedt, dhowells, edumazet,
	fweisbec, oleg, will.deacon

On Tue, Jul 25, 2017 at 04:59:36PM -0700, Paul E. McKenney wrote:
> On Tue, Jul 25, 2017 at 11:55:10PM +0200, Peter Zijlstra wrote:

> > People always do crazy stuff, but what surprised me is that such a patch
> > got merged in urcu even though it's known broken for a number of
> > architectures.
> 
> It did not get merged into urcu.  It is instead used directly by a
> number of people for a number of concurrent algorithms.

Yah, Mathieu also already pointed that out. It seems I really cannot
deal with github well -- that website always terminally confuses me.

> > > But it would not be hard for userspace code to force IPIs by repeatedly
> > > awakening higher-priority threads that sleep immediately after being
> > > awakened, right?
> > 
> > RT tasks are not readily available to !root, and the user might have
> > been constrained to a subset of available CPUs.
> 
> So non-idle non-nohz CPUs never get IPIed for wakeups of SCHED_OTHER
> threads?

Sure, but SCHED_OTHER auto throttles in that if there's anything else to
run, you get to wait. So you can't generate an IPI storm with it. Also,
again, we can be limited to a subset of CPUs.

> > My thinking was that if we observe '!= mm' that CPU will have to do a
> > context switch in order to make it true. That context switch will
> > provide the ordering we're after so all is well.
> > 
> > Quite possible there's a hole in it, but since I'm running on fumes someone
> > needs to spell it out for me :-)
> 
> This would be the https://marc.info/?l=linux-kernel&m=126349766324224&w=2
> URL below.
> 
> Which might or might not still be applicable.

I think we actually have those two smp_mb()'s around the rq->curr
assignment.

we have smp_mb__before_spinlock(), which per the argument here:

  https://lkml.kernel.org/r/20170607162013.755917928@infradead.org

is actually a full MB, irrespective of that weird smp_wmb() definition
we have now. And we have switch_mm() on the other side.

> > > I was intending to base this on the last few versions of a 2010 patch,
> > > but maybe things have changed:
> > > 
> > > 	https://marc.info/?l=linux-kernel&m=126358017229620&w=2
> > > 	https://marc.info/?l=linux-kernel&m=126436996014016&w=2
> > > 	https://marc.info/?l=linux-kernel&m=126601479802978&w=2
> > > 	https://marc.info/?l=linux-kernel&m=126970692903302&w=2
> > > 
> > > Discussion here:
> > > 
> > > 	https://marc.info/?l=linux-kernel&m=126349766324224&w=2
> > > 
> > > The discussion led to acquiring the runqueue locks, as there was
> > > otherwise a need to add code to the scheduler fastpaths.
> > 
> > TL;DR..  that's far too much to trawl through.
> 
> So we re-derive it from first principles instead?  ;-)

Yep, that's what I usually do anyway, who knows what kind of crazy our
younger selves were up to ;-)

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-25 22:50                     ` Mathieu Desnoyers
  2017-07-26  0:01                       ` Paul E. McKenney
@ 2017-07-26  7:46                       ` Peter Zijlstra
  2017-07-26 15:42                         ` Paul E. McKenney
  1 sibling, 1 reply; 68+ messages in thread
From: Peter Zijlstra @ 2017-07-26  7:46 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Paul E. McKenney, linux-kernel, Ingo Molnar, Lai Jiangshan,
	dipankar, Andrew Morton, Josh Triplett, Thomas Gleixner, rostedt,
	David Howells, Eric Dumazet, fweisbec, Oleg Nesterov,
	Will Deacon

On Tue, Jul 25, 2017 at 10:50:13PM +0000, Mathieu Desnoyers wrote:
> This would implement a MEMBARRIER_CMD_PRIVATE_EXPEDITED (or such) flag
> for expedited process-local effect. This differs from the "SHARED" flag,
> since the SHARED flag affects threads accessing memory mappings shared
> across processes as well.
> 
> I wonder if we could create a MEMBARRIER_CMD_SHARED_EXPEDITED behavior
> by iterating on all memory mappings mapped into the current process,
> and build a cpumask based on the union of all mm masks encountered ?
> Then we could send the IPI to all cpus belonging to that cpumask. Or
> am I missing something obvious ?

I would readily object to such a beast. You far too quickly end up
having to IPI everybody because of some stupid shared map or something
(yes I know, normal DSOs are mapped private).

The whole private_expedited is only palatable because we can only hinder
our own threads (much).

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-25 21:19                 ` Paul E. McKenney
  2017-07-25 21:55                   ` Peter Zijlstra
@ 2017-07-26  9:36                   ` Will Deacon
  2017-07-26 15:46                     ` Paul E. McKenney
  1 sibling, 1 reply; 68+ messages in thread
From: Will Deacon @ 2017-07-26  9:36 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Peter Zijlstra, linux-kernel, mingo, jiangshanlai, dipankar,
	akpm, mathieu.desnoyers, josh, tglx, rostedt, dhowells, edumazet,
	fweisbec, oleg

On Tue, Jul 25, 2017 at 02:19:26PM -0700, Paul E. McKenney wrote:
> Some architectures are less precise than others in tracking which
> CPUs are running a given process due to ASIDs, though this is
> thought to be a non-problem:
> 
> 	https://marc.info/?l=linux-arch&m=126716090413065&w=2
> 	https://marc.info/?l=linux-arch&m=126716262815202&w=2
> 
> Thoughts?

On arm64, we *never* touch mm_cpumask, so it will always be empty. The only
thing we could potentially use it for is non-broadcast TLB invalidation if
the mm was only active on a single CPU, but when I implemented that we
pretty much never hit that case. I was hoping fork()+exec() would trigger it
(e.g. scripts), but the scheduler treats both of those as rebalancing points
so the temporary ASID before the exec always has two CPUs set in its mask.

Will

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-26  7:41                       ` Peter Zijlstra
@ 2017-07-26 15:41                         ` Paul E. McKenney
  2017-07-27  8:30                           ` Peter Zijlstra
  0 siblings, 1 reply; 68+ messages in thread
From: Paul E. McKenney @ 2017-07-26 15:41 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, mingo, jiangshanlai, dipankar, akpm,
	mathieu.desnoyers, josh, tglx, rostedt, dhowells, edumazet,
	fweisbec, oleg, will.deacon

On Wed, Jul 26, 2017 at 09:41:28AM +0200, Peter Zijlstra wrote:
> On Tue, Jul 25, 2017 at 04:59:36PM -0700, Paul E. McKenney wrote:
> > On Tue, Jul 25, 2017 at 11:55:10PM +0200, Peter Zijlstra wrote:
> 
> > > People always do crazy stuff, but what surprised me is that such a patch
> > > got merged in urcu even though it's known broken for a number of
> > > architectures.
> > 
> > It did not get merged into urcu.  It is instead used directly by a
> > number of people for a number of concurrent algorithms.
> 
> Yah, Mathieu also already pointed that out. It seems I really cannot
> deal with github well -- that website always terminally confuses me.
> 
> > > > But it would not be hard for userspace code to force IPIs by repeatedly
> > > > awakening higher-priority threads that sleep immediately after being
> > > > awakened, right?
> > > 
> > > RT tasks are not readily available to !root, and the user might have
> > > been constrained to a subset of available CPUs.
> > 
> > So non-idle non-nohz CPUs never get IPIed for wakeups of SCHED_OTHER
> > threads?
> 
> Sure, but SCHED_OTHER auto throttles in that if there's anything else to
> run, you get to wait. So you can't generate an IPI storm with it. Also,
> again, we can be limited to a subset of CPUs.

OK, what is its auto-throttle policy?  One round of IPIs per jiffy or
some such?

Does this auto-throttling also apply if the user is running a CPU-bound
SCHED_BATCH or SCHED_IDLE task on each CPU, and periodically waking up
one of a large group of SCHED_OTHER tasks, where the SCHED_OTHER tasks
immediately sleep upon being awakened?

> > > My thinking was that if we observe '!= mm' that CPU will have to do a
> > > context switch in order to make it true. That context switch will
> > > provide the ordering we're after so all is well.
> > > 
> > > Quite possible there's a hole in it, but since I'm running on fumes someone
> > > needs to spell it out for me :-)
> > 
> > This would be the https://marc.info/?l=linux-kernel&m=126349766324224&w=2
> > URL below.
> > 
> > Which might or might not still be applicable.
> 
> I think we actually have those two smp_mb()'s around the rq->curr
> assignment.
> 
> we have smp_mb__before_spinlock(), which per the argument here:
> 
>   https://lkml.kernel.org/r/20170607162013.755917928@infradead.org
> 
> is actually a full MB, irrespective of that weird smp_wmb() definition
> we have now. And we have switch_mm() on the other side.

OK, and the rq->curr assignment is in common code, correct?  Does this
allow the IPI-only-requesting-process approach to live entirely within
common code?

The 2010 email thread ended up with sys_membarrier() acquiring the
runqueue lock for each CPU, because doing otherwise meant adding code
to the scheduler fastpath.  Don't we still need to do this?

https://marc.info/?l=linux-kernel&m=126341138408407&w=2
https://marc.info/?l=linux-kernel&m=126349766324224&w=2

> > > > I was intending to base this on the last few versions of a 2010 patch,
> > > > but maybe things have changed:
> > > > 
> > > > 	https://marc.info/?l=linux-kernel&m=126358017229620&w=2
> > > > 	https://marc.info/?l=linux-kernel&m=126436996014016&w=2
> > > > 	https://marc.info/?l=linux-kernel&m=126601479802978&w=2
> > > > 	https://marc.info/?l=linux-kernel&m=126970692903302&w=2
> > > > 
> > > > Discussion here:
> > > > 
> > > > 	https://marc.info/?l=linux-kernel&m=126349766324224&w=2
> > > > 
> > > > The discussion led to acquiring the runqueue locks, as there was
> > > > otherwise a need to add code to the scheduler fastpaths.
> > > 
> > > TL;DR..  that's far too much to trawl through.
> > 
> > So we re-derive it from first principles instead?  ;-)
> 
> Yep, that's what I usually do anyway, who knows what kind of crazy our
> younger selves were up to ;-)

In my experience, it ends up being a type of crazy worth ignoring only
if I don't ignore it.  ;-)

							Thanx, Paul

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-26  7:46                       ` Peter Zijlstra
@ 2017-07-26 15:42                         ` Paul E. McKenney
  2017-07-26 18:01                           ` Mathieu Desnoyers
  0 siblings, 1 reply; 68+ messages in thread
From: Paul E. McKenney @ 2017-07-26 15:42 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Mathieu Desnoyers, linux-kernel, Ingo Molnar, Lai Jiangshan,
	dipankar, Andrew Morton, Josh Triplett, Thomas Gleixner, rostedt,
	David Howells, Eric Dumazet, fweisbec, Oleg Nesterov,
	Will Deacon

On Wed, Jul 26, 2017 at 09:46:56AM +0200, Peter Zijlstra wrote:
> On Tue, Jul 25, 2017 at 10:50:13PM +0000, Mathieu Desnoyers wrote:
> > This would implement a MEMBARRIER_CMD_PRIVATE_EXPEDITED (or such) flag
> > for expedited process-local effect. This differs from the "SHARED" flag,
> > since the SHARED flag affects threads accessing memory mappings shared
> > across processes as well.
> > 
> > I wonder if we could create a MEMBARRIER_CMD_SHARED_EXPEDITED behavior
> > by iterating on all memory mappings mapped into the current process,
> > and build a cpumask based on the union of all mm masks encountered ?
> > Then we could send the IPI to all cpus belonging to that cpumask. Or
> > am I missing something obvious ?
> 
> I would readily object to such a beast. You far too quickly end up
> having to IPI everybody because of some stupid shared map or something
> (yes I know, normal DSOs are mapped private).

Agreed, we should keep things simple to start with.  The user can always
invoke sys_membarrier() from each process.

							Thanx, Paul

> The whole private_expedited is only palatable because we can only hinder
> our own threads (much).
> 

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-26  9:36                   ` Will Deacon
@ 2017-07-26 15:46                     ` Paul E. McKenney
  0 siblings, 0 replies; 68+ messages in thread
From: Paul E. McKenney @ 2017-07-26 15:46 UTC (permalink / raw)
  To: Will Deacon
  Cc: Peter Zijlstra, linux-kernel, mingo, jiangshanlai, dipankar,
	akpm, mathieu.desnoyers, josh, tglx, rostedt, dhowells, edumazet,
	fweisbec, oleg

On Wed, Jul 26, 2017 at 10:36:46AM +0100, Will Deacon wrote:
> On Tue, Jul 25, 2017 at 02:19:26PM -0700, Paul E. McKenney wrote:
> > Some architectures are less precise than others in tracking which
> > CPUs are running a given process due to ASIDs, though this is
> > thought to be a non-problem:
> > 
> > 	https://marc.info/?l=linux-arch&m=126716090413065&w=2
> > 	https://marc.info/?l=linux-arch&m=126716262815202&w=2
> > 
> > Thoughts?
> 
> On arm64, we *never* touch mm_cpumask, so it will always be empty. The only
> thing we could potentially use it for is non-broadcast TLB invalidation if
> the mm was only active on a single CPU, but when I implemented that we
> pretty much never hit that case. I was hoping fork()+exec() would trigger it
> (e.g. scripts), but the scheduler treats both of those as rebalancing points
> so the temporary ASID before the exec always has two CPUs set in its mask.

OK, so it sounds like we should ignore ->mm_cpumask except possibly
for an architecture-specific optimization, and rely solely on ->mm
in the absence of such optimizations, right?
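
A hedged sketch of that conclusion, adapting the earlier ipi_mm() idea
to walk all online CPUs and key purely off ->mm; the function names are
made up, and falling back to synchronize_sched() on allocation failure
is an assumption, not anything from the posted patch:

static void membarrier_ipi(void *arg)
{
	smp_mb();	/* order the interrupted CPU's prior accesses */
}

static void membarrier_expedited(struct mm_struct *mm)
{
	cpumask_var_t cpus;
	int cpu;

	if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) {
		synchronize_sched();	/* fall back to a normal GP */
		return;
	}

	for_each_online_cpu(cpu) {
		struct task_struct *p;

		rcu_read_lock();
		p = task_rcu_dereference(&cpu_curr(cpu));
		if (p && p->mm == mm)
			__cpumask_set_cpu(cpu, cpus);
		rcu_read_unlock();
	}

	on_each_cpu_mask(cpus, membarrier_ipi, NULL, 1);
	free_cpumask_var(cpus);
}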

							Thanx, Paul

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-26 15:42                         ` Paul E. McKenney
@ 2017-07-26 18:01                           ` Mathieu Desnoyers
  2017-07-26 18:30                             ` Paul E. McKenney
  2017-07-27  8:53                             ` Peter Zijlstra
  0 siblings, 2 replies; 68+ messages in thread
From: Mathieu Desnoyers @ 2017-07-26 18:01 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Peter Zijlstra, linux-kernel, Ingo Molnar, Lai Jiangshan,
	dipankar, Andrew Morton, Josh Triplett, Thomas Gleixner, rostedt,
	David Howells, Eric Dumazet, fweisbec, Oleg Nesterov,
	Will Deacon

----- On Jul 26, 2017, at 11:42 AM, Paul E. McKenney paulmck@linux.vnet.ibm.com wrote:

> On Wed, Jul 26, 2017 at 09:46:56AM +0200, Peter Zijlstra wrote:
>> On Tue, Jul 25, 2017 at 10:50:13PM +0000, Mathieu Desnoyers wrote:
>> > This would implement a MEMBARRIER_CMD_PRIVATE_EXPEDITED (or such) flag
>> > for expedited process-local effect. This differs from the "SHARED" flag,
>> > since the SHARED flag affects threads accessing memory mappings shared
>> > across processes as well.
>> > 
>> > I wonder if we could create a MEMBARRIER_CMD_SHARED_EXPEDITED behavior
>> > by iterating on all memory mappings mapped into the current process,
>> > and build a cpumask based on the union of all mm masks encountered ?
>> > Then we could send the IPI to all cpus belonging to that cpumask. Or
>> > am I missing something obvious ?
>> 
>> I would readily object to such a beast. You far too quickly end up
>> having to IPI everybody because of some stupid shared map or something
>> (yes I know, normal DSOs are mapped private).
> 
> Agreed, we should keep things simple to start with.  The user can always
> invoke sys_membarrier() from each process.

Another alternative for a MEMBARRIER_CMD_SHARED_EXPEDITED would be rate-limiting
per thread. For instance, we could add a new "ulimit" that would bound the
number of expedited membarrier per thread that can be done per millisecond,
and switch to synchronize_sched() whenever a thread goes beyond that limit
for the rest of the time-slot.

A RT system that really cares about not having userspace sending IPIs
to all cpus could set the ulimit value to 0, which would always use
synchronize_sched().
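
As a rough sketch only (the rlimit, the task_struct fields, and the
expedited helper are all hypothetical names, not existing interfaces),
the per-time-slot limit could look something like:

static void membarrier_rate_limited_expedited(void)
{
	struct task_struct *t = current;
	unsigned long limit = rlimit(RLIMIT_MEMBARRIER);	/* hypothetical */

	/* open a new time-slot (~1ms, at least one jiffy) when it expires */
	if (time_after(jiffies, t->membarrier_slot + msecs_to_jiffies(1))) {
		t->membarrier_slot = jiffies;		/* hypothetical fields */
		t->membarrier_count = 0;
	}

	if (limit && t->membarrier_count++ < limit)
		membarrier_do_expedited();	/* hypothetical IPI-based path */
	else
		synchronize_sched();		/* over budget, or limit set to 0 */
}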

Thoughts ?

Thanks,

Mathieu


-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-26 18:01                           ` Mathieu Desnoyers
@ 2017-07-26 18:30                             ` Paul E. McKenney
  2017-07-26 20:37                               ` Mathieu Desnoyers
  2017-07-27 10:24                               ` Peter Zijlstra
  2017-07-27  8:53                             ` Peter Zijlstra
  1 sibling, 2 replies; 68+ messages in thread
From: Paul E. McKenney @ 2017-07-26 18:30 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Peter Zijlstra, linux-kernel, Ingo Molnar, Lai Jiangshan,
	dipankar, Andrew Morton, Josh Triplett, Thomas Gleixner, rostedt,
	David Howells, Eric Dumazet, fweisbec, Oleg Nesterov,
	Will Deacon

On Wed, Jul 26, 2017 at 06:01:15PM +0000, Mathieu Desnoyers wrote:
> ----- On Jul 26, 2017, at 11:42 AM, Paul E. McKenney paulmck@linux.vnet.ibm.com wrote:
> 
> > On Wed, Jul 26, 2017 at 09:46:56AM +0200, Peter Zijlstra wrote:
> >> On Tue, Jul 25, 2017 at 10:50:13PM +0000, Mathieu Desnoyers wrote:
> >> > This would implement a MEMBARRIER_CMD_PRIVATE_EXPEDITED (or such) flag
> >> > for expedited process-local effect. This differs from the "SHARED" flag,
> >> > since the SHARED flag affects threads accessing memory mappings shared
> >> > across processes as well.
> >> > 
> >> > I wonder if we could create a MEMBARRIER_CMD_SHARED_EXPEDITED behavior
> >> > by iterating on all memory mappings mapped into the current process,
> >> > and build a cpumask based on the union of all mm masks encountered ?
> >> > Then we could send the IPI to all cpus belonging to that cpumask. Or
> >> > am I missing something obvious ?
> >> 
> >> I would readily object to such a beast. You far too quickly end up
> >> having to IPI everybody because of some stupid shared map or something
> >> (yes I know, normal DSOs are mapped private).
> > 
> > Agreed, we should keep things simple to start with.  The user can always
> > invoke sys_membarrier() from each process.
> 
> Another alternative for a MEMBARRIER_CMD_SHARED_EXPEDITED would be rate-limiting
> per thread. For instance, we could add a new "ulimit" that would bound the
> number of expedited membarrier per thread that can be done per millisecond,
> and switch to synchronize_sched() whenever a thread goes beyond that limit
> for the rest of the time-slot.
> 
> A RT system that really cares about not having userspace sending IPIs
> to all cpus could set the ulimit value to 0, which would always use
> synchronize_sched().
> 
> Thoughts ?

The patch I posted reverts to synchronize_sched() in kernels booted with
rcupdate.rcu_normal=1.  ;-)
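
That defanging amounts to roughly the behavior below (the helper name
is made up; in the actual primitives the rcu_gp_is_normal() check lives
inside synchronize_sched_expedited() itself):

static void membarrier_expedited_gp(void)
{
	if (rcu_gp_is_normal())
		synchronize_sched();		/* rcupdate.rcu_normal=1 */
	else
		synchronize_sched_expedited();	/* IPI-based expedited GP */
}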

But who is pushing for multiple-process sys_membarrier()?  Everyone I
have talked to is OK with it being local to the current process.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-26 18:30                             ` Paul E. McKenney
@ 2017-07-26 20:37                               ` Mathieu Desnoyers
  2017-07-26 21:11                                 ` Paul E. McKenney
  2017-07-27 10:24                               ` Peter Zijlstra
  1 sibling, 1 reply; 68+ messages in thread
From: Mathieu Desnoyers @ 2017-07-26 20:37 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Peter Zijlstra, linux-kernel, Ingo Molnar, Lai Jiangshan,
	dipankar, Andrew Morton, Josh Triplett, Thomas Gleixner, rostedt,
	David Howells, Eric Dumazet, fweisbec, Oleg Nesterov,
	Will Deacon

----- On Jul 26, 2017, at 2:30 PM, Paul E. McKenney paulmck@linux.vnet.ibm.com wrote:

> On Wed, Jul 26, 2017 at 06:01:15PM +0000, Mathieu Desnoyers wrote:
>> ----- On Jul 26, 2017, at 11:42 AM, Paul E. McKenney paulmck@linux.vnet.ibm.com
>> wrote:
>> 
>> > On Wed, Jul 26, 2017 at 09:46:56AM +0200, Peter Zijlstra wrote:
>> >> On Tue, Jul 25, 2017 at 10:50:13PM +0000, Mathieu Desnoyers wrote:
>> >> > This would implement a MEMBARRIER_CMD_PRIVATE_EXPEDITED (or such) flag
>> >> > for expedited process-local effect. This differs from the "SHARED" flag,
>> >> > since the SHARED flag affects threads accessing memory mappings shared
>> >> > across processes as well.
>> >> > 
>> >> > I wonder if we could create a MEMBARRIER_CMD_SHARED_EXPEDITED behavior
>> >> > by iterating on all memory mappings mapped into the current process,
>> >> > and build a cpumask based on the union of all mm masks encountered ?
>> >> > Then we could send the IPI to all cpus belonging to that cpumask. Or
>> >> > am I missing something obvious ?
>> >> 
>> >> I would readily object to such a beast. You far too quickly end up
>> >> having to IPI everybody because of some stupid shared map or something
>> >> (yes I know, normal DSOs are mapped private).
>> > 
>> > Agreed, we should keep things simple to start with.  The user can always
>> > invoke sys_membarrier() from each process.
>> 
>> Another alternative for a MEMBARRIER_CMD_SHARED_EXPEDITED would be rate-limiting
>> per thread. For instance, we could add a new "ulimit" that would bound the
>> number of expedited membarrier per thread that can be done per millisecond,
>> and switch to synchronize_sched() whenever a thread goes beyond that limit
>> for the rest of the time-slot.
>> 
>> A RT system that really cares about not having userspace sending IPIs
>> to all cpus could set the ulimit value to 0, which would always use
>> synchronize_sched().
>> 
>> Thoughts ?
> 
> The patch I posted reverts to synchronize_sched() in kernels booted with
> rcupdate.rcu_normal=1.  ;-)
> 
> But who is pushing for multiple-process sys_membarrier()?  Everyone I
> have talked to is OK with it being local to the current process.

I guess I'm probably the guilty one intending to do weird stuff in userspace ;)

Here are my two use-cases: 

* a new multi-process liburcu flavor, useful if e.g. a set of processes are
  responsible for updating a shared memory data structure, and a separate set
  of processes read that data structure. The readers can be killed without ill
  effect on the other processes. The synchronization could be done by one
  multi-process liburcu flavor per reader process "group".

* lttng-ust user-space ring buffers (shared across processes).

Both rely on a shared memory mapping for communication between processes, and
I would like to be able to issue a sys_membarrier targeting all CPUs that may
currently touch the shared memory mapping.

I don't really need a system-wide effect, but I would like to be able to target
a shared memory mapping and efficiently do an expedited sys_membarrier on all
cpus involved.

With lttng-ust, the shared buffers can span across 1000+ processes, so
asking each process to issue sys_membarrier would add lots of unneeded overhead,
because this would issue lots of needless memory barriers.

Thoughts ?

Thanks,

Mathieu


-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-26 20:37                               ` Mathieu Desnoyers
@ 2017-07-26 21:11                                 ` Paul E. McKenney
  2017-07-27  1:45                                   ` Paul E. McKenney
  0 siblings, 1 reply; 68+ messages in thread
From: Paul E. McKenney @ 2017-07-26 21:11 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Peter Zijlstra, linux-kernel, Ingo Molnar, Lai Jiangshan,
	dipankar, Andrew Morton, Josh Triplett, Thomas Gleixner, rostedt,
	David Howells, Eric Dumazet, fweisbec, Oleg Nesterov,
	Will Deacon

On Wed, Jul 26, 2017 at 08:37:23PM +0000, Mathieu Desnoyers wrote:
> ----- On Jul 26, 2017, at 2:30 PM, Paul E. McKenney paulmck@linux.vnet.ibm.com wrote:
> 
> > On Wed, Jul 26, 2017 at 06:01:15PM +0000, Mathieu Desnoyers wrote:
> >> ----- On Jul 26, 2017, at 11:42 AM, Paul E. McKenney paulmck@linux.vnet.ibm.com
> >> wrote:
> >> 
> >> > On Wed, Jul 26, 2017 at 09:46:56AM +0200, Peter Zijlstra wrote:
> >> >> On Tue, Jul 25, 2017 at 10:50:13PM +0000, Mathieu Desnoyers wrote:
> >> >> > This would implement a MEMBARRIER_CMD_PRIVATE_EXPEDITED (or such) flag
> >> >> > for expedited process-local effect. This differs from the "SHARED" flag,
> >> >> > since the SHARED flag affects threads accessing memory mappings shared
> >> >> > across processes as well.
> >> >> > 
> >> >> > I wonder if we could create a MEMBARRIER_CMD_SHARED_EXPEDITED behavior
> >> >> > by iterating on all memory mappings mapped into the current process,
> >> >> > and build a cpumask based on the union of all mm masks encountered ?
> >> >> > Then we could send the IPI to all cpus belonging to that cpumask. Or
> >> >> > am I missing something obvious ?
> >> >> 
> >> >> I would readily object to such a beast. You far too quickly end up
> >> >> having to IPI everybody because of some stupid shared map or something
> >> >> (yes I know, normal DSOs are mapped private).
> >> > 
> >> > Agreed, we should keep things simple to start with.  The user can always
> >> > invoke sys_membarrier() from each process.
> >> 
> >> Another alternative for a MEMBARRIER_CMD_SHARED_EXPEDITED would be rate-limiting
> >> per thread. For instance, we could add a new "ulimit" that would bound the
> >> number of expedited membarrier per thread that can be done per millisecond,
> >> and switch to synchronize_sched() whenever a thread goes beyond that limit
> >> for the rest of the time-slot.
> >> 
> >> A RT system that really cares about not having userspace sending IPIs
> >> to all cpus could set the ulimit value to 0, which would always use
> >> synchronize_sched().
> >> 
> >> Thoughts ?
> > 
> > The patch I posted reverts to synchronize_sched() in kernels booted with
> > rcupdate.rcu_normal=1.  ;-)
> > 
> > But who is pushing for multiple-process sys_membarrier()?  Everyone I
> > have talked to is OK with it being local to the current process.
> 
> I guess I'm probably the guilty one intending to do weird stuff in userspace ;)
> 
> Here are my two use-cases: 
> 
> * a new multi-process liburcu flavor, useful if e.g. a set of processes are
>   responsible for updating a shared memory data structure, and a separate set
>   of processes read that data structure. The readers can be killed without ill
>   effect on the other processes. The synchronization could be done by one
>   multi-process liburcu flavor per reader process "group".
> 
> * lttng-ust user-space ring buffers (shared across processes).
> 
> Both rely on a shared memory mapping for communication between processes, and
> I would like to be able to issue a sys_membarrier targeting all CPUs that may
> currently touch the shared memory mapping.
> 
> I don't really need a system-wide effect, but I would like to be able to target
> a shared memory mapping and efficiently do an expedited sys_membarrier on all
> cpus involved.
> 
> With lttng-ust, the shared buffers can span across 1000+ processes, so
> asking each process to issue sys_membarrier would add lots of unneeded overhead,
> because this would issue lots of needless memory barriers.
> 
> Thoughts ?

Dealing explicitly with 1000+ processes sounds like no picnic.  It instead
sounds like a job for synchronize_sched_expedited().  ;-)

							Thanx, Paul

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-26 21:11                                 ` Paul E. McKenney
@ 2017-07-27  1:45                                   ` Paul E. McKenney
  2017-07-27 12:39                                     ` Mathieu Desnoyers
  0 siblings, 1 reply; 68+ messages in thread
From: Paul E. McKenney @ 2017-07-27  1:45 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Peter Zijlstra, linux-kernel, Ingo Molnar, Lai Jiangshan,
	dipankar, Andrew Morton, Josh Triplett, Thomas Gleixner, rostedt,
	David Howells, Eric Dumazet, fweisbec, Oleg Nesterov,
	Will Deacon

On Wed, Jul 26, 2017 at 02:11:46PM -0700, Paul E. McKenney wrote:
> On Wed, Jul 26, 2017 at 08:37:23PM +0000, Mathieu Desnoyers wrote:
> > ----- On Jul 26, 2017, at 2:30 PM, Paul E. McKenney paulmck@linux.vnet.ibm.com wrote:
> > 
> > > On Wed, Jul 26, 2017 at 06:01:15PM +0000, Mathieu Desnoyers wrote:
> > >> ----- On Jul 26, 2017, at 11:42 AM, Paul E. McKenney paulmck@linux.vnet.ibm.com
> > >> wrote:
> > >> 
> > >> > On Wed, Jul 26, 2017 at 09:46:56AM +0200, Peter Zijlstra wrote:
> > >> >> On Tue, Jul 25, 2017 at 10:50:13PM +0000, Mathieu Desnoyers wrote:
> > >> >> > This would implement a MEMBARRIER_CMD_PRIVATE_EXPEDITED (or such) flag
> > >> >> > for expedited process-local effect. This differs from the "SHARED" flag,
> > >> >> > since the SHARED flag affects threads accessing memory mappings shared
> > >> >> > across processes as well.
> > >> >> > 
> > >> >> > I wonder if we could create a MEMBARRIER_CMD_SHARED_EXPEDITED behavior
> > >> >> > by iterating on all memory mappings mapped into the current process,
> > >> >> > and build a cpumask based on the union of all mm masks encountered ?
> > >> >> > Then we could send the IPI to all cpus belonging to that cpumask. Or
> > >> >> > am I missing something obvious ?
> > >> >> 
> > >> >> I would readily object to such a beast. You far too quickly end up
> > >> >> having to IPI everybody because of some stupid shared map or something
> > >> >> (yes I know, normal DSOs are mapped private).
> > >> > 
> > >> > Agreed, we should keep things simple to start with.  The user can always
> > >> > invoke sys_membarrier() from each process.
> > >> 
> > >> Another alternative for a MEMBARRIER_CMD_SHARED_EXPEDITED would be rate-limiting
> > >> per thread. For instance, we could add a new "ulimit" that would bound the
> > >> number of expedited membarrier per thread that can be done per millisecond,
> > >> and switch to synchronize_sched() whenever a thread goes beyond that limit
> > >> for the rest of the time-slot.
> > >> 
> > >> A RT system that really cares about not having userspace sending IPIs
> > >> to all cpus could set the ulimit value to 0, which would always use
> > >> synchronize_sched().
> > >> 
> > >> Thoughts ?
> > > 
> > > The patch I posted reverts to synchronize_sched() in kernels booted with
> > > rcupdate.rcu_normal=1.  ;-)
> > > 
> > > But who is pushing for multiple-process sys_membarrier()?  Everyone I
> > > have talked to is OK with it being local to the current process.
> > 
> > I guess I'm probably the guilty one intending to do weird stuff in userspace ;)
> > 
> > Here are my two use-cases: 
> > 
> > * a new multi-process liburcu flavor, useful if e.g. a set of processes are
> >   responsible for updating a shared memory data structure, and a separate set
> >   of processes read that data structure. The readers can be killed without ill
> >   effect on the other processes. The synchronization could be done by one
> >   multi-process liburcu flavor per reader process "group".
> > 
> > * lttng-ust user-space ring buffers (shared across processes).
> > 
> > Both rely on a shared memory mapping for communication between processes, and
> > I would like to be able to issue a sys_membarrier targeting all CPUs that may
> > currently touch the shared memory mapping.
> > 
> > I don't really need a system-wide effect, but I would like to be able to target
> > a shared memory mapping and efficiently do an expedited sys_membarrier on all
> > cpus involved.
> > 
> > With lttng-ust, the shared buffers can span across 1000+ processes, so
> > asking each process to issue sys_membarrier would add lots of unneeded overhead,
> > because this would issue lots of needless memory barriers.
> > 
> > Thoughts ?
> 
> Dealing explicitly with 1000+ processes sounds like no picnic.  It instead
> sounds like a job for synchronize_sched_expedited().  ;-)

Actually...

Mathieu, does your use case require unprivileged access to sys_membarrier()?

							Thanx, Paul

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-26 15:41                         ` Paul E. McKenney
@ 2017-07-27  8:30                           ` Peter Zijlstra
  2017-07-27 13:08                             ` Paul E. McKenney
  0 siblings, 1 reply; 68+ messages in thread
From: Peter Zijlstra @ 2017-07-27  8:30 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: linux-kernel, mingo, jiangshanlai, dipankar, akpm,
	mathieu.desnoyers, josh, tglx, rostedt, dhowells, edumazet,
	fweisbec, oleg, will.deacon

On Wed, Jul 26, 2017 at 08:41:10AM -0700, Paul E. McKenney wrote:
> On Wed, Jul 26, 2017 at 09:41:28AM +0200, Peter Zijlstra wrote:
> > On Tue, Jul 25, 2017 at 04:59:36PM -0700, Paul E. McKenney wrote:

> > Sure, but SCHED_OTHER auto throttles in that if there's anything else to
> > run, you get to wait. So you can't generate an IPI storm with it. Also,
> > again, we can be limited to a subset of CPUs.
> 
> OK, what is its auto-throttle policy?  One round of IPIs per jiffy or
> some such?

No. It's called wakeup latency :-) Your SCHED_OTHER task will not get to
insta-run all the time. If there are other tasks already running, we'll
not IPI unless it should preempt.

If it's idle, nobody cares..

> Does this auto-throttling also apply if the user is running a CPU-bound
> SCHED_BATCH or SCHED_IDLE task on each CPU, and periodically waking up
> one of a large group of SCHED_OTHER tasks, where the SCHED_OTHER tasks
> immediately sleep upon being awakened?

SCHED_BATCH is even more likely to suffer wakeup latency since it will
never preempt anything.

> OK, and the rq->curr assignment is in common code, correct?  Does this
> allow the IPI-only-requesting-process approach to live entirely within
> common code?

That is the idea.

> The 2010 email thread ended up with sys_membarrier() acquiring the
> runqueue lock for each CPU,

Yes, that's something I'm not happy with. Machine wide banging of that
lock will be a performance no-no.

> because doing otherwise meant adding code to the scheduler fastpath.

And that's obviously another thing I'm not happy with either.

> Don't we still need to do this?
> 
> https://marc.info/?l=linux-kernel&m=126341138408407&w=2
> https://marc.info/?l=linux-kernel&m=126349766324224&w=2

I don't know.. those seem focussed on mm_cpumask() and we can't use that
per Will's email.


So I think we need to think anew on this, start from the ground up.

What is missing for this:

static void ipi_mb(void *info)
{
	smp_mb(); // IPIs should be serializing but paranoid
}


sys_membarrier()
{
	smp_mb(); // because sysenter isn't an unconditional mb

	for_each_online_cpu(cpu) {
		struct task_struct *p;

		rcu_read_lock();
		p = task_rcu_dereference(&cpu_curr(cpu));
		if (p && p->mm == current->mm)
			__cpumask_set_cpu(cpu, cpus);
		rcu_read_unlock();
	}

	on_each_cpu_mask(cpus, ipi_mb, NULL, 1); // does local smp_mb() too
}

VS

__schedule()
{
	spin_lock(&rq->lock);
	smp_mb__after_spinlock(); // really full mb implied

	/* lots */

	if (likely(prev != next)) {

		rq->curr = next;

		context_switch() {
			switch_mm();
			switch_to();
			// neither need imply a barrier

			spin_unlock(&rq->lock);
		}
	}
}




So I think we need either switch_mm() or switch_to() to imply a full
barrier for this to work, otherwise we get:

  CPU0				CPU1


  lock rq->lock
  mb

  rq->curr = A

  unlock rq->lock

  lock rq->lock
  mb

				sys_membarrier()

				mb

				for_each_online_cpu()
				  p = A
				  // no match no IPI

				mb
  rq->curr = B

  unlock rq->lock


And that's bad, because now CPU0 doesn't have an MB happening _after_
sys_membarrier() if B matches.


So without audit, I only know of PPC and Alpha not having a barrier in
either switch_*().

x86 obviously has barriers all over the place, arm has a super duper
heavy barrier in switch_to().

One option might be to resurrect spin_unlock_wait(), although to use
that here is really ugly too, but it would avoid thrashing the
rq->lock.

I think it'd end up having to look like:

	rq = cpu_rq(cpu);
again:
	rcu_read_lock()
	p = task_rcu_dereference(&rq->curr);
	if (p) {
		raw_spin_unlock_wait(&rq->lock);
		q = task_rcu_dereference(&rq->curr);
		if (q != p) {
			rcu_read_unlock();
			goto again;
		}
	}
	...

which is just about as horrible as it looks.
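
To make the sys_membarrier() sketch earlier in this message a little more
concrete, one way the missing cpumask handling could be filled in is shown
below.  This is still only a sketch of the proposal under discussion, not the
code that eventually needs to land: it ignores CPU-hotplug locking, it relies
on cpu_curr(), which is scheduler-internal (so it would have to live under
kernel/sched/), and, as the reordering scenario above shows, it is only sound
if the scheduler provides full ordering around the rq->curr update.

/* Sketch: IPI only those CPUs whose current task shares our mm. */
#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/smp.h>

static void ipi_mb(void *info)
{
	smp_mb();	/* IPIs should be serializing, but be paranoid */
}

static int membarrier_expedited_private(void)
{
	cpumask_var_t cpus;
	int cpu;

	if (!zalloc_cpumask_var(&cpus, GFP_KERNEL))
		return -ENOMEM;

	smp_mb();	/* system-call entry is not an unconditional barrier */

	for_each_online_cpu(cpu) {
		struct task_struct *p;

		rcu_read_lock();
		p = task_rcu_dereference(&cpu_curr(cpu));
		if (p && p->mm == current->mm)
			__cpumask_set_cpu(cpu, cpus);
		rcu_read_unlock();
	}

	/* Runs ipi_mb() on each selected CPU, the local CPU included. */
	on_each_cpu_mask(cpus, ipi_mb, NULL, 1);

	free_cpumask_var(cpus);
	return 0;
}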

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-26 18:01                           ` Mathieu Desnoyers
  2017-07-26 18:30                             ` Paul E. McKenney
@ 2017-07-27  8:53                             ` Peter Zijlstra
  2017-07-27 10:09                               ` Peter Zijlstra
                                                 ` (2 more replies)
  1 sibling, 3 replies; 68+ messages in thread
From: Peter Zijlstra @ 2017-07-27  8:53 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Paul E. McKenney, linux-kernel, Ingo Molnar, Lai Jiangshan,
	dipankar, Andrew Morton, Josh Triplett, Thomas Gleixner, rostedt,
	David Howells, Eric Dumazet, fweisbec, Oleg Nesterov,
	Will Deacon

On Wed, Jul 26, 2017 at 06:01:15PM +0000, Mathieu Desnoyers wrote:

> Another alternative for a MEMBARRIER_CMD_SHARED_EXPEDITED would be rate-limiting
> per thread. For instance, we could add a new "ulimit" that would bound the
> number of expedited membarrier per thread that can be done per millisecond,
> and switch to synchronize_sched() whenever a thread goes beyond that limit
> for the rest of the time-slot.

You forgot to ask yourself how you could abuse this.. just spawn more
threads.

Per-thread limits are nearly useless, because spawning new threads is
cheap.

> A RT system that really cares about not having userspace sending IPIs
> to all cpus could set the ulimit value to 0, which would always use
> synchronize_sched().
> 
> Thoughts ?

So I really don't like SHARED_EXPEDITED, and your use-cases (from later
emails) make me think sys_membarrier() should have a pointer argument
to identify the shared mapping.

But even then, iterating the rmap for something that has 1000+ maps
isn't going to be nice or fast, even in kernel space.

Another crazy idea is using madvise() for this. The new MADV_MEMBAR
could revoke PROT_WRITE and PROT_READ for all extant PTEs. Then the
tasks attempting access will fault and the fault handler can figure out
if it still needs to issue a MB or not before reinstating the PTE.

That is fully contained to the tasks actually having that map, and
doesn't perturb anybody else.

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-27  8:53                             ` Peter Zijlstra
@ 2017-07-27 10:09                               ` Peter Zijlstra
  2017-07-27 10:22                               ` Will Deacon
  2017-07-27 13:14                               ` Paul E. McKenney
  2 siblings, 0 replies; 68+ messages in thread
From: Peter Zijlstra @ 2017-07-27 10:09 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Paul E. McKenney, linux-kernel, Ingo Molnar, Lai Jiangshan,
	dipankar, Andrew Morton, Josh Triplett, Thomas Gleixner, rostedt,
	David Howells, Eric Dumazet, fweisbec, Oleg Nesterov,
	Will Deacon

On Thu, Jul 27, 2017 at 10:53:12AM +0200, Peter Zijlstra wrote:

> Another crazy idea is using madvise() for this. The new MADV_MEMBAR
> could revoke PROT_WRITE and PROT_READ for all extant PTEs. Then the
> tasks attempting access will fault and the fault handler can figure out
> if it still needs to issue a MB or not before reinstating the PTE.

A slight oversight is that page tables are per process but we need an MB
per thread, so the fault handler can simply issue a private
sys_membarrier() to spread the joy around the local process.

> That is fully contained to the tasks actually having that map, and
> doesn't perturb anybody else.

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-25 19:36             ` Paul E. McKenney
  2017-07-25 20:24               ` Peter Zijlstra
@ 2017-07-27 10:14               ` Peter Zijlstra
  2017-07-27 12:56                 ` Paul E. McKenney
  1 sibling, 1 reply; 68+ messages in thread
From: Peter Zijlstra @ 2017-07-27 10:14 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: linux-kernel, mingo, jiangshanlai, dipankar, akpm,
	mathieu.desnoyers, josh, tglx, rostedt, dhowells, edumazet,
	fweisbec, oleg

On Tue, Jul 25, 2017 at 12:36:12PM -0700, Paul E. McKenney wrote:
> This horse is already out, so trying to shut the gate won't be effective.

So I'm not convinced it is. The mprotect() hack isn't portable as we've
established and on x86 where it does work, it doesn't (much) perturb
tasks not related to our process because we keep a tight mm_cpumask().

And if there are other (unpriv.) means of spraying IPIs around, we
should most certainly look at fixing those, not just shrug and make
matters worse.
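
For readers who have not seen it, the mprotect() hack being referred to is
roughly the following: dirty a private dummy page, then downgrade and restore
its protections so that the kernel's TLB shootdown sends IPIs to the CPUs
currently running the process.  A sketch is below; as this thread keeps
pointing out, it relies on unspecified side effects (which CPUs get IPIed,
and whether the IPI implies a full barrier), which is exactly why a proper
sys_membarrier() replacement is wanted.

/*
 * Sketch of the mprotect()-based "poor man's membarrier".  Works in
 * practice on x86 because permission downgrades force IPI-based TLB
 * shootdown to the CPUs in mm_cpumask(); nothing guarantees this
 * elsewhere.  Error handling is minimal.
 */
#define _GNU_SOURCE
#include <sys/mman.h>
#include <stdlib.h>
#include <unistd.h>

static void *dummy_page;

void barrier_hack_init(void)
{
	long psz = sysconf(_SC_PAGESIZE);

	dummy_page = mmap(NULL, psz, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (dummy_page == MAP_FAILED)
		abort();
	*(volatile char *)dummy_page = 1;	/* make the PTE present and dirty */
}

void barrier_hack(void)
{
	long psz = sysconf(_SC_PAGESIZE);

	/* Downgrading a writable, dirty mapping forces a TLB flush. */
	mprotect(dummy_page, psz, PROT_READ);
	mprotect(dummy_page, psz, PROT_READ | PROT_WRITE);
	*(volatile char *)dummy_page = 1;	/* re-dirty for next time */
}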

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-27  8:53                             ` Peter Zijlstra
  2017-07-27 10:09                               ` Peter Zijlstra
@ 2017-07-27 10:22                               ` Will Deacon
  2017-07-27 13:14                               ` Paul E. McKenney
  2 siblings, 0 replies; 68+ messages in thread
From: Will Deacon @ 2017-07-27 10:22 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Mathieu Desnoyers, Paul E. McKenney, linux-kernel, Ingo Molnar,
	Lai Jiangshan, dipankar, Andrew Morton, Josh Triplett,
	Thomas Gleixner, rostedt, David Howells, Eric Dumazet, fweisbec,
	Oleg Nesterov

On Thu, Jul 27, 2017 at 10:53:12AM +0200, Peter Zijlstra wrote:
> On Wed, Jul 26, 2017 at 06:01:15PM +0000, Mathieu Desnoyers wrote:
> 
> > Another alternative for a MEMBARRIER_CMD_SHARED_EXPEDITED would be rate-limiting
> > per thread. For instance, we could add a new "ulimit" that would bound the
> > number of expedited membarrier per thread that can be done per millisecond,
> > and switch to synchronize_sched() whenever a thread goes beyond that limit
> > for the rest of the time-slot.
> 
> You forgot to ask yourself how you could abuse this.. just spawn more
> threads.
> 
> Per-thread limits are nearly useless, because spawning new threads is
> cheap.
> 
> > A RT system that really cares about not having userspace sending IPIs
> > to all cpus could set the ulimit value to 0, which would always use
> > synchronize_sched().
> > 
> > Thoughts ?
> 
> So I really don't like SHARED_EXPEDITED, and your use-cases (from later
> emails) makes me think sys_membarrier() should have a pointer argument
> to identify the shared mapping.
> 
> But even then, iterating the rmap for something that has 1000+ maps
> isn't going to be nice or fast, even in kernel space.
> 
> Another crazy idea is using madvise() for this. The new MADV_MEMBAR
> could revoke PROT_WRITE and PROT_READ for all extant PTEs. Then the
> tasks attempting access will fault and the fault handler can figure out
> if it still needs to issue a MB or not before reinstating the PTE.

If you did that, wouldn't you need to leave the faulting entries intact
until all CPUs with the mm scheduled have either faulted or
context-switched? We don't have per-cpu permission bits in the page tables,
unfortunately.

Will

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-26 18:30                             ` Paul E. McKenney
  2017-07-26 20:37                               ` Mathieu Desnoyers
@ 2017-07-27 10:24                               ` Peter Zijlstra
  2017-07-27 14:52                                 ` Paul E. McKenney
  1 sibling, 1 reply; 68+ messages in thread
From: Peter Zijlstra @ 2017-07-27 10:24 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Mathieu Desnoyers, linux-kernel, Ingo Molnar, Lai Jiangshan,
	dipankar, Andrew Morton, Josh Triplett, Thomas Gleixner, rostedt,
	David Howells, Eric Dumazet, fweisbec, Oleg Nesterov,
	Will Deacon

On Wed, Jul 26, 2017 at 11:30:32AM -0700, Paul E. McKenney wrote:
> The patch I posted reverts to synchronize_sched() in kernels booted with
> rcupdate.rcu_normal=1.  ;-)

So boot parameters are no solution and are only slightly better than
compile-time switches.

What if you have a machine that runs workloads that want both options?
Partitioning and containers are somewhat popular these days, and
system-wide tunables don't work for them.

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-27  1:45                                   ` Paul E. McKenney
@ 2017-07-27 12:39                                     ` Mathieu Desnoyers
  2017-07-27 14:44                                       ` Paul E. McKenney
  0 siblings, 1 reply; 68+ messages in thread
From: Mathieu Desnoyers @ 2017-07-27 12:39 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Peter Zijlstra, linux-kernel, Ingo Molnar, Lai Jiangshan,
	dipankar, Andrew Morton, Josh Triplett, Thomas Gleixner, rostedt,
	David Howells, Eric Dumazet, fweisbec, Oleg Nesterov,
	Will Deacon

----- On Jul 26, 2017, at 9:45 PM, Paul E. McKenney paulmck@linux.vnet.ibm.com wrote:

> On Wed, Jul 26, 2017 at 02:11:46PM -0700, Paul E. McKenney wrote:
>> On Wed, Jul 26, 2017 at 08:37:23PM +0000, Mathieu Desnoyers wrote:
>> > ----- On Jul 26, 2017, at 2:30 PM, Paul E. McKenney paulmck@linux.vnet.ibm.com
>> > wrote:
>> > 
>> > > On Wed, Jul 26, 2017 at 06:01:15PM +0000, Mathieu Desnoyers wrote:
>> > >> ----- On Jul 26, 2017, at 11:42 AM, Paul E. McKenney paulmck@linux.vnet.ibm.com
>> > >> wrote:
>> > >> 
>> > >> > On Wed, Jul 26, 2017 at 09:46:56AM +0200, Peter Zijlstra wrote:
>> > >> >> On Tue, Jul 25, 2017 at 10:50:13PM +0000, Mathieu Desnoyers wrote:
>> > >> >> > This would implement a MEMBARRIER_CMD_PRIVATE_EXPEDITED (or such) flag
>> > >> >> > for expedited process-local effect. This differs from the "SHARED" flag,
>> > >> >> > since the SHARED flag affects threads accessing memory mappings shared
>> > >> >> > across processes as well.
>> > >> >> > 
>> > >> >> > I wonder if we could create a MEMBARRIER_CMD_SHARED_EXPEDITED behavior
>> > >> >> > by iterating on all memory mappings mapped into the current process,
>> > >> >> > and build a cpumask based on the union of all mm masks encountered ?
>> > >> >> > Then we could send the IPI to all cpus belonging to that cpumask. Or
>> > >> >> > am I missing something obvious ?
>> > >> >> 
>> > >> >> I would readily object to such a beast. You far too quickly end up
>> > >> >> having to IPI everybody because of some stupid shared map or something
>> > >> >> (yes I know, normal DSOs are mapped private).
>> > >> > 
>> > >> > Agreed, we should keep things simple to start with.  The user can always
>> > >> > invoke sys_membarrier() from each process.
>> > >> 
>> > >> Another alternative for a MEMBARRIER_CMD_SHARED_EXPEDITED would be rate-limiting
>> > >> per thread. For instance, we could add a new "ulimit" that would bound the
>> > >> number of expedited membarrier per thread that can be done per millisecond,
>> > >> and switch to synchronize_sched() whenever a thread goes beyond that limit
>> > >> for the rest of the time-slot.
>> > >> 
>> > >> A RT system that really cares about not having userspace sending IPIs
>> > >> to all cpus could set the ulimit value to 0, which would always use
>> > >> synchronize_sched().
>> > >> 
>> > >> Thoughts ?
>> > > 
>> > > The patch I posted reverts to synchronize_sched() in kernels booted with
>> > > rcupdate.rcu_normal=1.  ;-)
>> > > 
>> > > But who is pushing for multiple-process sys_membarrier()?  Everyone I
>> > > have talked to is OK with it being local to the current process.
>> > 
>> > I guess I'm probably the guilty one intending to do weird stuff in userspace ;)
>> > 
>> > Here are my two use-cases:
>> > 
>> > * a new multi-process liburcu flavor, useful if e.g. a set of processes are
>> >   responsible for updating a shared memory data structure, and a separate set
>> >   of processes read that data structure. The readers can be killed without ill
>> >   effect on the other processes. The synchronization could be done by one
>> >   multi-process liburcu flavor per reader process "group".
>> > 
>> > * lttng-ust user-space ring buffers (shared across processes).
>> > 
>> > Both rely on a shared memory mapping for communication between processes, and
>> > I would like to be able to issue a sys_membarrier targeting all CPUs that may
>> > currently touch the shared memory mapping.
>> > 
>> > I don't really need a system-wide effect, but I would like to be able to target
>> > a shared memory mapping and efficiently do an expedited sys_membarrier on all
>> > cpus involved.
>> > 
>> > With lttng-ust, the shared buffers can span across 1000+ processes, so
>> > asking each process to issue sys_membarrier would add lots of unneeded overhead,
>> > because this would issue lots of needless memory barriers.
>> > 
>> > Thoughts ?
>> 
>> Dealing explicitly with 1000+ processes sounds like no picnic.  It instead
>> sounds like a job for synchronize_sched_expedited().  ;-)
> 
> Actually...
> 
> Mathieu, does your use case require unprivileged access to sys_membarrier()?

Unfortunately, yes, it does require sys_membarrier to be used from non-root
both for lttng-ust and liburcu multi-process. And as Peter pointed out, stuff
like containers complicates things even for the root case.

Thanks,

Mathieu

> 
> 							Thanx, Paul

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-27 10:14               ` Peter Zijlstra
@ 2017-07-27 12:56                 ` Paul E. McKenney
  2017-07-27 13:37                   ` Peter Zijlstra
  0 siblings, 1 reply; 68+ messages in thread
From: Paul E. McKenney @ 2017-07-27 12:56 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, mingo, jiangshanlai, dipankar, akpm,
	mathieu.desnoyers, josh, tglx, rostedt, dhowells, edumazet,
	fweisbec, oleg

On Thu, Jul 27, 2017 at 12:14:26PM +0200, Peter Zijlstra wrote:
> On Tue, Jul 25, 2017 at 12:36:12PM -0700, Paul E. McKenney wrote:
> > This horse is already out, so trying to shut the gate won't be effective.
> 
> So I'm not convinced it is. The mprotect() hack isn't portable as we've
> established and on x86 where it does work, it doesn't (much) perturb
> tasks not related to our process because we keep a tight mm_cpumask().

Wrong.  People are using it today, portable or not.  If we want them
to stop using it, we need to give them an alternative.  Period.

> And if there are other (unpriv.) means of spraying IPIs around, we
> should most certainly look at fixing those, not just shrug and make
> matters worse.

We need to keep an open mind for a bit.  If there was a trivial
solution, we would have implemented it back in 2010.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-27  8:30                           ` Peter Zijlstra
@ 2017-07-27 13:08                             ` Paul E. McKenney
  2017-07-27 13:49                               ` Peter Zijlstra
                                                 ` (2 more replies)
  0 siblings, 3 replies; 68+ messages in thread
From: Paul E. McKenney @ 2017-07-27 13:08 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, mingo, jiangshanlai, dipankar, akpm,
	mathieu.desnoyers, josh, tglx, rostedt, dhowells, edumazet,
	fweisbec, oleg, will.deacon

On Thu, Jul 27, 2017 at 10:30:03AM +0200, Peter Zijlstra wrote:
> On Wed, Jul 26, 2017 at 08:41:10AM -0700, Paul E. McKenney wrote:
> > On Wed, Jul 26, 2017 at 09:41:28AM +0200, Peter Zijlstra wrote:
> > > On Tue, Jul 25, 2017 at 04:59:36PM -0700, Paul E. McKenney wrote:
> 
> > > Sure, but SCHED_OTHER auto throttles in that if there's anything else to
> > > run, you get to wait. So you can't generate an IPI storm with it. Also,
> > > again, we can be limited to a subset of CPUs.
> > 
> > OK, what is its auto-throttle policy?  One round of IPIs per jiffy or
> > some such?
> 
> No. Its called wakeup latency :-) Your SCHED_OTHER task will not get to
> insta-run all the time. If there are other tasks already running, we'll
> not IPI unless it should preempt.
> 
> If its idle, nobody cares..

So it does IPI immediately sometimes.

> > Does this auto-throttling also apply if the user is running a CPU-bound
> > SCHED_BATCH or SCHED_IDLE task on each CPU, and periodically waking up
> > one of a large group of SCHED_OTHER tasks, where the SCHED_OTHER tasks
> > immediately sleep upon being awakened?
> 
> SCHED_BATCH is even more likely to suffer wakeup latency since it will
> never preempt anything.

Ahem.  In this scenario, SCHED_BATCH is already running on the CPU in
question, and a SCHED_OTHER task is awakened from some other CPU.

Do we IPI in that case?

> > OK, and the rq->curr assignment is in common code, correct?  Does this
> > allow the IPI-only-requesting-process approach to live entirely within
> > common code?
> 
> That is the idea.
> 
> > The 2010 email thread ended up with sys_membarrier() acquiring the
> > runqueue lock for each CPU,
> 
> Yes, that's something I'm not happy with. Machine wide banging of that
> lock will be a performance no-no.

Regardless of whether we need to acquire runqueue locks, send IPIs, or
perform whatever other distasteful operation, we should also be able to
throttle and batch operations to at least some extent.

> > because doing otherwise meant adding code to the scheduler fastpath.
> 
> And that's obviously another thing I'm not happy with either.

Nor should you or anyone be.

> > Don't we still need to do this?
> > 
> > https://marc.info/?l=linux-kernel&m=126341138408407&w=2
> > https://marc.info/?l=linux-kernel&m=126349766324224&w=2
> 
> I don't know.. those seem focussed on mm_cpumask() and we can't use that
> per Will's email.
> 
> So I think we need to think anew on this, start from the ground up.

Probably from several points in the ground, but OK...

> What is missing for this:
> 
> static void ipi_mb(void *info)
> {
> 	smp_mb(); // IPIs should be serializing but paranoid
> }
> 
> 
> sys_membarrier()
> {
> 	smp_mb(); // because sysenter isn't an unconditional mb
> 
> 	for_each_online_cpu(cpu) {
> 		struct task_struct *p;
> 
> 		rcu_read_lock();
> 		p = task_rcu_dereference(&cpu_curr(cpu));
> 		if (p && p->mm == current->mm)
> 			__cpumask_set_cpu(cpu, cpus);
> 		rcu_read_unlock();
> 	}
> 
> 	on_each_cpu_mask(cpus, ipi_mb, NULL, 1); // does local smp_mb() too
> }
> 
> VS
> 
> __schedule()
> {
> 	spin_lock(&rq->lock);
> 	smp_mb__after_spinlock(); // really full mb implied
> 
> 	/* lots */
> 
> 	if (likely(prev != next)) {
> 
> 		rq->curr = next;
> 
> 		context_switch() {
> 			switch_mm();
> 			switch_to();
> 			// neither need imply a barrier
> 
> 			spin_unlock(&rq->lock);
> 		}
> 	}
> }
> 
> 
> 
> 
> So I think we need either switch_mm() or switch_to() to imply a full
> barrier for this to work, otherwise we get:
> 
>   CPU0				CPU1
> 
> 
>   lock rq->lock
>   mb
> 
>   rq->curr = A
> 
>   unlock rq->lock
> 
>   lock rq->lock
>   mb
> 
> 				sys_membarrier()
> 
> 				mb
> 
> 				for_each_online_cpu()
> 				  p = A
> 				  // no match no IPI
> 
> 				mb
>   rq->curr = B
> 
>   unlock rq->lock
> 
> 
> And that's bad, because now CPU0 doesn't have an MB happening _after_
> sys_membarrier() if B matches.

Yes, this looks somewhat similar to the scenario that Mathieu pointed out
back in 2010: https://marc.info/?l=linux-kernel&m=126349766324224&w=2

> So without audit, I only know of PPC and Alpha not having a barrier in
> either switch_*().
> 
> x86 obviously has barriers all over the place, arm has a super duper
> heavy barrier in switch_to().

Agreed, if we are going to rely on ->mm, we need ordering on assignment
to it.

> One option might be to resurrect spin_unlock_wait(), although to use
> that here is really ugly too, but it would avoid thrashing the
> rq->lock.
> 
> I think it'd end up having to look like:
> 
> 	rq = cpu_rq(cpu);
> again:
> 	rcu_read_lock()
> 	p = task_rcu_dereference(&rq->curr);
> 	if (p) {
> 		raw_spin_unlock_wait(&rq->lock);
> 		q = task_rcu_dereference(&rq->curr);
> 		if (q != p) {
> 			rcu_read_unlock();
> 			goto again;
> 		}
> 	}
> 	...
> 
> which is just about as horrible as it looks.

It does indeed look a bit suboptimal.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-27  8:53                             ` Peter Zijlstra
  2017-07-27 10:09                               ` Peter Zijlstra
  2017-07-27 10:22                               ` Will Deacon
@ 2017-07-27 13:14                               ` Paul E. McKenney
  2 siblings, 0 replies; 68+ messages in thread
From: Paul E. McKenney @ 2017-07-27 13:14 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Mathieu Desnoyers, linux-kernel, Ingo Molnar, Lai Jiangshan,
	dipankar, Andrew Morton, Josh Triplett, Thomas Gleixner, rostedt,
	David Howells, Eric Dumazet, fweisbec, Oleg Nesterov,
	Will Deacon

On Thu, Jul 27, 2017 at 10:53:12AM +0200, Peter Zijlstra wrote:
> On Wed, Jul 26, 2017 at 06:01:15PM +0000, Mathieu Desnoyers wrote:
> 
> > Another alternative for a MEMBARRIER_CMD_SHARED_EXPEDITED would be rate-limiting
> > per thread. For instance, we could add a new "ulimit" that would bound the
> > number of expedited membarrier per thread that can be done per millisecond,
> > and switch to synchronize_sched() whenever a thread goes beyond that limit
> > for the rest of the time-slot.
> 
> You forgot to ask yourself how you could abuse this.. just spawn more
> threads.
> 
> Per-thread limits are nearly useless, because spawning new threads is
> cheap.

Agreed -- any per-thread limit has to be some portion/fraction of a
global limit.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-27 12:56                 ` Paul E. McKenney
@ 2017-07-27 13:37                   ` Peter Zijlstra
  2017-07-27 14:33                     ` Paul E. McKenney
  0 siblings, 1 reply; 68+ messages in thread
From: Peter Zijlstra @ 2017-07-27 13:37 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: linux-kernel, mingo, jiangshanlai, dipankar, akpm,
	mathieu.desnoyers, josh, tglx, rostedt, dhowells, edumazet,
	fweisbec, oleg

On Thu, Jul 27, 2017 at 05:56:59AM -0700, Paul E. McKenney wrote:
> On Thu, Jul 27, 2017 at 12:14:26PM +0200, Peter Zijlstra wrote:
> > On Tue, Jul 25, 2017 at 12:36:12PM -0700, Paul E. McKenney wrote:
> > > This horse is already out, so trying to shut the gate won't be effective.
> > 
> > So I'm not convinced it is. The mprotect() hack isn't portable as we've
> > established and on x86 where it does work, it doesn't (much) perturb
> > tasks not related to our process because we keep a tight mm_cpumask().
> 
> Wrong.  People are using it today, portable or not.  If we want them
> to stop using it, we need to give them an alternative.  Period.

What's wrong? The mprotect() hack isn't portable, nor does it perturb
other users much.

I would much rather they use this than your
synchronize_sched_expedited() thing. It's much better behaved. And
they'll run into pain the moment they start using ARM, PPC, S390, etc.

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-27 13:08                             ` Paul E. McKenney
@ 2017-07-27 13:49                               ` Peter Zijlstra
  2017-07-27 14:32                                 ` Paul E. McKenney
  2017-07-27 13:55                               ` Boqun Feng
  2017-07-27 13:56                               ` Peter Zijlstra
  2 siblings, 1 reply; 68+ messages in thread
From: Peter Zijlstra @ 2017-07-27 13:49 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: linux-kernel, mingo, jiangshanlai, dipankar, akpm,
	mathieu.desnoyers, josh, tglx, rostedt, dhowells, edumazet,
	fweisbec, oleg, will.deacon

On Thu, Jul 27, 2017 at 06:08:16AM -0700, Paul E. McKenney wrote:

> > No. Its called wakeup latency :-) Your SCHED_OTHER task will not get to
> > insta-run all the time. If there are other tasks already running, we'll
> > not IPI unless it should preempt.
> > 
> > If its idle, nobody cares..
> 
> So it does IPI immediately sometimes.
> 
> > > Does this auto-throttling also apply if the user is running a CPU-bound
> > > SCHED_BATCH or SCHED_IDLE task on each CPU, and periodically waking up
> > > one of a large group of SCHED_OTHER tasks, where the SCHED_OTHER tasks
> > > immediately sleep upon being awakened?
> > 
> > SCHED_BATCH is even more likely to suffer wakeup latency since it will
> > never preempt anything.
> 
> Ahem.  In this scenario, SCHED_BATCH is already running on the CPU in
> question, and a SCHED_OTHER task is awakened from some other CPU.
> 
> Do we IPI in that case.

So I'm a bit confused as to where you're trying to go with this.

I'm saying that if there are other users of our CPU, we can't
significantly disturb them with IPIs.

Yes, we'll sometimes IPI in order to do a preemption on wakeup. But we
cannot always win that. There is no wakeup triggered starvation case.

If you create a thread per CPU, have them sleep immediately after wakeup,
and then keep prodding them awake, they will disturb things less
than if they were while(1); loops.

If the machine is otherwise idle, nobody cares.

If there are other tasks on the system, the IPI rate is limited by the
wakeup latency of your tasks.


And any of this is limited to the CPUs we're allowed to run on in the
first place.


So yes, the occasional IPI happens, but if there's other tasks, we can't
disturb them more than we could with while(1); tasks.


OTOH, something like:

	while(1)
		synchronize_sched_expedited();

as per your proposed patch, will spray IPIs to all CPUs and at high
rates.
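
For completeness, the scenario Paul keeps asking about (a CPU-bound
SCHED_BATCH task parked on each CPU, with SCHED_OTHER wakees being prodded
from elsewhere) is straightforward to construct from userspace if someone
wants to measure the resulting IPI rate.  A sketch of the per-CPU hog half,
with illustrative names and minimal error handling:

/*
 * Sketch: pin a CPU-bound SCHED_BATCH hog on a given CPU, so that a
 * SCHED_OTHER wakee targeted at that CPU exercises the "do we IPI to
 * preempt?" question discussed above.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <string.h>

int run_batch_hog_on(int cpu)
{
	struct sched_param sp;
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(cpu, &set);
	if (sched_setaffinity(0, sizeof(set), &set))
		return -1;

	memset(&sp, 0, sizeof(sp));
	sp.sched_priority = 0;		/* SCHED_BATCH requires priority 0 */
	if (sched_setscheduler(0, SCHED_BATCH, &sp))
		return -1;

	for (;;)
		;			/* CPU-bound busy loop */
}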

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-27 13:08                             ` Paul E. McKenney
  2017-07-27 13:49                               ` Peter Zijlstra
@ 2017-07-27 13:55                               ` Boqun Feng
  2017-07-27 14:16                                 ` Paul E. McKenney
  2017-07-27 13:56                               ` Peter Zijlstra
  2 siblings, 1 reply; 68+ messages in thread
From: Boqun Feng @ 2017-07-27 13:55 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Peter Zijlstra, linux-kernel, mingo, jiangshanlai, dipankar,
	akpm, mathieu.desnoyers, josh, tglx, rostedt, dhowells, edumazet,
	fweisbec, oleg, will.deacon

[-- Attachment #1: Type: text/plain, Size: 1981 bytes --]

Hi Paul,

I have a side question out of curiosity:

How does synchronize_sched() work properly for sys_membarrier()?

sys_membarrier() requires every other CPU to do an smp_mb() before it
returns, and I know synchronize_sched() will wait until all CPUs running
a kernel thread do a context switch, which has an smp_mb(). However, I
believe sched flavor RCU treats a CPU running a user thread as a quiescent
state, so synchronize_sched() could return without that CPU doing a
context switch.

So why could we use synchronize_sched() for sys_membarrier()?

In particular, could the following happens?

	CPU 0:				CPU 1:
	=========================	==========================
	<in user space>			<in user space>
					{read Y}(reordered) <------------------------------+
	store Y;			                                                   |
					read X; --------------------------------------+    |
	sys_membarrier():		<timer interrupt>                             |    |
	  synchronize_sched();		  update_process_times(user): //user == true  |    |
	  				    rcu_check_callbacks(usr):                 |    |
					      if (user || ..) {                       |    |
					        rcu_sched_qs()                        |    |
						...                                   |    |
						<report quesient state in softirq>    |    |
					<return to user space>                        |    |
					read Y; --------------------------------------+----+
	store X;			                                              |
					{read X}(reordered) <-------------------------+

I assume the timer interrupt handler, which interrupts user space and
reports a quiescent state for sched flavor RCU, may not have an smp_mb()
in some code path.

I may be missing something subtle, but it is just not very obvious how
synchronize_sched() guarantees that a remote CPU running in userspace
does an smp_mb() before it returns; this is at least not in the RCU
requirements, right?

Regards,
Boqun

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-27 13:08                             ` Paul E. McKenney
  2017-07-27 13:49                               ` Peter Zijlstra
  2017-07-27 13:55                               ` Boqun Feng
@ 2017-07-27 13:56                               ` Peter Zijlstra
  2017-07-27 15:19                                 ` Peter Zijlstra
  2 siblings, 1 reply; 68+ messages in thread
From: Peter Zijlstra @ 2017-07-27 13:56 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: linux-kernel, mingo, jiangshanlai, dipankar, akpm,
	mathieu.desnoyers, josh, tglx, rostedt, dhowells, edumazet,
	fweisbec, oleg, will.deacon, Boqun Feng

On Thu, Jul 27, 2017 at 06:08:16AM -0700, Paul E. McKenney wrote:

> > So I think we need either switch_mm() or switch_to() to imply a full
> > barrier for this to work, otherwise we get:
> > 
> >   CPU0				CPU1
> > 
> > 
> >   lock rq->lock
> >   mb
> > 
> >   rq->curr = A
> > 
> >   unlock rq->lock
> > 
> >   lock rq->lock
> >   mb
> > 
> > 				sys_membarrier()
> > 
> > 				mb
> > 
> > 				for_each_online_cpu()
> > 				  p = A
> > 				  // no match no IPI
> > 
> > 				mb
> >   rq->curr = B
> > 
> >   unlock rq->lock
> > 
> > 
> > And that's bad, because now CPU0 doesn't have an MB happening _after_
> > sys_membarrier() if B matches.
> 
> Yes, this looks somewhat similar to the scenario that Mathieu pointed out
> back in 2010: https://marc.info/?l=linux-kernel&m=126349766324224&w=2

Yes. Minus the mm_cpumask() worries.

> > So without audit, I only know of PPC and Alpha not having a barrier in
> > either switch_*().
> > 
> > x86 obviously has barriers all over the place, arm has a super duper
> > heavy barrier in switch_to().
> 
> Agreed, if we are going to rely on ->mm, we need ordering on assignment
> to it.

Right, Boqun provided this reordering to show the problem:

  CPU0                                CPU1
 
 
  <in process X>
  lock rq->lock
  mb
 
  rq->curr = A
 
  unlock rq->lock
 
  <switch to process A>
 
  lock rq->lock
  mb
  read Y(reordered)<---+
                       |        store to Y
                       |
                       |        sys_membarrier()
                       |
                       |        mb
                       |
                       |        for_each_online_cpu()
                       |          p = A
                       |          // no match no IPI
                       |
                       |        mb
                       |
                       |        store to X
  rq->curr = B         |
                       |
  unlock rq->lock      |
  <switch to B>        |
  read X               |
                       |
  read Y --------------+

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-27 13:55                               ` Boqun Feng
@ 2017-07-27 14:16                                 ` Paul E. McKenney
  2017-07-27 14:29                                   ` Boqun Feng
  0 siblings, 1 reply; 68+ messages in thread
From: Paul E. McKenney @ 2017-07-27 14:16 UTC (permalink / raw)
  To: Boqun Feng
  Cc: Peter Zijlstra, linux-kernel, mingo, jiangshanlai, dipankar,
	akpm, mathieu.desnoyers, josh, tglx, rostedt, dhowells, edumazet,
	fweisbec, oleg, will.deacon

On Thu, Jul 27, 2017 at 09:55:51PM +0800, Boqun Feng wrote:
> Hi Paul,
> 
> I have a side question out of curiosity:
> 
> How does synchronize_sched() work properly for sys_membarrier()?
> 
> sys_membarrier() requires every other CPU does a smp_mb() before it
> returns, and I know synchronize_sched() will wait until all CPUs running
> a kernel thread do a context-switch, which has a smp_mb(). However, I
> believe sched flavor RCU treat CPU running a user thread as a quiescent
> state, so synchronize_sched() could return without that CPU does a
> context switch. 
> 
> So why could we use synchronize_sched() for sys_membarrier()?
> 
> In particular, could the following happens?
> 
> 	CPU 0:				CPU 1:
> 	=========================	==========================
> 	<in user space>			<in user space>
> 					{read Y}(reordered) <------------------------------+
> 	store Y;			                                                   |
> 					read X; --------------------------------------+    |
> 	sys_membarrier():		<timer interrupt>                             |    |
> 	  synchronize_sched();		  update_process_times(user): //user == true  |    |
> 	  				    rcu_check_callbacks(usr):                 |    |
> 					      if (user || ..) {                       |    |
> 					        rcu_sched_qs()                        |    |
> 						...                                   |    |
> 						<report quesient state in softirq>    |    |

The reporting of the quiescent state will acquire the leaf rcu_node
structure's lock, with an smp_mb__after_unlock_lock(), which will
one way or another be a full memory barrier.  So the reorderings
cannot happen.

Unless I am missing something subtle.  ;-)

						Thanx, Paul

> 					<return to user space>                        |    |
> 					read Y; --------------------------------------+----+
> 	store X;			                                              |
> 					{read X}(reordered) <-------------------------+
> 
> I assume the timer interrupt handler, which interrupts a user space and
> reports a quiescent state for sched flavor RCU, may not have a smp_mb()
> in some code path.
> 
> I may miss something subtle, but it just not very obvious how
> synchronize_sched() will guarantee a remote CPU running in userspace to
> do a smp_mb() before it returns, this is at least not in RCU
> requirements, right?
> 
> Regards,
> Boqun

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-27 14:16                                 ` Paul E. McKenney
@ 2017-07-27 14:29                                   ` Boqun Feng
  2017-07-27 14:36                                     ` Paul E. McKenney
  0 siblings, 1 reply; 68+ messages in thread
From: Boqun Feng @ 2017-07-27 14:29 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Peter Zijlstra, linux-kernel, mingo, jiangshanlai, dipankar,
	akpm, mathieu.desnoyers, josh, tglx, rostedt, dhowells, edumazet,
	fweisbec, oleg, will.deacon

[-- Attachment #1: Type: text/plain, Size: 3055 bytes --]

On Thu, Jul 27, 2017 at 07:16:33AM -0700, Paul E. McKenney wrote:
> On Thu, Jul 27, 2017 at 09:55:51PM +0800, Boqun Feng wrote:
> > Hi Paul,
> > 
> > I have a side question out of curiosity:
> > 
> > How does synchronize_sched() work properly for sys_membarrier()?
> > 
> > sys_membarrier() requires every other CPU does a smp_mb() before it
> > returns, and I know synchronize_sched() will wait until all CPUs running
> > a kernel thread do a context-switch, which has a smp_mb(). However, I
> > believe sched flavor RCU treat CPU running a user thread as a quiescent
> > state, so synchronize_sched() could return without that CPU does a
> > context switch. 
> > 
> > So why could we use synchronize_sched() for sys_membarrier()?
> > 
> > In particular, could the following happens?
> > 
> > 	CPU 0:				CPU 1:
> > 	=========================	==========================
> > 	<in user space>			<in user space>
> > 					{read Y}(reordered) <------------------------------+
> > 	store Y;			                                                   |
> > 					read X; --------------------------------------+    |
> > 	sys_membarrier():		<timer interrupt>                             |    |
> > 	  synchronize_sched();		  update_process_times(user): //user == true  |    |
> > 	  				    rcu_check_callbacks(usr):                 |    |
> > 					      if (user || ..) {                       |    |
> > 					        rcu_sched_qs()                        |    |
> > 						...                                   |    |
> > 						<report quesient state in softirq>    |    |
> 
> The reporting of the quiescent state will acquire the leaf rcu_node
> structure's lock, with an smp_mb__after_unlock_lock(), which will
> one way or another be a full memory barrier.  So the reorderings
> cannot happen.
> 
> Unless I am missing something subtle.  ;-)
> 

Well, smp_mb__after_unlock_lock() on ARM64 is a no-op, and ARM64's lock
doesn't provide an smp_mb().

So my point is more like: synchronize_sched() happens to work as a
sys_membarrier() because of some implementation detail, and if some day
we come up with a much cheaper way to implement sched flavor
RCU (hopefully!), synchronize_sched() may not be good for the job. So at
least, we'd better document this somewhere?

Regards,
Boqun

> 						Thanx, Paul
> 
> > 					<return to user space>                        |    |
> > 					read Y; --------------------------------------+----+
> > 	store X;			                                              |
> > 					{read X}(reordered) <-------------------------+
> > 
> > I assume the timer interrupt handler, which interrupts a user space and
> > reports a quiescent state for sched flavor RCU, may not have a smp_mb()
> > in some code path.
> > 
> > I may miss something subtle, but it just not very obvious how
> > synchronize_sched() will guarantee a remote CPU running in userspace to
> > do a smp_mb() before it returns, this is at least not in RCU
> > requirements, right?
> > 
> > Regards,
> > Boqun
> 
> 

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-27 13:49                               ` Peter Zijlstra
@ 2017-07-27 14:32                                 ` Paul E. McKenney
  2017-07-27 14:36                                   ` Peter Zijlstra
  0 siblings, 1 reply; 68+ messages in thread
From: Paul E. McKenney @ 2017-07-27 14:32 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, mingo, jiangshanlai, dipankar, akpm,
	mathieu.desnoyers, josh, tglx, rostedt, dhowells, edumazet,
	fweisbec, oleg, will.deacon

On Thu, Jul 27, 2017 at 03:49:08PM +0200, Peter Zijlstra wrote:
> On Thu, Jul 27, 2017 at 06:08:16AM -0700, Paul E. McKenney wrote:
> 
> > > No. Its called wakeup latency :-) Your SCHED_OTHER task will not get to
> > > insta-run all the time. If there are other tasks already running, we'll
> > > not IPI unless it should preempt.
> > > 
> > > If its idle, nobody cares..
> > 
> > So it does IPI immediately sometimes.
> > 
> > > > Does this auto-throttling also apply if the user is running a CPU-bound
> > > > SCHED_BATCH or SCHED_IDLE task on each CPU, and periodically waking up
> > > > one of a large group of SCHED_OTHER tasks, where the SCHED_OTHER tasks
> > > > immediately sleep upon being awakened?
> > > 
> > > SCHED_BATCH is even more likely to suffer wakeup latency since it will
> > > never preempt anything.
> > 
> > Ahem.  In this scenario, SCHED_BATCH is already running on a the CPU in
> > question, and a SCHED_OTHER task is awakened from some other CPU.
> > 
> > Do we IPI in that case.
> 
> So I'm a bit confused as to where you're trying to go with this.
> 
> I'm saying that if there are other users of our CPU, we can't
> significantly disturb them with IPIs.
> 
> Yes, we'll sometimes IPI in order to do a preemption on wakeup. But we
> cannot always win that. There is no wakeup triggered starvation case.
> 
> If you create a thread per CPU and have them insta sleep after wakeup,
> and then keep prodding them to wakeup. They will disturb things less
> than if they were while(1); loops.
> 
> If the machine is otherwise idle, nobody cares.
> 
> If there are other tasks on the system, the IPI rate is limited to the
> wakeup latency of your tasks.
> 
> 
> And any of this is limited to the CPUs we're allowed to run in the first
> place.
> 
> 
> So yes, the occasional IPI happens, but if there's other tasks, we can't
> disturb them more than we could with while(1); tasks.
> 
> 
> OTOH, something like:
> 
> 	while(1)
> 		synchronize_sched_expedited();
> 
> as per your proposed patch, will spray IPIs to all CPUs and at high
> rates.

OK, I have updated my patch to do throttling.

							Thanx, Paul

------------------------------------------------------------------------

commit 4cd5253094b6d7f9501e21e13aa4e2e78e8a70cd
Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Date:   Tue Jul 18 13:53:32 2017 -0700

    sys_membarrier: Add expedited option
    
    The sys_membarrier() system call has proven too slow for some use cases,
    which has prompted users to instead rely on TLB shootdown.  Although TLB
    shootdown is much faster, it has the slight disadvantage of not working
    at all on arm and arm64 and also of being vulnerable to reasonable
    optimizations that might skip some IPIs.  However, the Linux kernel
    does not currently provide a reasonable alternative, so it is hard to
    criticize these users for doing what works for them on a given piece
    of hardware at a given time.
    
    This commit therefore adds an expedited option to the sys_membarrier()
    system call, thus providing a faster mechanism that is portable and
    is not subject to death by optimization.  Note that if more than one
    MEMBARRIER_CMD_SHARED_EXPEDITED sys_membarrier() call happens within
    the same jiffy, all but the first will use synchronize_sched() instead
    of synchronize_sched_expedited().
    
    Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
    [ paulmck: Fix code style issue pointed out by Boqun Feng. ]
    Tested-by: Avi Kivity <avi@scylladb.com>
    Cc: Maged Michael <maged.michael@gmail.com>
    Cc: Andrew Hunter <ahh@google.com>
    Cc: Geoffrey Romer <gromer@google.com>

diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h
index e0b108bd2624..5720386d0904 100644
--- a/include/uapi/linux/membarrier.h
+++ b/include/uapi/linux/membarrier.h
@@ -40,6 +40,16 @@
  *                          (non-running threads are de facto in such a
  *                          state). This covers threads from all processes
  *                          running on the system. This command returns 0.
+ * @MEMBARRIER_CMD_SHARED_EXPEDITED:  Execute a memory barrier on all
+ *			    running threads, but in an expedited fashion.
+ *                          Upon return from system call, the caller thread
+ *                          is ensured that all running threads have passed
+ *                          through a state where all memory accesses to
+ *                          user-space addresses match program order between
+ *                          entry to and return from the system call
+ *                          (non-running threads are de facto in such a
+ *                          state). This covers threads from all processes
+ *                          running on the system. This command returns 0.
  *
  * Command to be passed to the membarrier system call. The commands need to
  * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to
@@ -48,6 +58,7 @@
 enum membarrier_cmd {
 	MEMBARRIER_CMD_QUERY = 0,
 	MEMBARRIER_CMD_SHARED = (1 << 0),
+	MEMBARRIER_CMD_SHARED_EXPEDITED = (1 << 1),
 };
 
 #endif /* _UAPI_LINUX_MEMBARRIER_H */
diff --git a/kernel/membarrier.c b/kernel/membarrier.c
index 9f9284f37f8d..587e3bbfae7e 100644
--- a/kernel/membarrier.c
+++ b/kernel/membarrier.c
@@ -22,7 +22,8 @@
  * Bitmask made from a "or" of all commands within enum membarrier_cmd,
  * except MEMBARRIER_CMD_QUERY.
  */
-#define MEMBARRIER_CMD_BITMASK	(MEMBARRIER_CMD_SHARED)
+#define MEMBARRIER_CMD_BITMASK	(MEMBARRIER_CMD_SHARED |		\
+				 MEMBARRIER_CMD_SHARED_EXPEDITED)
 
 /**
  * sys_membarrier - issue memory barriers on a set of threads
@@ -64,6 +65,20 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
 		if (num_online_cpus() > 1)
 			synchronize_sched();
 		return 0;
+	case MEMBARRIER_CMD_SHARED_EXPEDITED:
+		if (num_online_cpus() > 1) {
+			static unsigned long lastexp;
+			unsigned long j;
+
+			j = jiffies;
+			if (READ_ONCE(lastexp) == j) {
+				synchronize_sched();
+			} else {
+				WRITE_ONCE(lastexp, j);
+				synchronize_sched_expedited();
+			}
+		}
+		return 0;
 	default:
 		return -EINVAL;
 	}
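
Usage from userspace, for anyone wanting to experiment with the patch above:
query the supported commands first and fall back to the existing
MEMBARRIER_CMD_SHARED when the expedited command is absent.  The expedited
command's value, (1 << 1), is taken from the hunk above and exists only on a
kernel carrying this patch.

#define _GNU_SOURCE
#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Proposed in the patch above; not in any released uapi header. */
#define MEMBARRIER_CMD_SHARED_EXPEDITED_PROPOSED	(1 << 1)

static long membarrier(int cmd, int flags)
{
	return syscall(__NR_membarrier, cmd, flags);
}

/* System-wide memory barrier, expedited when the kernel offers it.
 * MEMBARRIER_CMD_QUERY returns a bitmask of the supported commands. */
long membarrier_shared_best_effort(void)
{
	long supported = membarrier(MEMBARRIER_CMD_QUERY, 0);

	if (supported > 0 &&
	    (supported & MEMBARRIER_CMD_SHARED_EXPEDITED_PROPOSED))
		return membarrier(MEMBARRIER_CMD_SHARED_EXPEDITED_PROPOSED, 0);

	return membarrier(MEMBARRIER_CMD_SHARED, 0);
}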

^ permalink raw reply related	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-27 13:37                   ` Peter Zijlstra
@ 2017-07-27 14:33                     ` Paul E. McKenney
  0 siblings, 0 replies; 68+ messages in thread
From: Paul E. McKenney @ 2017-07-27 14:33 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, mingo, jiangshanlai, dipankar, akpm,
	mathieu.desnoyers, josh, tglx, rostedt, dhowells, edumazet,
	fweisbec, oleg

On Thu, Jul 27, 2017 at 03:37:29PM +0200, Peter Zijlstra wrote:
> On Thu, Jul 27, 2017 at 05:56:59AM -0700, Paul E. McKenney wrote:
> > On Thu, Jul 27, 2017 at 12:14:26PM +0200, Peter Zijlstra wrote:
> > > On Tue, Jul 25, 2017 at 12:36:12PM -0700, Paul E. McKenney wrote:
> > > > This horse is already out, so trying to shut the gate won't be effective.
> > > 
> > > So I'm not convinced it is. The mprotect() hack isn't portable as we've
> > > established and on x86 where it does work, it doesn't (much) perturb
> > > tasks not related to our process because we keep a tight mm_cpumask().
> > 
> > Wrong.  People are using it today, portable or not.  If we want them
> > to stop using it, we need to give them an alternative.  Period.
> 
> What's wrong? The mprotect() hack isn't portable, nor does it perturb
> other users much.
> 
> I would much rather they use this than your
> synchronize_sched_expedited() thing. Its much better behaved. And
> they'll run into pain the moment they start using ARM,PPC,S390,etc..

What is wrong is that we currently don't provide them a reasonable
alternative.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-27 14:32                                 ` Paul E. McKenney
@ 2017-07-27 14:36                                   ` Peter Zijlstra
  2017-07-27 14:46                                     ` Paul E. McKenney
  0 siblings, 1 reply; 68+ messages in thread
From: Peter Zijlstra @ 2017-07-27 14:36 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: linux-kernel, mingo, jiangshanlai, dipankar, akpm,
	mathieu.desnoyers, josh, tglx, rostedt, dhowells, edumazet,
	fweisbec, oleg, will.deacon

On Thu, Jul 27, 2017 at 07:32:05AM -0700, Paul E. McKenney wrote:
> > as per your proposed patch, will spray IPIs to all CPUs and at high
> > rates.
> 
> OK, I have updated my patch to do throttling.

But it does not respect cpusets, which is the other important point.

The scheduler based IPIs are limited to where we are allowed to place
tasks (as are the TLB ones for that matter, for the exact same reason).

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-27 14:29                                   ` Boqun Feng
@ 2017-07-27 14:36                                     ` Paul E. McKenney
  2017-07-27 14:41                                       ` Will Deacon
  2017-07-27 14:47                                       ` Boqun Feng
  0 siblings, 2 replies; 68+ messages in thread
From: Paul E. McKenney @ 2017-07-27 14:36 UTC (permalink / raw)
  To: Boqun Feng
  Cc: Peter Zijlstra, linux-kernel, mingo, jiangshanlai, dipankar,
	akpm, mathieu.desnoyers, josh, tglx, rostedt, dhowells, edumazet,
	fweisbec, oleg, will.deacon

On Thu, Jul 27, 2017 at 10:29:55PM +0800, Boqun Feng wrote:
> On Thu, Jul 27, 2017 at 07:16:33AM -0700, Paul E. McKenney wrote:
> > On Thu, Jul 27, 2017 at 09:55:51PM +0800, Boqun Feng wrote:
> > > Hi Paul,
> > > 
> > > I have a side question out of curiosity:
> > > 
> > > How does synchronize_sched() work properly for sys_membarrier()?
> > > 
> > > sys_membarrier() requires every other CPU does a smp_mb() before it
> > > returns, and I know synchronize_sched() will wait until all CPUs running
> > > a kernel thread do a context-switch, which has a smp_mb(). However, I
> > > believe sched flavor RCU treat CPU running a user thread as a quiescent
> > > state, so synchronize_sched() could return without that CPU does a
> > > context switch. 
> > > 
> > > So why could we use synchronize_sched() for sys_membarrier()?
> > > 
> > > In particular, could the following happens?
> > > 
> > > 	CPU 0:				CPU 1:
> > > 	=========================	==========================
> > > 	<in user space>			<in user space>
> > > 					{read Y}(reordered) <------------------------------+
> > > 	store Y;			                                                   |
> > > 					read X; --------------------------------------+    |
> > > 	sys_membarrier():		<timer interrupt>                             |    |
> > > 	  synchronize_sched();		  update_process_times(user): //user == true  |    |
> > > 	  				    rcu_check_callbacks(usr):                 |    |
> > > 					      if (user || ..) {                       |    |
> > > 					        rcu_sched_qs()                        |    |
> > > 						...                                   |    |
> > > 						<report quesient state in softirq>    |    |
> > 
> > The reporting of the quiescent state will acquire the leaf rcu_node
> > structure's lock, with an smp_mb__after_unlock_lock(), which will
> > one way or another be a full memory barrier.  So the reorderings
> > cannot happen.
> > 
> > Unless I am missing something subtle.  ;-)
> > 
> 
> Well, smp_mb__after_unlock_lock() in ARM64 is a no-op, and ARM64's lock
> doesn't provide a smp_mb().
> 
> So my point is more like: synchronize_sched() happens to be a
> sys_membarrier() because of some implementation detail, and if some day
> we come up with a much cheaper way to implement sched-flavor
> RCU (hopefully!), synchronize_sched() may not be good for the job. So at
> least, we'd better document this somewhere?

Last I heard, ARM's unlock/lock acted as a full barrier.  Will?

Please see the synchronize_sched() comment header for the documentation
you are asking for.  And the "Memory-Barrier Guarantees" section of
Documentation/RCU/Design/Requirements/Requirements.html.
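
To make the pattern under discussion concrete, the quiescent-state
report roughly does the following (a sketch only; rnp and flags are
assumed from the RCU core code):

	raw_spin_lock_irqsave(&rnp->lock, flags);
	smp_mb__after_unlock_lock(); /* intended to make unlock+lock a full barrier */
	/* ... record and propagate the quiescent state ... */
	raw_spin_unlock_irqrestore(&rnp->lock, flags);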

							Thanx, Paul

> Regards,
> Boqun
> 
> > 						Thanx, Paul
> > 
> > > 					<return to user space>                        |    |
> > > 					read Y; --------------------------------------+----+
> > > 	store X;			                                              |
> > > 					{read X}(reordered) <-------------------------+
> > > 
> > > I assume the timer interrupt handler, which interrupts user space and
> > > reports a quiescent state for sched-flavor RCU, may not have a smp_mb()
> > > in some code path.
> > > 
> > > I may be missing something subtle, but it is just not very obvious how
> > > synchronize_sched() will guarantee that a remote CPU running in userspace
> > > does a smp_mb() before it returns; this is at least not in the RCU
> > > requirements, right?
> > > 
> > > Regards,
> > > Boqun
> > 
> > 

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-27 14:36                                     ` Paul E. McKenney
@ 2017-07-27 14:41                                       ` Will Deacon
  2017-07-27 14:47                                       ` Boqun Feng
  1 sibling, 0 replies; 68+ messages in thread
From: Will Deacon @ 2017-07-27 14:41 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Boqun Feng, Peter Zijlstra, linux-kernel, mingo, jiangshanlai,
	dipankar, akpm, mathieu.desnoyers, josh, tglx, rostedt, dhowells,
	edumazet, fweisbec, oleg

On Thu, Jul 27, 2017 at 07:36:58AM -0700, Paul E. McKenney wrote:
> On Thu, Jul 27, 2017 at 10:29:55PM +0800, Boqun Feng wrote:
> > On Thu, Jul 27, 2017 at 07:16:33AM -0700, Paul E. McKenney wrote:
> > > On Thu, Jul 27, 2017 at 09:55:51PM +0800, Boqun Feng wrote:
> > > > Hi Paul,
> > > > 
> > > > I have a side question out of curiosity:
> > > > 
> > > > How does synchronize_sched() work properly for sys_membarrier()?
> > > > 
> > > > sys_membarrier() requires that every other CPU do a smp_mb() before it
> > > > returns, and I know synchronize_sched() will wait until all CPUs running
> > > > a kernel thread do a context switch, which has a smp_mb(). However, I
> > > > believe sched-flavor RCU treats a CPU running a user thread as a quiescent
> > > > state, so synchronize_sched() could return without that CPU doing a
> > > > context switch.
> > > > 
> > > > So why can we use synchronize_sched() for sys_membarrier()?
> > > > 
> > > > In particular, could the following happen?
> > > > 
> > > > 	CPU 0:				CPU 1:
> > > > 	=========================	==========================
> > > > 	<in user space>			<in user space>
> > > > 					{read Y}(reordered) <------------------------------+
> > > > 	store Y;			                                                   |
> > > > 					read X; --------------------------------------+    |
> > > > 	sys_membarrier():		<timer interrupt>                             |    |
> > > > 	  synchronize_sched();		  update_process_times(user): //user == true  |    |
> > > > 	  				    rcu_check_callbacks(usr):                 |    |
> > > > 					      if (user || ..) {                       |    |
> > > > 					        rcu_sched_qs()                        |    |
> > > > 						...                                   |    |
> > > > 						<report quesient state in softirq>    |    |
> > > 
> > > The reporting of the quiescent state will acquire the leaf rcu_node
> > > structure's lock, with an smp_mb__after_unlock_lock(), which will
> > > one way or another be a full memory barrier.  So the reorderings
> > > cannot happen.
> > > 
> > > Unless I am missing something subtle.  ;-)
> > > 
> > 
> > Well, smp_mb__after_unlock_lock() in ARM64 is a no-op, and ARM64's lock
> > doesn't provide a smp_mb().
> > 
> > So my point is more like: synchronize_sched() happens to be a
> > sys_membarrier() because of some implementation detail, and if some day
> > we come up with a much cheaper way to implement sched-flavor
> > RCU (hopefully!), synchronize_sched() may not be good for the job. So at
> > least, we'd better document this somewhere?
> 
> Last I heard, ARM's unlock/lock acted as a full barrier.  Will?

Yeah, should do. unlock is release, lock is acquire and we're RCsc.

Will

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-27 12:39                                     ` Mathieu Desnoyers
@ 2017-07-27 14:44                                       ` Paul E. McKenney
  0 siblings, 0 replies; 68+ messages in thread
From: Paul E. McKenney @ 2017-07-27 14:44 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Peter Zijlstra, linux-kernel, Ingo Molnar, Lai Jiangshan,
	dipankar, Andrew Morton, Josh Triplett, Thomas Gleixner, rostedt,
	David Howells, Eric Dumazet, fweisbec, Oleg Nesterov,
	Will Deacon

On Thu, Jul 27, 2017 at 12:39:36PM +0000, Mathieu Desnoyers wrote:
> ----- On Jul 26, 2017, at 9:45 PM, Paul E. McKenney paulmck@linux.vnet.ibm.com wrote:
> 
> > On Wed, Jul 26, 2017 at 02:11:46PM -0700, Paul E. McKenney wrote:
> >> On Wed, Jul 26, 2017 at 08:37:23PM +0000, Mathieu Desnoyers wrote:
> >> > ----- On Jul 26, 2017, at 2:30 PM, Paul E. McKenney paulmck@linux.vnet.ibm.com
> >> > wrote:
> >> > 
> >> > > On Wed, Jul 26, 2017 at 06:01:15PM +0000, Mathieu Desnoyers wrote:
> >> > >> ----- On Jul 26, 2017, at 11:42 AM, Paul E. McKenney paulmck@linux.vnet.ibm.com
> >> > >> wrote:
> >> > >> 
> >> > >> > On Wed, Jul 26, 2017 at 09:46:56AM +0200, Peter Zijlstra wrote:
> >> > >> >> On Tue, Jul 25, 2017 at 10:50:13PM +0000, Mathieu Desnoyers wrote:
> >> > >> >> > This would implement a MEMBARRIER_CMD_PRIVATE_EXPEDITED (or such) flag
> >> > >> >> > for expedited process-local effect. This differs from the "SHARED" flag,
> >> > >> >> > since the SHARED flag affects threads accessing memory mappings shared
> >> > >> >> > across processes as well.
> >> > >> >> > 
> >> > >> >> > I wonder if we could create a MEMBARRIER_CMD_SHARED_EXPEDITED behavior
> >> > >> >> > by iterating on all memory mappings mapped into the current process,
> >> > >> >> > and build a cpumask based on the union of all mm masks encountered ?
> >> > >> >> > Then we could send the IPI to all cpus belonging to that cpumask. Or
> >> > >> >> > am I missing something obvious ?
> >> > >> >> 
> >> > >> >> I would readily object to such a beast. You far too quickly end up
> >> > >> >> having to IPI everybody because of some stupid shared map or something
> >> > >> >> (yes I know, normal DSOs are mapped private).
> >> > >> > 
> >> > >> > Agreed, we should keep things simple to start with.  The user can always
> >> > >> > invoke sys_membarrier() from each process.
> >> > >> 
> >> > >> Another alternative for a MEMBARRIER_CMD_SHARED_EXPEDITED would be rate-limiting
> >> > >> per thread. For instance, we could add a new "ulimit" that would bound the
> >> > >> number of expedited membarrier calls a thread can do per millisecond,
> >> > >> and switch to synchronize_sched() whenever a thread goes beyond that limit
> >> > >> for the rest of the time slot.
> >> > >> 
> >> > >> An RT system that really cares about not having userspace sending IPIs
> >> > >> to all CPUs could set the ulimit value to 0, which would always use
> >> > >> synchronize_sched().
> >> > >> 
> >> > >> Thoughts ?
> >> > > 
> >> > > The patch I posted reverts to synchronize_sched() in kernels booted with
> >> > > rcupdate.rcu_normal=1.  ;-)
> >> > > 
> >> > > But who is pushing for multiple-process sys_membarrier()?  Everyone I
> >> > > have talked to is OK with it being local to the current process.
> >> > 
> >> > I guess I'm probably the guilty one intending to do weird stuff in userspace ;)
> >> > 
> >> > Here are my two use-cases:
> >> > 
> >> > * a new multi-process liburcu flavor, useful if e.g. a set of processes are
> >> >   responsible for updating a shared memory data structure, and a separate set
> >> >   of processes read that data structure. The readers can be killed without ill
> >> >   effect on the other processes. The synchronization could be done by one
> >> >   multi-process liburcu flavor per reader process "group".
> >> > 
> >> > * lttng-ust user-space ring buffers (shared across processes).
> >> > 
> >> > Both rely on a shared memory mapping for communication between processes, and
> >> > I would like to be able to issue a sys_membarrier targeting all CPUs that may
> >> > currently touch the shared memory mapping.
> >> > 
> >> > I don't really need a system-wide effect, but I would like to be able to target
> >> > a shared memory mapping and efficiently do an expedited sys_membarrier on all
> >> > cpus involved.
> >> > 
> >> > With lttng-ust, the shared buffers can span across 1000+ processes, so
> >> > asking each process to issue sys_membarrier would add lots of unneeded overhead,
> >> > because this would issue lots of needless memory barriers.
> >> > 
> >> > Thoughts ?
> >> 
> >> Dealing explicitly with 1000+ processes sounds like no picnic.  It instead
> >> sounds like a job for synchronize_sched_expedited().  ;-)
> > 
> > Actually...
> > 
> > Mathieu, does your use case require unprivileged access to sys_membarrier()?
> 
> Unfortunately, yes, it does require sys_membarrier to be used from non-root
> both for lttng-ust and liburcu multi-process. And as Peter pointed out, stuff
> like containers complicates things even for the root case.

Hey, I was hoping!  ;-)

							Thanx, Paul

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-27 14:36                                   ` Peter Zijlstra
@ 2017-07-27 14:46                                     ` Paul E. McKenney
  0 siblings, 0 replies; 68+ messages in thread
From: Paul E. McKenney @ 2017-07-27 14:46 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, mingo, jiangshanlai, dipankar, akpm,
	mathieu.desnoyers, josh, tglx, rostedt, dhowells, edumazet,
	fweisbec, oleg, will.deacon

On Thu, Jul 27, 2017 at 04:36:24PM +0200, Peter Zijlstra wrote:
> On Thu, Jul 27, 2017 at 07:32:05AM -0700, Paul E. McKenney wrote:
> > > as per your proposed patch, will spray IPIs to all CPUs and at high
> > > rates.
> > 
> > OK, I have updated my patch to do throttling.
> 
> But not respect cpusets. Which is the other important point.
> 
> The scheduler-based IPIs are limited to where we are allowed to place
> tasks (as are the TLB ones for that matter, for the exact same reason).

Yes, these are disadvantages, no argument there.  But I will nevertheless
be carrying some variant of this patch until something better exists,
including that something being tested and found satisfactory by the
various people asking for this.
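
For illustration only (not the posted patch), the throttling mentioned
above could amount to falling back to a normal grace period once
expedited requests exceed some rate; the function and ratelimit names
below are hypothetical:

	#include <linux/ratelimit.h>

	static void membarrier_throttled_sync(void)
	{
		static DEFINE_RATELIMIT_STATE(rs, HZ, 10);	/* at most 10 per second */

		if (__ratelimit(&rs))
			synchronize_sched_expedited();	/* within the allowed rate */
		else
			synchronize_sched();	/* throttled: normal grace period */
	}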

							Thanx, Paul

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-27 14:36                                     ` Paul E. McKenney
  2017-07-27 14:41                                       ` Will Deacon
@ 2017-07-27 14:47                                       ` Boqun Feng
  2017-07-27 14:55                                         ` Paul E. McKenney
  1 sibling, 1 reply; 68+ messages in thread
From: Boqun Feng @ 2017-07-27 14:47 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Peter Zijlstra, linux-kernel, mingo, jiangshanlai, dipankar,
	akpm, mathieu.desnoyers, josh, tglx, rostedt, dhowells, edumazet,
	fweisbec, oleg, will.deacon

On Thu, Jul 27, 2017 at 07:36:58AM -0700, Paul E. McKenney wrote:
> > > 
> > > The reporting of the quiescent state will acquire the leaf rcu_node
> > > structure's lock, with an smp_mb__after_unlock_lock(), which will
> > > one way or another be a full memory barrier.  So the reorderings
> > > cannot happen.
> > > 
> > > Unless I am missing something subtle.  ;-)
> > > 
> > 
> > Well, smp_mb__after_unlock_lock() in ARM64 is a no-op, and ARM64's lock
> > doesn't provide a smp_mb().
> > 
> > So my point is more like: synchronize_sched() happens to be a
> > sys_membarrier() because of some implementation detail, and if some day
> > we come up with a much cheaper way to implement sched-flavor
> > RCU (hopefully!), synchronize_sched() may not be good for the job. So at
> > least, we'd better document this somewhere?
> 
> Last I heard, ARM's unlock/lock acted as a full barrier.  Will?
> 
> Please see the synchronize_sched() comment header for the documentation
> you are asking for.  And the "Memory-Barrier Guarantees" section of
> Documentation/RCU/Design/Requirements/Requirements.html.
> 

All those barrier guarantees are subject to an RCU read-side critical
section paired with a synchronize_*(), IIRC, for example:

 * On systems with more than one CPU, when synchronize_sched() returns,
 * each CPU is guaranteed to have executed a full memory barrier since the
 * end of its last RCU-sched read-side critical section whose beginning
 * preceded the call to synchronize_sched().  In addition, each CPU having

, which is not the case for a quiescent state without a read-side
critical section (i.e., a non-context-switch quiescent state for the
sched flavor).

I've read those requirements and could not find one explaining why a
full barrier will be executed on a CPU that was interrupted while
running a user-space program.

Regards,
Boqun

> 							Thanx, Paul
> 
> > Regards,
> > Boqun
> > 
> > > 						Thanx, Paul
> > > 
> > > > 					<return to user space>                        |    |
> > > > 					read Y; --------------------------------------+----+
> > > > 	store X;			                                              |
> > > > 					{read X}(reordered) <-------------------------+
> > > > 
> > > > I assume the timer interrupt handler, which interrupts user space and
> > > > reports a quiescent state for sched-flavor RCU, may not have a smp_mb()
> > > > in some code path.
> > > > 
> > > > I may be missing something subtle, but it is just not very obvious how
> > > > synchronize_sched() will guarantee that a remote CPU running in userspace
> > > > does a smp_mb() before it returns; this is at least not in the RCU
> > > > requirements, right?
> > > > 
> > > > Regards,
> > > > Boqun
> > > 
> > > 
> 
> 

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-27 10:24                               ` Peter Zijlstra
@ 2017-07-27 14:52                                 ` Paul E. McKenney
  0 siblings, 0 replies; 68+ messages in thread
From: Paul E. McKenney @ 2017-07-27 14:52 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Mathieu Desnoyers, linux-kernel, Ingo Molnar, Lai Jiangshan,
	dipankar, Andrew Morton, Josh Triplett, Thomas Gleixner, rostedt,
	David Howells, Eric Dumazet, fweisbec, Oleg Nesterov,
	Will Deacon

On Thu, Jul 27, 2017 at 12:24:22PM +0200, Peter Zijlstra wrote:
> On Wed, Jul 26, 2017 at 11:30:32AM -0700, Paul E. McKenney wrote:
> > The patch I posted reverts to synchronize_sched() in kernels booted with
> > rcupdate.rcu_normal=1.  ;-)
> 
> So boot parameters are no solution and are only slightly better than
> compile time switches.
> 
> What if you have a machine that runs workloads that want both options?
> partitioning and containers are somewhat popular these days and system
> wide tunables don't work for them.

Agreed, these are disadvantages.  And again, I will nevertheless be
carrying some variant of this patch until something better is in place,
where that something has been shown to satisfy the needs of the people
requesting this feature.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-27 14:47                                       ` Boqun Feng
@ 2017-07-27 14:55                                         ` Paul E. McKenney
  0 siblings, 0 replies; 68+ messages in thread
From: Paul E. McKenney @ 2017-07-27 14:55 UTC (permalink / raw)
  To: Boqun Feng
  Cc: Peter Zijlstra, linux-kernel, mingo, jiangshanlai, dipankar,
	akpm, mathieu.desnoyers, josh, tglx, rostedt, dhowells, edumazet,
	fweisbec, oleg, will.deacon

On Thu, Jul 27, 2017 at 10:47:03PM +0800, Boqun Feng wrote:
> On Thu, Jul 27, 2017 at 07:36:58AM -0700, Paul E. McKenney wrote:
> > > > 
> > > > The reporting of the quiescent state will acquire the leaf rcu_node
> > > > structure's lock, with an smp_mb__after_unlock_lock(), which will
> > > > one way or another be a full memory barrier.  So the reorderings
> > > > cannot happen.
> > > > 
> > > > Unless I am missing something subtle.  ;-)
> > > > 
> > > 
> > > Well, smp_mb__after_unlock_lock() in ARM64 is a no-op, and ARM64's lock
> > > doesn't provide a smp_mb().
> > > 
> > > So my point is more like: synchronize_sched() happens to be a
> > > sys_membarrier() because of some implementation detail, and if some day
> > > we come up with a much cheaper way to implement sched-flavor
> > > RCU (hopefully!), synchronize_sched() may not be good for the job. So at
> > > least, we'd better document this somewhere?
> > 
> > Last I heard, ARM's unlock/lock acted as a full barrier.  Will?
> > 
> > Please see the synchronize_sched() comment header for the documentation
> > you are asking for.  And the "Memory-Barrier Guarantees" section of
> > Documentation/RCU/Design/Requirements/Requirements.html.
> > 
> 
> All those barrier guarantees are subject to an RCU read-side critical
> section paired with a synchronize_*(), IIRC, for example:
> 
>  * On systems with more than one CPU, when synchronize_sched() returns,
>  * each CPU is guaranteed to have executed a full memory barrier since the
>  * end of its last RCU-sched read-side critical section whose beginning
>  * preceded the call to synchronize_sched().  In addition, each CPU having
> 
> , which is not the case for a quiescent state without a read-side
> critical section (i.e., a non-context-switch quiescent state for the
> sched flavor).
> 
> I've read those requirements and could not find one explaining why a
> full barrier will be executed on a CPU that was interrupted while
> running a user-space program.

What you are forgetting is that for synchronize_sched(), any region of
code with preemption disabled is an RCU-sched read-side critical section.
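
In other words (a minimal sketch, nothing more):

	preempt_disable();
	/*
	 * This region is an RCU-sched read-side critical section, so a
	 * concurrent synchronize_sched() must wait for it and is fully
	 * ordered against it.
	 */
	preempt_enable();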

							Thanx, Paul

> Regards,
> Boqun
> 
> > 							Thanx, Paul
> > 
> > > Regards,
> > > Boqun
> > > 
> > > > 						Thanx, Paul
> > > > 
> > > > > 					<return to user space>                        |    |
> > > > > 					read Y; --------------------------------------+----+
> > > > > 	store X;			                                              |
> > > > > 					{read X}(reordered) <-------------------------+
> > > > > 
> > > > > I assume the timer interrupt handler, which interrupts user space and
> > > > > reports a quiescent state for sched-flavor RCU, may not have a smp_mb()
> > > > > in some code path.
> > > > > 
> > > > > I may be missing something subtle, but it is just not very obvious how
> > > > > synchronize_sched() will guarantee that a remote CPU running in userspace
> > > > > does a smp_mb() before it returns; this is at least not in the RCU
> > > > > requirements, right?
> > > > > 
> > > > > Regards,
> > > > > Boqun
> > > > 
> > > > 
> > 
> > 

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option
  2017-07-27 13:56                               ` Peter Zijlstra
@ 2017-07-27 15:19                                 ` Peter Zijlstra
  0 siblings, 0 replies; 68+ messages in thread
From: Peter Zijlstra @ 2017-07-27 15:19 UTC (permalink / raw)
  To: Paul E. McKenney, Nicholas Piggin
  Cc: linux-kernel, mingo, jiangshanlai, dipankar, akpm,
	mathieu.desnoyers, josh, tglx, rostedt, dhowells, edumazet,
	fweisbec, oleg, will.deacon, Boqun Feng


Hi Nick,

See below,

On Thu, Jul 27, 2017 at 03:56:10PM +0200, Peter Zijlstra wrote:
> On Thu, Jul 27, 2017 at 06:08:16AM -0700, Paul E. McKenney wrote:
> 
> > > So I think we need either switch_mm() or switch_to() to imply a full
> > > barrier for this to work, otherwise we get:
> > > 
> > >   CPU0				CPU1
> > > 
> > > 
> > >   lock rq->lock
> > >   mb
> > > 
> > >   rq->curr = A
> > > 
> > >   unlock rq->lock
> > > 
> > >   lock rq->lock
> > >   mb
> > > 
> > > 				sys_membarrier()
> > > 
> > > 				mb
> > > 
> > > 				for_each_online_cpu()
> > > 				  p = A
> > > 				  // no match no IPI
> > > 
> > > 				mb
> > >   rq->curr = B
> > > 
> > >   unlock rq->lock
> > > 
> > > 
> > > And that's bad, because now CPU0 doesn't have an MB happening _after_
> > > sys_membarrier() if B matches.
> > 
> > Yes, this looks somewhat similar to the scenario that Mathieu pointed out
> > back in 2010: https://marc.info/?l=linux-kernel&m=126349766324224&w=2
> 
> Yes. Minus the mm_cpumask() worries.
> 
> > > So without audit, I only know of PPC and Alpha not having a barrier in
> > > either switch_*().
> > > 
> > > x86 obviously has barriers all over the place, arm has a super duper
> > > heavy barrier in switch_to().
> > 
> > Agreed, if we are going to rely on ->mm, we need ordering on assignment
> > to it.
> 
> Right, Boqun provided this reordering to show the problem:
> 
>   CPU0                                CPU1
>  
>  
>   <in process X>
>   lock rq->lock
>   mb
>  
>   rq->curr = A
>  
>   unlock rq->lock
>  
>   <switch to process A>
>  
>   lock rq->lock
>   mb
>   read Y(reordered)<---+
>                        |        store to Y
>                        |
>                        |        sys_membarrier()
>                        |
>                        |        mb
>                        |
>                        |        for_each_online_cpu()
>                        |          p = A
>                        |          // no match no IPI
>                        |
>                        |        mb
>                        |
>                        |        store to X
>   rq->curr = B         |
>                        |
>   unlock rq->lock      |
>   <switch to B>        |
>   read X               |
>                        |
>   read Y --------------+

In order to make this work we need either switch_to() or switch_mm() to
provide smp_mb(). Now you've recently taken that out on PPC and I'm
thinking you're not keen to have to put it back in.

Mathieu was wondering if placing it in switch_mm() would be less onerous
on performance, thinking that address space changes are more expensive
in any case, seeing how they have a tail of cache and translation
misses. I'm thinking you're not happy either way :-)
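
For reference, the two placements would roughly look like this (a
hedged sketch, not a real patch; bodies elided):

	/*
	 * Option 1: full barrier in switch_mm(), executed only when the
	 * address space actually changes.
	 */
	void switch_mm(struct mm_struct *prev, struct mm_struct *next,
		       struct task_struct *tsk)
	{
		/* ... arch-specific MMU switch ... */
		smp_mb();	/* pairs with the smp_mb()s in sys_membarrier() */
	}

	/*
	 * Option 2: the same smp_mb() in the architecture's switch_to()
	 * path instead, executed on every context switch.
	 */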

Opinions?

^ permalink raw reply	[flat|nested] 68+ messages in thread

* Re: [PATCH tip/core/rcu 0/5] Related non-RCU updates
  2017-07-24 21:57 [PATCH tip/core/rcu 0/5] Related non-RCU updates Paul E. McKenney
                   ` (4 preceding siblings ...)
  2017-07-24 21:58 ` [PATCH tip/core/rcu 5/5] EXP: sched/cputime: Fix using smp_processor_id() in preemptible Paul E. McKenney
@ 2017-07-31 22:51 ` Paul E. McKenney
  2017-07-31 22:53   ` [PATCH v2 tip/core/rcu 1/4] module: Fix pr_fmt() bug for header use of printk Paul E. McKenney
                     ` (3 more replies)
  5 siblings, 4 replies; 68+ messages in thread
From: Paul E. McKenney @ 2017-07-31 22:51 UTC (permalink / raw)
  To: linux-kernel
  Cc: mingo, jiangshanlai, dipankar, akpm, mathieu.desnoyers, josh,
	tglx, peterz, rostedt, dhowells, edumazet, fweisbec, oleg

Hello!

This series is outside of RCU, including things like fixes for bugs
triggered by RCU commits.

1.	Fix pr_fmt() bug for header use of printk, courtesy of Joe Perches.

2.	Remove redundant INIT_TASK_RCU_TREE_PREEMPT() macro.

3.	Allow migrating kthreads into online but inactive CPUs, courtesy
	of Tejun Heo.

4.	Add expedited private command to sys_membarrier(), courtesy of
	Mathieu Desnoyers.

Changes since v1:

o	Substitute Mathieu Desnoyers's expedited private command for
	expedited grace periods in sys_membarrier().

o	Remove Wanpeng Li's fix for the use of smp_processor_id() in
	preemptible context in cputime, because it is now in mainline.

							Thanx, Paul

------------------------------------------------------------------------

 b/MAINTAINERS                     |    2 
 b/arch/arm64/kernel/process.c     |    2 
 b/arch/blackfin/kernel/module.c   |   39 +++++----
 b/arch/x86/mm/tlb.c               |    3 
 b/include/linux/init_task.h       |    8 --
 b/include/uapi/linux/membarrier.h |   23 +++++
 b/kernel/Makefile                 |    1 
 b/kernel/sched/Makefile           |    1 
 b/kernel/sched/core.c             |   34 ++++++++
 b/kernel/sched/membarrier.c       |  152 ++++++++++++++++++++++++++++++++++++++
 kernel/membarrier.c               |   70 -----------------
 11 files changed, 233 insertions(+), 102 deletions(-)

^ permalink raw reply	[flat|nested] 68+ messages in thread

* [PATCH v2 tip/core/rcu 1/4] module: Fix pr_fmt() bug for header use of printk
  2017-07-31 22:51 ` [PATCH tip/core/rcu 0/5] Related non-RCU updates Paul E. McKenney
@ 2017-07-31 22:53   ` Paul E. McKenney
  2017-07-31 22:53   ` [PATCH v2 tip/core/rcu 2/4] init_task: Remove redundant INIT_TASK_RCU_TREE_PREEMPT() macro Paul E. McKenney
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 68+ messages in thread
From: Paul E. McKenney @ 2017-07-31 22:53 UTC (permalink / raw)
  To: linux-kernel
  Cc: mingo, jiangshanlai, dipankar, akpm, mathieu.desnoyers, josh,
	tglx, peterz, rostedt, dhowells, edumazet, fweisbec, oleg,
	Joe Perches, Paul E. McKenney

From: Joe Perches <joe@perches.com>

This commit removes the pr_fmt() macro, replacing it with mod_err() and
mod_debug() macros to avoid errors when using printk() from header files.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 arch/blackfin/kernel/module.c | 39 +++++++++++++++++++++------------------
 1 file changed, 21 insertions(+), 18 deletions(-)

diff --git a/arch/blackfin/kernel/module.c b/arch/blackfin/kernel/module.c
index 0188c933b155..15af5768c403 100644
--- a/arch/blackfin/kernel/module.c
+++ b/arch/blackfin/kernel/module.c
@@ -4,8 +4,6 @@
  * Licensed under the GPL-2 or later
  */
 
-#define pr_fmt(fmt) "module %s: " fmt, mod->name
-
 #include <linux/moduleloader.h>
 #include <linux/elf.h>
 #include <linux/vmalloc.h>
@@ -16,6 +14,11 @@
 #include <asm/cacheflush.h>
 #include <linux/uaccess.h>
 
+#define mod_err(mod, fmt, ...)						\
+	pr_err("module %s: " fmt, (mod)->name, ##__VA_ARGS__)
+#define mod_debug(mod, fmt, ...)					\
+	pr_debug("module %s: " fmt, (mod)->name, ##__VA_ARGS__)
+
 /* Transfer the section to the L1 memory */
 int
 module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
@@ -44,7 +47,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
 			dest = l1_inst_sram_alloc(s->sh_size);
 			mod->arch.text_l1 = dest;
 			if (dest == NULL) {
-				pr_err("L1 inst memory allocation failed\n");
+				mod_err(mod, "L1 inst memory allocation failed\n");
 				return -1;
 			}
 			dma_memcpy(dest, (void *)s->sh_addr, s->sh_size);
@@ -56,7 +59,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
 			dest = l1_data_sram_alloc(s->sh_size);
 			mod->arch.data_a_l1 = dest;
 			if (dest == NULL) {
-				pr_err("L1 data memory allocation failed\n");
+				mod_err(mod, "L1 data memory allocation failed\n");
 				return -1;
 			}
 			memcpy(dest, (void *)s->sh_addr, s->sh_size);
@@ -68,7 +71,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
 			dest = l1_data_sram_zalloc(s->sh_size);
 			mod->arch.bss_a_l1 = dest;
 			if (dest == NULL) {
-				pr_err("L1 data memory allocation failed\n");
+				mod_err(mod, "L1 data memory allocation failed\n");
 				return -1;
 			}
 
@@ -77,7 +80,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
 			dest = l1_data_B_sram_alloc(s->sh_size);
 			mod->arch.data_b_l1 = dest;
 			if (dest == NULL) {
-				pr_err("L1 data memory allocation failed\n");
+				mod_err(mod, "L1 data memory allocation failed\n");
 				return -1;
 			}
 			memcpy(dest, (void *)s->sh_addr, s->sh_size);
@@ -87,7 +90,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
 			dest = l1_data_B_sram_alloc(s->sh_size);
 			mod->arch.bss_b_l1 = dest;
 			if (dest == NULL) {
-				pr_err("L1 data memory allocation failed\n");
+				mod_err(mod, "L1 data memory allocation failed\n");
 				return -1;
 			}
 			memset(dest, 0, s->sh_size);
@@ -99,7 +102,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
 			dest = l2_sram_alloc(s->sh_size);
 			mod->arch.text_l2 = dest;
 			if (dest == NULL) {
-				pr_err("L2 SRAM allocation failed\n");
+				mod_err(mod, "L2 SRAM allocation failed\n");
 				return -1;
 			}
 			memcpy(dest, (void *)s->sh_addr, s->sh_size);
@@ -111,7 +114,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
 			dest = l2_sram_alloc(s->sh_size);
 			mod->arch.data_l2 = dest;
 			if (dest == NULL) {
-				pr_err("L2 SRAM allocation failed\n");
+				mod_err(mod, "L2 SRAM allocation failed\n");
 				return -1;
 			}
 			memcpy(dest, (void *)s->sh_addr, s->sh_size);
@@ -123,7 +126,7 @@ module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
 			dest = l2_sram_zalloc(s->sh_size);
 			mod->arch.bss_l2 = dest;
 			if (dest == NULL) {
-				pr_err("L2 SRAM allocation failed\n");
+				mod_err(mod, "L2 SRAM allocation failed\n");
 				return -1;
 			}
 
@@ -157,8 +160,8 @@ apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
 	Elf32_Sym *sym;
 	unsigned long location, value, size;
 
-	pr_debug("applying relocate section %u to %u\n",
-		relsec, sechdrs[relsec].sh_info);
+	mod_debug(mod, "applying relocate section %u to %u\n",
+		  relsec, sechdrs[relsec].sh_info);
 
 	for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) {
 		/* This is where to make the change */
@@ -174,14 +177,14 @@ apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
 
 #ifdef CONFIG_SMP
 		if (location >= COREB_L1_DATA_A_START) {
-			pr_err("cannot relocate in L1: %u (SMP kernel)\n",
+			mod_err(mod, "cannot relocate in L1: %u (SMP kernel)\n",
 				ELF32_R_TYPE(rel[i].r_info));
 			return -ENOEXEC;
 		}
 #endif
 
-		pr_debug("location is %lx, value is %lx type is %d\n",
-			location, value, ELF32_R_TYPE(rel[i].r_info));
+		mod_debug(mod, "location is %lx, value is %lx type is %d\n",
+			  location, value, ELF32_R_TYPE(rel[i].r_info));
 
 		switch (ELF32_R_TYPE(rel[i].r_info)) {
 
@@ -200,12 +203,12 @@ apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
 		case R_BFIN_PCREL12_JUMP:
 		case R_BFIN_PCREL12_JUMP_S:
 		case R_BFIN_PCREL10:
-			pr_err("unsupported relocation: %u (no -mlong-calls?)\n",
+			mod_err(mod, "unsupported relocation: %u (no -mlong-calls?)\n",
 				ELF32_R_TYPE(rel[i].r_info));
 			return -ENOEXEC;
 
 		default:
-			pr_err("unknown relocation: %u\n",
+			mod_err(mod, "unknown relocation: %u\n",
 				ELF32_R_TYPE(rel[i].r_info));
 			return -ENOEXEC;
 		}
@@ -222,7 +225,7 @@ apply_relocate_add(Elf_Shdr *sechdrs, const char *strtab,
 			isram_memcpy((void *)location, &value, size);
 			break;
 		default:
-			pr_err("invalid relocation for %#lx\n", location);
+			mod_err(mod, "invalid relocation for %#lx\n", location);
 			return -ENOEXEC;
 		}
 	}
-- 
2.5.2

^ permalink raw reply related	[flat|nested] 68+ messages in thread

* [PATCH v2 tip/core/rcu 2/4] init_task: Remove redundant INIT_TASK_RCU_TREE_PREEMPT() macro
  2017-07-31 22:51 ` [PATCH tip/core/rcu 0/5] Related non-RCU updates Paul E. McKenney
  2017-07-31 22:53   ` [PATCH v2 tip/core/rcu 1/4] module: Fix pr_fmt() bug for header use of printk Paul E. McKenney
@ 2017-07-31 22:53   ` Paul E. McKenney
  2017-07-31 22:53   ` [PATCH v2 tip/core/rcu 3/4] sched: Allow migrating kthreads into online but inactive CPUs Paul E. McKenney
  2017-07-31 22:53   ` [PATCH v2 tip/core/rcu 4/4] membarrier: Expedited private command Paul E. McKenney
  3 siblings, 0 replies; 68+ messages in thread
From: Paul E. McKenney @ 2017-07-31 22:53 UTC (permalink / raw)
  To: linux-kernel
  Cc: mingo, jiangshanlai, dipankar, akpm, mathieu.desnoyers, josh,
	tglx, peterz, rostedt, dhowells, edumazet, fweisbec, oleg,
	Paul E. McKenney

Back in the dim distant past, the task_struct structure's RCU-related
fields optionally included those needed for CONFIG_RCU_BOOST, even in
CONFIG_PREEMPT_RCU builds.  The INIT_TASK_RCU_TREE_PREEMPT() macro was
used to provide initializers for those optional CONFIG_RCU_BOOST fields.
However, the CONFIG_RCU_BOOST fields are now included unconditionally
in CONFIG_PREEMPT_RCU builds, so there is no longer any need for the
INIT_TASK_RCU_TREE_PREEMPT() macro.  This commit therefore removes it
in favor of initializing the ->rcu_blocked_node field directly in the
INIT_TASK_RCU_PREEMPT() macro.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/init_task.h | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index a2f6707e9fc0..0e849715e5be 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -126,17 +126,11 @@ extern struct group_info init_groups;
 #endif
 
 #ifdef CONFIG_PREEMPT_RCU
-#define INIT_TASK_RCU_TREE_PREEMPT()					\
-	.rcu_blocked_node = NULL,
-#else
-#define INIT_TASK_RCU_TREE_PREEMPT(tsk)
-#endif
-#ifdef CONFIG_PREEMPT_RCU
 #define INIT_TASK_RCU_PREEMPT(tsk)					\
 	.rcu_read_lock_nesting = 0,					\
 	.rcu_read_unlock_special.s = 0,					\
 	.rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry),		\
-	INIT_TASK_RCU_TREE_PREEMPT()
+	.rcu_blocked_node = NULL,
 #else
 #define INIT_TASK_RCU_PREEMPT(tsk)
 #endif
-- 
2.5.2

^ permalink raw reply related	[flat|nested] 68+ messages in thread

* [PATCH v2 tip/core/rcu 3/4] sched: Allow migrating kthreads into online but inactive CPUs
  2017-07-31 22:51 ` [PATCH tip/core/rcu 0/5] Related non-RCU updates Paul E. McKenney
  2017-07-31 22:53   ` [PATCH v2 tip/core/rcu 1/4] module: Fix pr_fmt() bug for header use of printk Paul E. McKenney
  2017-07-31 22:53   ` [PATCH v2 tip/core/rcu 2/4] init_task: Remove redundant INIT_TASK_RCU_TREE_PREEMPT() macro Paul E. McKenney
@ 2017-07-31 22:53   ` Paul E. McKenney
  2017-07-31 22:53   ` [PATCH v2 tip/core/rcu 4/4] membarrier: Expedited private command Paul E. McKenney
  3 siblings, 0 replies; 68+ messages in thread
From: Paul E. McKenney @ 2017-07-31 22:53 UTC (permalink / raw)
  To: linux-kernel
  Cc: mingo, jiangshanlai, dipankar, akpm, mathieu.desnoyers, josh,
	tglx, peterz, rostedt, dhowells, edumazet, fweisbec, oleg,
	Tejun Heo, Paul E. McKenney

From: Tejun Heo <tj@kernel.org>

Per-cpu workqueues have been tripping CPU affinity sanity checks while
a CPU is being offlined.  A per-cpu kworker ends up running on a CPU
which isn't its target CPU while the CPU is online but inactive.

While the scheduler allows kthreads to wake up on an online but
inactive CPU, it doesn't allow a running kthread to be migrated to
such a CPU, which leads to an odd situation where setting affinity on
a sleeping and running kthread leads to different results.

Each mem-reclaim workqueue has one rescuer which guarantees forward
progress and the rescuer needs to bind itself to the CPU which needs
help in making forward progress; however, due to the above issue,
while set_cpus_allowed_ptr() succeeds, the rescuer doesn't end up on
the correct CPU if the CPU is in the process of going offline,
tripping the sanity check and executing the work item on the wrong
CPU.

This patch updates __migrate_task() so that kthreads can be migrated
into an inactive but online CPU.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Reported-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/sched/core.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 17c667b427b4..bfee6ea7db49 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -951,8 +951,13 @@ struct migration_arg {
 static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
 				 struct task_struct *p, int dest_cpu)
 {
-	if (unlikely(!cpu_active(dest_cpu)))
-		return rq;
+	if (p->flags & PF_KTHREAD) {
+		if (unlikely(!cpu_online(dest_cpu)))
+			return rq;
+	} else {
+		if (unlikely(!cpu_active(dest_cpu)))
+			return rq;
+	}
 
 	/* Affinity changed (again). */
 	if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
-- 
2.5.2

^ permalink raw reply related	[flat|nested] 68+ messages in thread

* [PATCH v2 tip/core/rcu 4/4] membarrier: Expedited private command
  2017-07-31 22:51 ` [PATCH tip/core/rcu 0/5] Related non-RCU updates Paul E. McKenney
                     ` (2 preceding siblings ...)
  2017-07-31 22:53   ` [PATCH v2 tip/core/rcu 3/4] sched: Allow migrating kthreads into online but inactive CPUs Paul E. McKenney
@ 2017-07-31 22:53   ` Paul E. McKenney
  3 siblings, 0 replies; 68+ messages in thread
From: Paul E. McKenney @ 2017-07-31 22:53 UTC (permalink / raw)
  To: linux-kernel
  Cc: mingo, jiangshanlai, dipankar, akpm, mathieu.desnoyers, josh,
	tglx, peterz, rostedt, dhowells, edumazet, fweisbec, oleg,
	Paul E. McKenney, Boqun Feng, Andrew Hunter, Maged Michael,
	gromer, Avi Kivity, Benjamin Herrenschmidt, Paul Mackerras,
	Michael Ellerman

From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>

Implement MEMBARRIER_CMD_PRIVATE_EXPEDITED with IPIs, using a cpumask
built from all runqueues for which the current thread's mm is the same
as that of the thread calling sys_membarrier(). It executes faster than
the non-expedited variant (no blocking). It also works on NOHZ_FULL
configurations.

Scheduler-wise, it requires a memory barrier before and after context
switching between processes (which have different mm). The memory
barrier before context switch is already present. For the barrier after
context switch:

* Our TSO archs can do RELEASE without being a full barrier. Look at
  x86 spin_unlock() being a regular STORE for example.  But for those
  archs, all atomics imply smp_mb and all of them have atomic ops in
  switch_mm() for mm_cpumask(), and issue load_cr3() which acts as a
  full barrier.

* From all weakly ordered machines, only ARM64 and PPC can do RELEASE,
  the rest does indeed do smp_mb(), so there the spin_unlock() is a full
  barrier and we're good.

* ARM64 has a very heavy barrier in switch_to(), which suffices.

* PPC just removed its barrier from switch_to(), but appears to be
  talking about adding something to switch_mm(). So add a
  smp_mb__after_unlock_lock() for now, until this is settled on the PPC
  side.

Changes since v3:
- Properly document the memory barriers provided by each architecture.

Changes since v2:
- Address comments from Peter Zijlstra,
- Add smp_mb__after_unlock_lock() after finish_lock_switch() in
  finish_task_switch() to add the memory barrier we need after storing
  to rq->curr. This is much simpler than the previous approach relying
  on atomic_dec_and_test() in mmdrop(), which actually added a memory
  barrier in the common case of switching between userspace processes.
- Return -EINVAL when MEMBARRIER_CMD_SHARED is used on a nohz_full
  kernel, rather than having the whole membarrier system call returning
  -ENOSYS. Indeed, CMD_PRIVATE_EXPEDITED is compatible with nohz_full.
  Adapt the CMD_QUERY mask accordingly.

Changes since v1:
- move membarrier code under kernel/sched/ because it uses the
  scheduler runqueue,
- only add the barrier when we switch from a kernel thread. The case
  where we switch from a user-space thread is already handled by
  the atomic_dec_and_test() in mmdrop().
- add a comment to mmdrop() documenting the requirement on the implicit
  memory barrier.

CC: Peter Zijlstra <peterz@infradead.org>
CC: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
CC: Boqun Feng <boqun.feng@gmail.com>
CC: Andrew Hunter <ahh@google.com>
CC: Maged Michael <maged.michael@gmail.com>
CC: gromer@google.com
CC: Avi Kivity <avi@scylladb.com>
CC: Benjamin Herrenschmidt <benh@kernel.crashing.org>
CC: Paul Mackerras <paulus@samba.org>
CC: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Tested-by: Dave Watson <davejwatson@fb.com>
---
 MAINTAINERS                     |   2 +-
 arch/arm64/kernel/process.c     |   2 +
 arch/x86/mm/tlb.c               |   3 +-
 include/uapi/linux/membarrier.h |  23 +++++-
 kernel/Makefile                 |   1 -
 kernel/membarrier.c             |  70 ------------------
 kernel/sched/Makefile           |   1 +
 kernel/sched/core.c             |  25 +++++++
 kernel/sched/membarrier.c       | 152 ++++++++++++++++++++++++++++++++++++++++
 9 files changed, 204 insertions(+), 75 deletions(-)
 delete mode 100644 kernel/membarrier.c
 create mode 100644 kernel/sched/membarrier.c

diff --git a/MAINTAINERS b/MAINTAINERS
index f66488dfdbc9..3b035584272f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8621,7 +8621,7 @@ M:	Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 M:	"Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
 L:	linux-kernel@vger.kernel.org
 S:	Supported
-F:	kernel/membarrier.c
+F:	kernel/sched/membarrier.c
 F:	include/uapi/linux/membarrier.h
 
 MEMORY MANAGEMENT
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 659ae8094ed5..c8f7d98d8cb9 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -360,6 +360,8 @@ __notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev,
 	/*
 	 * Complete any pending TLB or cache maintenance on this CPU in case
 	 * the thread migrates to a different CPU.
+	 * This full barrier is also required by the membarrier system
+	 * call.
 	 */
 	dsb(ish);
 
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 014d07a80053..bb103d693f33 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -132,7 +132,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 	 * due to instruction fetches or for no reason at all,
 	 * and neither LOCK nor MFENCE orders them.
 	 * Fortunately, load_cr3() is serializing and gives the
-	 * ordering guarantee we need.
+	 * ordering guarantee we need. This full barrier is also
+	 * required by the membarrier system call.
 	 */
 	load_cr3(next->pgd);
 
diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h
index e0b108bd2624..6d47b3249d8a 100644
--- a/include/uapi/linux/membarrier.h
+++ b/include/uapi/linux/membarrier.h
@@ -40,14 +40,33 @@
  *                          (non-running threads are de facto in such a
  *                          state). This covers threads from all processes
  *                          running on the system. This command returns 0.
+ * @MEMBARRIER_CMD_PRIVATE_EXPEDITED:
+ *                          Execute a memory barrier on each running
+ *                          thread belonging to the same process as the current
+ *                          thread. Upon return from system call, the
+ *                          caller thread is ensured that all its running
+ *                          threads siblings have passed through a state
+ *                          where all memory accesses to user-space
+ *                          addresses match program order between entry
+ *                          to and return from the system call
+ *                          (non-running threads are de facto in such a
+ *                          state). This only covers threads from the
+ *                          same processes as the caller thread. This
+ *                          command returns 0. The "expedited" commands
+ *                          complete faster than the non-expedited ones,
+ *                          they never block, but have the downside of
+ *                          causing extra overhead.
  *
  * Command to be passed to the membarrier system call. The commands need to
  * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to
  * the value 0.
  */
 enum membarrier_cmd {
-	MEMBARRIER_CMD_QUERY = 0,
-	MEMBARRIER_CMD_SHARED = (1 << 0),
+	MEMBARRIER_CMD_QUERY			= 0,
+	MEMBARRIER_CMD_SHARED			= (1 << 0),
+	/* reserved for MEMBARRIER_CMD_SHARED_EXPEDITED (1 << 1) */
+	/* reserved for MEMBARRIER_CMD_PRIVATE (1 << 2) */
+	MEMBARRIER_CMD_PRIVATE_EXPEDITED	= (1 << 3),
 };
 
 #endif /* _UAPI_LINUX_MEMBARRIER_H */
diff --git a/kernel/Makefile b/kernel/Makefile
index 4cb8e8b23c6e..9c323a6daa46 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -108,7 +108,6 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 obj-$(CONFIG_JUMP_LABEL) += jump_label.o
 obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
 obj-$(CONFIG_TORTURE_TEST) += torture.o
-obj-$(CONFIG_MEMBARRIER) += membarrier.o
 
 obj-$(CONFIG_HAS_IOMEM) += memremap.o
 
diff --git a/kernel/membarrier.c b/kernel/membarrier.c
deleted file mode 100644
index 9f9284f37f8d..000000000000
--- a/kernel/membarrier.c
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (C) 2010, 2015 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
- *
- * membarrier system call
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- */
-
-#include <linux/syscalls.h>
-#include <linux/membarrier.h>
-#include <linux/tick.h>
-
-/*
- * Bitmask made from a "or" of all commands within enum membarrier_cmd,
- * except MEMBARRIER_CMD_QUERY.
- */
-#define MEMBARRIER_CMD_BITMASK	(MEMBARRIER_CMD_SHARED)
-
-/**
- * sys_membarrier - issue memory barriers on a set of threads
- * @cmd:   Takes command values defined in enum membarrier_cmd.
- * @flags: Currently needs to be 0. For future extensions.
- *
- * If this system call is not implemented, -ENOSYS is returned. If the
- * command specified does not exist, or if the command argument is invalid,
- * this system call returns -EINVAL. For a given command, with flags argument
- * set to 0, this system call is guaranteed to always return the same value
- * until reboot.
- *
- * All memory accesses performed in program order from each targeted thread
- * is guaranteed to be ordered with respect to sys_membarrier(). If we use
- * the semantic "barrier()" to represent a compiler barrier forcing memory
- * accesses to be performed in program order across the barrier, and
- * smp_mb() to represent explicit memory barriers forcing full memory
- * ordering across the barrier, we have the following ordering table for
- * each pair of barrier(), sys_membarrier() and smp_mb():
- *
- * The pair ordering is detailed as (O: ordered, X: not ordered):
- *
- *                        barrier()   smp_mb() sys_membarrier()
- *        barrier()          X           X            O
- *        smp_mb()           X           O            O
- *        sys_membarrier()   O           O            O
- */
-SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
-{
-	/* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */
-	if (tick_nohz_full_enabled())
-		return -ENOSYS;
-	if (unlikely(flags))
-		return -EINVAL;
-	switch (cmd) {
-	case MEMBARRIER_CMD_QUERY:
-		return MEMBARRIER_CMD_BITMASK;
-	case MEMBARRIER_CMD_SHARED:
-		if (num_online_cpus() > 1)
-			synchronize_sched();
-		return 0;
-	default:
-		return -EINVAL;
-	}
-}
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 53f0164ed362..78f54932ea1d 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -25,3 +25,4 @@ obj-$(CONFIG_SCHED_DEBUG) += debug.o
 obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
 obj-$(CONFIG_CPU_FREQ) += cpufreq.o
 obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
+obj-$(CONFIG_MEMBARRIER) += membarrier.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bfee6ea7db49..f77269c6c2f8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2640,6 +2640,16 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	prev_state = prev->state;
 	vtime_task_switch(prev);
 	perf_event_task_sched_in(prev, current);
+	/*
+	 * The membarrier system call requires a full memory barrier
+	 * after storing to rq->curr, before going back to user-space.
+	 *
+	 * TODO: This smp_mb__after_unlock_lock can go away if PPC end
+	 * up adding a full barrier to switch_mm(), or we should figure
+	 * out if a smp_mb__after_unlock_lock is really the proper API
+	 * to use.
+	 */
+	smp_mb__after_unlock_lock();
 	finish_lock_switch(rq, prev);
 	finish_arch_post_lock_switch();
 
@@ -3329,6 +3339,21 @@ static void __sched notrace __schedule(bool preempt)
 	if (likely(prev != next)) {
 		rq->nr_switches++;
 		rq->curr = next;
+		/*
+		 * The membarrier system call requires each architecture
+		 * to have a full memory barrier after updating
+		 * rq->curr, before returning to user-space. For TSO
+		 * (e.g. x86), the architecture must provide its own
+		 * barrier in switch_mm(). For weakly ordered machines
+		 * for which spin_unlock() acts as a full memory
+		 * barrier, finish_lock_switch() in common code takes
+		 * care of this barrier. For weakly ordered machines for
+		 * which spin_unlock() acts as a RELEASE barrier (only
+		 * arm64 and PowerPC), arm64 has a full barrier in
+		 * switch_to(), and PowerPC has
+		 * smp_mb__after_unlock_lock() before
+		 * finish_lock_switch().
+		 */
 		++*switch_count;
 
 		trace_sched_switch(preempt, prev, next);
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
new file mode 100644
index 000000000000..a92fddc22747
--- /dev/null
+++ b/kernel/sched/membarrier.c
@@ -0,0 +1,152 @@
+/*
+ * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ *
+ * membarrier system call
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/syscalls.h>
+#include <linux/membarrier.h>
+#include <linux/tick.h>
+#include <linux/cpumask.h>
+
+#include "sched.h"	/* for cpu_rq(). */
+
+/*
+ * Bitmask made from a "or" of all commands within enum membarrier_cmd,
+ * except MEMBARRIER_CMD_QUERY.
+ */
+#define MEMBARRIER_CMD_BITMASK	\
+	(MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED)
+
+static void ipi_mb(void *info)
+{
+	smp_mb();	/* IPIs should be serializing but paranoid. */
+}
+
+static void membarrier_private_expedited(void)
+{
+	int cpu;
+	bool fallback = false;
+	cpumask_var_t tmpmask;
+
+	if (num_online_cpus() == 1)
+		return;
+
+	/*
+	 * Matches memory barriers around rq->curr modification in
+	 * scheduler.
+	 */
+	smp_mb();	/* system call entry is not a mb. */
+
+	/*
+	 * Expedited membarrier commands guarantee that they won't
+	 * block, hence the GFP_NOWAIT allocation flag and fallback
+	 * implementation.
+	 */
+	if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
+		/* Fallback for OOM. */
+		fallback = true;
+	}
+
+	cpus_read_lock();
+	for_each_online_cpu(cpu) {
+		struct task_struct *p;
+
+		/*
+		 * Skipping the current CPU is OK even through we can be
+		 * migrated at any point. The current CPU, at the point
+		 * where we read raw_smp_processor_id(), is ensured to
+		 * be in program order with respect to the caller
+		 * thread. Therefore, we can skip this CPU from the
+		 * iteration.
+		 */
+		if (cpu == raw_smp_processor_id())
+			continue;
+		rcu_read_lock();
+		p = task_rcu_dereference(&cpu_rq(cpu)->curr);
+		if (p && p->mm == current->mm) {
+			if (!fallback)
+				__cpumask_set_cpu(cpu, tmpmask);
+			else
+				smp_call_function_single(cpu, ipi_mb, NULL, 1);
+		}
+		rcu_read_unlock();
+	}
+	if (!fallback) {
+		smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+		free_cpumask_var(tmpmask);
+	}
+	cpus_read_unlock();
+
+	/*
+	 * Memory barrier on the caller thread _after_ we finished
+	 * waiting for the last IPI. Matches memory barriers around
+	 * rq->curr modification in scheduler.
+	 */
+	smp_mb();	/* exit from system call is not a mb */
+}
+
+/**
+ * sys_membarrier - issue memory barriers on a set of threads
+ * @cmd:   Takes command values defined in enum membarrier_cmd.
+ * @flags: Currently needs to be 0. For future extensions.
+ *
+ * If this system call is not implemented, -ENOSYS is returned. If the
+ * command specified does not exist, not available on the running
+ * kernel, or if the command argument is invalid, this system call
+ * returns -EINVAL. For a given command, with flags argument set to 0,
+ * this system call is guaranteed to always return the same value until
+ * reboot.
+ *
+ * All memory accesses performed in program order from each targeted thread
+ * is guaranteed to be ordered with respect to sys_membarrier(). If we use
+ * the semantic "barrier()" to represent a compiler barrier forcing memory
+ * accesses to be performed in program order across the barrier, and
+ * smp_mb() to represent explicit memory barriers forcing full memory
+ * ordering across the barrier, we have the following ordering table for
+ * each pair of barrier(), sys_membarrier() and smp_mb():
+ *
+ * The pair ordering is detailed as (O: ordered, X: not ordered):
+ *
+ *                        barrier()   smp_mb() sys_membarrier()
+ *        barrier()          X           X            O
+ *        smp_mb()           X           O            O
+ *        sys_membarrier()   O           O            O
+ */
+SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
+{
+	if (unlikely(flags))
+		return -EINVAL;
+	switch (cmd) {
+	case MEMBARRIER_CMD_QUERY:
+	{
+		int cmd_mask = MEMBARRIER_CMD_BITMASK;
+
+		if (tick_nohz_full_enabled())
+			cmd_mask &= ~MEMBARRIER_CMD_SHARED;
+		return cmd_mask;
+	}
+	case MEMBARRIER_CMD_SHARED:
+		/* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */
+		if (tick_nohz_full_enabled())
+			return -EINVAL;
+		if (num_online_cpus() > 1)
+			synchronize_sched();
+		return 0;
+	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
+		membarrier_private_expedited();
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
-- 
2.5.2
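
As a purely illustrative user-space sketch (not part of the patch), a
caller could probe for and use the new command roughly as follows,
assuming the updated membarrier.h header and the __NR_membarrier
syscall number are available:

	#include <linux/membarrier.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int membarrier(int cmd, int flags)
	{
		return syscall(__NR_membarrier, cmd, flags);
	}

	int main(void)
	{
		int mask = membarrier(MEMBARRIER_CMD_QUERY, 0);

		/* Fall back silently if the command is not available. */
		if (mask >= 0 && (mask & MEMBARRIER_CMD_PRIVATE_EXPEDITED))
			membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0);
		return 0;
	}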

^ permalink raw reply related	[flat|nested] 68+ messages in thread

end of thread, other threads:[~2017-07-31 22:53 UTC | newest]

Thread overview: 68+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-07-24 21:57 [PATCH tip/core/rcu 0/5] Related non-RCU updates Paul E. McKenney
2017-07-24 21:58 ` [PATCH tip/core/rcu 1/5] module: Fix pr_fmt() bug for header use of printk Paul E. McKenney
2017-07-24 21:58 ` [PATCH tip/core/rcu 2/5] init_task: Remove redundant INIT_TASK_RCU_TREE_PREEMPT() macro Paul E. McKenney
2017-07-24 21:58 ` [PATCH tip/core/rcu 3/5] EXPERIMENTAL sched: Allow migrating kthreads into online but inactive CPUs Paul E. McKenney
2017-07-24 21:58 ` [PATCH tip/core/rcu 4/5] sys_membarrier: Add expedited option Paul E. McKenney
2017-07-25  4:27   ` Boqun Feng
2017-07-25 16:24     ` Paul E. McKenney
2017-07-25 13:21   ` Mathieu Desnoyers
2017-07-25 16:48     ` Paul E. McKenney
2017-07-25 16:33   ` Peter Zijlstra
2017-07-25 16:49     ` Paul E. McKenney
2017-07-25 16:59       ` Peter Zijlstra
2017-07-25 17:17         ` Paul E. McKenney
2017-07-25 18:53           ` Peter Zijlstra
2017-07-25 19:36             ` Paul E. McKenney
2017-07-25 20:24               ` Peter Zijlstra
2017-07-25 21:19                 ` Paul E. McKenney
2017-07-25 21:55                   ` Peter Zijlstra
2017-07-25 22:39                     ` Mathieu Desnoyers
2017-07-25 22:50                     ` Mathieu Desnoyers
2017-07-26  0:01                       ` Paul E. McKenney
2017-07-26  7:46                       ` Peter Zijlstra
2017-07-26 15:42                         ` Paul E. McKenney
2017-07-26 18:01                           ` Mathieu Desnoyers
2017-07-26 18:30                             ` Paul E. McKenney
2017-07-26 20:37                               ` Mathieu Desnoyers
2017-07-26 21:11                                 ` Paul E. McKenney
2017-07-27  1:45                                   ` Paul E. McKenney
2017-07-27 12:39                                     ` Mathieu Desnoyers
2017-07-27 14:44                                       ` Paul E. McKenney
2017-07-27 10:24                               ` Peter Zijlstra
2017-07-27 14:52                                 ` Paul E. McKenney
2017-07-27  8:53                             ` Peter Zijlstra
2017-07-27 10:09                               ` Peter Zijlstra
2017-07-27 10:22                               ` Will Deacon
2017-07-27 13:14                               ` Paul E. McKenney
2017-07-25 23:59                     ` Paul E. McKenney
2017-07-26  7:41                       ` Peter Zijlstra
2017-07-26 15:41                         ` Paul E. McKenney
2017-07-27  8:30                           ` Peter Zijlstra
2017-07-27 13:08                             ` Paul E. McKenney
2017-07-27 13:49                               ` Peter Zijlstra
2017-07-27 14:32                                 ` Paul E. McKenney
2017-07-27 14:36                                   ` Peter Zijlstra
2017-07-27 14:46                                     ` Paul E. McKenney
2017-07-27 13:55                               ` Boqun Feng
2017-07-27 14:16                                 ` Paul E. McKenney
2017-07-27 14:29                                   ` Boqun Feng
2017-07-27 14:36                                     ` Paul E. McKenney
2017-07-27 14:41                                       ` Will Deacon
2017-07-27 14:47                                       ` Boqun Feng
2017-07-27 14:55                                         ` Paul E. McKenney
2017-07-27 13:56                               ` Peter Zijlstra
2017-07-27 15:19                                 ` Peter Zijlstra
2017-07-26  9:36                   ` Will Deacon
2017-07-26 15:46                     ` Paul E. McKenney
2017-07-27 10:14               ` Peter Zijlstra
2017-07-27 12:56                 ` Paul E. McKenney
2017-07-27 13:37                   ` Peter Zijlstra
2017-07-27 14:33                     ` Paul E. McKenney
2017-07-24 21:58 ` [PATCH tip/core/rcu 5/5] EXP: sched/cputime: Fix using smp_processor_id() in preemptible Paul E. McKenney
2017-07-24 22:01   ` Wanpeng Li
2017-07-24 22:29     ` Paul E. McKenney
2017-07-31 22:51 ` [PATCH tip/core/rcu 0/5] Related non-RCU updates Paul E. McKenney
2017-07-31 22:53   ` [PATCH v2 tip/core/rcu 1/4] module: Fix pr_fmt() bug for header use of printk Paul E. McKenney
2017-07-31 22:53   ` [PATCH v2 tip/core/rcu 2/4] init_task: Remove redundant INIT_TASK_RCU_TREE_PREEMPT() macro Paul E. McKenney
2017-07-31 22:53   ` [PATCH v2 tip/core/rcu 3/4] sched: Allow migrating kthreads into online but inactive CPUs Paul E. McKenney
2017-07-31 22:53   ` [PATCH v2 tip/core/rcu 4/4] membarrier: Expedited private command Paul E. McKenney
