* [RFC PATCH 1/2] Fix: sched/membarrier: p->mm->membarrier_state racy load
@ 2019-09-03 20:11 Mathieu Desnoyers
  2019-09-03 20:11 ` [RFC PATCH 2/2] Fix: sched/membarrier: private expedited registration check Mathieu Desnoyers
                   ` (4 more replies)
  0 siblings, 5 replies; 24+ messages in thread
From: Mathieu Desnoyers @ 2019-09-03 20:11 UTC (permalink / raw)
  To: Paul E. McKenney, Peter Zijlstra, Linus Torvalds
  Cc: linux-kernel, Mathieu Desnoyers, Oleg Nesterov,
	Eric W. Biederman, Russell King - ARM Linux admin, Chris Metcalf,
	Christoph Lameter, Kirill Tkhai, Mike Galbraith, Thomas Gleixner,
	Ingo Molnar

The membarrier_state field is located within the mm_struct, which
is not guaranteed to exist when accessed from the runqueue-lock-free
iteration over runqueues performed by the membarrier system call.

Copy the membarrier_state from the mm_struct into the next task_struct
in the scheduler's prepare_task_switch(). Upon membarrier registration,
iterate over each runqueue and copy the membarrier_state from the
mm_struct into all currently running task structs which have the same
mm as the current task.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: "Paul E. McKenney" <paulmck@linux.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Russell King - ARM Linux admin <linux@armlinux.org.uk>
Cc: Chris Metcalf <cmetcalf@ezchip.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Kirill Tkhai <tkhai@yandex.ru>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h     |  4 ++
 include/linux/sched/mm.h  | 13 +++++++
 kernel/sched/core.c       |  1 +
 kernel/sched/membarrier.c | 78 ++++++++++++++++++++++++++++-----------
 4 files changed, 74 insertions(+), 22 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9f51932bd543..e24d52a4c37a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1130,6 +1130,10 @@ struct task_struct {
 	unsigned long			numa_pages_migrated;
 #endif /* CONFIG_NUMA_BALANCING */
 
+#ifdef CONFIG_MEMBARRIER
+	atomic_t membarrier_state;
+#endif
+
 #ifdef CONFIG_RSEQ
 	struct rseq __user *rseq;
 	u32 rseq_sig;
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 4a7944078cc3..3577cd7b3dbb 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -371,7 +371,17 @@ static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
 static inline void membarrier_execve(struct task_struct *t)
 {
 	atomic_set(&t->mm->membarrier_state, 0);
+	atomic_set(&t->membarrier_state, 0);
 }
+
+static inline void membarrier_prepare_task_switch(struct task_struct *t)
+{
+	if (!t->mm)
+		return;
+	atomic_set(&t->membarrier_state,
+		   atomic_read(&t->mm->membarrier_state));
+}
+
 #else
 #ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
 static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
@@ -386,6 +396,9 @@ static inline void membarrier_execve(struct task_struct *t)
 static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
 {
 }
+static inline void membarrier_prepare_task_switch(struct task_struct *t)
+{
+}
 #endif
 
 #endif /* _LINUX_SCHED_MM_H */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 010d578118d6..8d4f1f20db15 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3038,6 +3038,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
 	perf_event_task_sched_out(prev, next);
 	rseq_preempt(prev);
 	fire_sched_out_preempt_notifiers(prev, next);
+	membarrier_prepare_task_switch(next);
 	prepare_task(next);
 	prepare_arch_switch(next);
 }
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index aa8d75804108..d564ca1b5d69 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -72,8 +72,8 @@ static int membarrier_global_expedited(void)
 
 		rcu_read_lock();
 		p = task_rcu_dereference(&cpu_rq(cpu)->curr);
-		if (p && p->mm && (atomic_read(&p->mm->membarrier_state) &
-				   MEMBARRIER_STATE_GLOBAL_EXPEDITED)) {
+		if (p && (atomic_read(&p->membarrier_state) &
+			  MEMBARRIER_STATE_GLOBAL_EXPEDITED)) {
 			if (!fallback)
 				__cpumask_set_cpu(cpu, tmpmask);
 			else
@@ -177,6 +177,46 @@ static int membarrier_private_expedited(int flags)
 	return 0;
 }
 
+static void sync_other_runqueues_membarrier_state(struct mm_struct *mm)
+{
+	int cpu;
+
+	if (num_online_cpus() == 1)
+		return;
+
+	/*
+	 * For multi-mm user threads, we need to ensure all future scheduler
+	 * executions will observe @mm's new membarrier state.
+	 */
+	synchronize_rcu();
+
+	/*
+	 * For each cpu runqueue (except the current cpu), if the task's mm
+	 * matches @mm, ensure that all @mm's membarrier state set bits are also
+	 * set in the runqueue's current task membarrier state.
+	 *
+	 * Use an atomic_or() to set the task membarrier state, thus ensuring
+	 * this operation is always additive. This is important in case many
+	 * different membarrier registration commands are invoked concurrently,
+	 * given that they do not hold the mmap_sem.
+	 */
+	cpus_read_lock();
+	for_each_online_cpu(cpu) {
+		struct task_struct *p;
+
+		/* Skip current CPU. */
+		if (cpu == raw_smp_processor_id())
+			continue;
+		rcu_read_lock();
+		p = task_rcu_dereference(&cpu_rq(cpu)->curr);
+		if (p && p->mm == mm)
+			atomic_or(atomic_read(&mm->membarrier_state),
+				  &p->membarrier_state);
+		rcu_read_unlock();
+	}
+	cpus_read_unlock();
+}
+
 static int membarrier_register_global_expedited(void)
 {
 	struct task_struct *p = current;
@@ -186,6 +226,8 @@ static int membarrier_register_global_expedited(void)
 	    MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
 		return 0;
 	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
+	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &p->membarrier_state);
+
 	if (atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1) {
 		/*
 		 * For single mm user, single threaded process, we can
@@ -196,12 +238,7 @@ static int membarrier_register_global_expedited(void)
 		 */
 		smp_mb();
 	} else {
-		/*
-		 * For multi-mm user threads, we need to ensure all
-		 * future scheduler executions will observe the new
-		 * thread flag state for this mm.
-		 */
-		synchronize_rcu();
+		sync_other_runqueues_membarrier_state(mm);
 	}
 	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
 		  &mm->membarrier_state);
@@ -213,12 +250,14 @@ static int membarrier_register_private_expedited(int flags)
 {
 	struct task_struct *p = current;
 	struct mm_struct *mm = p->mm;
-	int state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY;
+	int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
+	    set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED;
 
 	if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
 		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
 			return -EINVAL;
-		state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
+		ready_state =
+			MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
 	}
 
 	/*
@@ -226,20 +265,15 @@ static int membarrier_register_private_expedited(int flags)
 	 * groups, which use the same mm. (CLONE_VM but not
 	 * CLONE_THREAD).
 	 */
-	if (atomic_read(&mm->membarrier_state) & state)
+	if ((atomic_read(&mm->membarrier_state) & ready_state))
 		return 0;
-	atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state);
 	if (flags & MEMBARRIER_FLAG_SYNC_CORE)
-		atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE,
-			  &mm->membarrier_state);
-	if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) {
-		/*
-		 * Ensure all future scheduler executions will observe the
-		 * new thread flag state for this process.
-		 */
-		synchronize_rcu();
-	}
-	atomic_or(state, &mm->membarrier_state);
+		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
+	atomic_or(set_state, &mm->membarrier_state);
+	atomic_or(set_state, &p->membarrier_state);
+	if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1))
+		sync_other_runqueues_membarrier_state(mm);
+	atomic_or(ready_state, &mm->membarrier_state);
 
 	return 0;
 }
-- 
2.17.1



* [RFC PATCH 2/2] Fix: sched/membarrier: private expedited registration check
  2019-09-03 20:11 [RFC PATCH 1/2] Fix: sched/membarrier: p->mm->membarrier_state racy load Mathieu Desnoyers
@ 2019-09-03 20:11 ` Mathieu Desnoyers
  2019-09-03 20:24 ` [RFC PATCH 1/2] Fix: sched/membarrier: p->mm->membarrier_state racy load Peter Zijlstra
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 24+ messages in thread
From: Mathieu Desnoyers @ 2019-09-03 20:11 UTC (permalink / raw)
  To: Paul E. McKenney, Peter Zijlstra, Linus Torvalds
  Cc: linux-kernel, Mathieu Desnoyers, Oleg Nesterov,
	Eric W. Biederman, Russell King - ARM Linux admin, Chris Metcalf,
	Christoph Lameter, Kirill Tkhai, Mike Galbraith, Thomas Gleixner,
	Ingo Molnar

Fix a logic flaw in the way membarrier_register_private_expedited()
handles ready state checks for private expedited sync core and private
expedited registrations.

If a private expedited membarrier registration is first performed, and
then a private expedited sync_core registration is performed, the ready
state check will skip the second registration when it really should not.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: "Paul E. McKenney" <paulmck@linux.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Russell King - ARM Linux admin <linux@armlinux.org.uk>
Cc: Chris Metcalf <cmetcalf@ezchip.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Kirill Tkhai <tkhai@yandex.ru>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/membarrier.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index d564ca1b5d69..e962d47ea6b4 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -265,7 +265,7 @@ static int membarrier_register_private_expedited(int flags)
 	 * groups, which use the same mm. (CLONE_VM but not
 	 * CLONE_THREAD).
 	 */
-	if ((atomic_read(&mm->membarrier_state) & ready_state))
+	if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state)
 		return 0;
 	if (flags & MEMBARRIER_FLAG_SYNC_CORE)
 		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
-- 
2.17.1



* Re: [RFC PATCH 1/2] Fix: sched/membarrier: p->mm->membarrier_state racy load
  2019-09-03 20:11 [RFC PATCH 1/2] Fix: sched/membarrier: p->mm->membarrier_state racy load Mathieu Desnoyers
  2019-09-03 20:11 ` [RFC PATCH 2/2] Fix: sched/membarrier: private expedited registration check Mathieu Desnoyers
@ 2019-09-03 20:24 ` Peter Zijlstra
  2019-09-03 20:36   ` Linus Torvalds
  2019-09-03 20:41   ` Mathieu Desnoyers
  2019-09-03 20:27 ` Linus Torvalds
                   ` (2 subsequent siblings)
  4 siblings, 2 replies; 24+ messages in thread
From: Peter Zijlstra @ 2019-09-03 20:24 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Paul E. McKenney, Linus Torvalds, linux-kernel, Oleg Nesterov,
	Eric W. Biederman, Russell King - ARM Linux admin, Chris Metcalf,
	Christoph Lameter, Kirill Tkhai, Mike Galbraith, Thomas Gleixner,
	Ingo Molnar

On Tue, Sep 03, 2019 at 04:11:34PM -0400, Mathieu Desnoyers wrote:

> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 9f51932bd543..e24d52a4c37a 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1130,6 +1130,10 @@ struct task_struct {
>  	unsigned long			numa_pages_migrated;
>  #endif /* CONFIG_NUMA_BALANCING */
>  
> +#ifdef CONFIG_MEMBARRIER
> +	atomic_t membarrier_state;
> +#endif
> +
>  #ifdef CONFIG_RSEQ
>  	struct rseq __user *rseq;
>  	u32 rseq_sig;
> diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
> index 4a7944078cc3..3577cd7b3dbb 100644
> --- a/include/linux/sched/mm.h
> +++ b/include/linux/sched/mm.h
> @@ -371,7 +371,17 @@ static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
>  static inline void membarrier_execve(struct task_struct *t)
>  {
>  	atomic_set(&t->mm->membarrier_state, 0);
> +	atomic_set(&t->membarrier_state, 0);
>  }
> +
> +static inline void membarrier_prepare_task_switch(struct task_struct *t)
> +{
> +	if (!t->mm)
> +		return;
> +	atomic_set(&t->membarrier_state,
> +		   atomic_read(&t->mm->membarrier_state));
> +}
> +

> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 010d578118d6..8d4f1f20db15 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3038,6 +3038,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
>  	perf_event_task_sched_out(prev, next);
>  	rseq_preempt(prev);
>  	fire_sched_out_preempt_notifiers(prev, next);
> +	membarrier_prepare_task_switch(next);
>  	prepare_task(next);
>  	prepare_arch_switch(next);
>  }


Yuck yuck yuck..

so the problem I have with this is that we add yet another cacheline :/

Why can't we frob this state into a line/word we already have to
unconditionally touch, like the thread_info::flags word for example.

The above also does the store unconditionally, even though, in the most
common case, it won't have to.


* Re: [RFC PATCH 1/2] Fix: sched/membarrier: p->mm->membarrier_state racy load
  2019-09-03 20:11 [RFC PATCH 1/2] Fix: sched/membarrier: p->mm->membarrier_state racy load Mathieu Desnoyers
  2019-09-03 20:11 ` [RFC PATCH 2/2] Fix: sched/membarrier: private expedited registration check Mathieu Desnoyers
  2019-09-03 20:24 ` [RFC PATCH 1/2] Fix: sched/membarrier: p->mm->membarrier_state racy load Peter Zijlstra
@ 2019-09-03 20:27 ` Linus Torvalds
  2019-09-03 20:53   ` Mathieu Desnoyers
  2019-09-04 10:53 ` Oleg Nesterov
  2019-09-04 11:11 ` Oleg Nesterov
  4 siblings, 1 reply; 24+ messages in thread
From: Linus Torvalds @ 2019-09-03 20:27 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Paul E. McKenney, Peter Zijlstra, Linux List Kernel Mailing,
	Oleg Nesterov, Eric W. Biederman, Russell King - ARM Linux admin,
	Chris Metcalf, Christoph Lameter, Kirill Tkhai, Mike Galbraith,
	Thomas Gleixner, Ingo Molnar

On Tue, Sep 3, 2019 at 1:11 PM Mathieu Desnoyers
<mathieu.desnoyers@efficios.com> wrote:
>
> +       cpus_read_lock();
> +       for_each_online_cpu(cpu) {

This would likely be better off using mm_cpumask(mm) instead of all
online CPU's.

Plus doing the rcu_read_lock() inside the loop seems pointless. Even
with a lot of cores, it's not going to loop _that_ many times for RCU
latency to be an issue.
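
Something like this (rough sketch only; it assumes that a clear bit in
mm_cpumask() means that CPU cannot be running a thread of @mm, which is
the part that would need a memory-ordering argument):

	rcu_read_lock();
	for_each_cpu(cpu, mm_cpumask(mm)) {
		struct task_struct *p;

		if (cpu == raw_smp_processor_id())
			continue;
		p = task_rcu_dereference(&cpu_rq(cpu)->curr);
		if (p && p->mm == mm)
			atomic_or(atomic_read(&mm->membarrier_state),
				  &p->membarrier_state);
	}
	rcu_read_unlock();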

              Linus


* Re: [RFC PATCH 1/2] Fix: sched/membarrier: p->mm->membarrier_state racy load
  2019-09-03 20:24 ` [RFC PATCH 1/2] Fix: sched/membarrier: p->mm->membarrier_state racy load Peter Zijlstra
@ 2019-09-03 20:36   ` Linus Torvalds
  2019-09-04 15:19     ` Mathieu Desnoyers
  2019-09-03 20:41   ` Mathieu Desnoyers
  1 sibling, 1 reply; 24+ messages in thread
From: Linus Torvalds @ 2019-09-03 20:36 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Mathieu Desnoyers, Paul E. McKenney, Linux List Kernel Mailing,
	Oleg Nesterov, Eric W. Biederman, Russell King - ARM Linux admin,
	Chris Metcalf, Christoph Lameter, Kirill Tkhai, Mike Galbraith,
	Thomas Gleixner, Ingo Molnar

On Tue, Sep 3, 2019 at 1:25 PM Peter Zijlstra <peterz@infradead.org> wrote:
>
> Why can't we frob this state into a line/word we already have to
> unconditionally touch, like the thread_info::flags word for example.

I agree, but we don't have any easily used flags left, I think.

But yes, it would be better to not have membarrier always dirty
another cacheline in the scheduler. So instead of

        atomic_set(&t->membarrier_state,
                   atomic_read(&t->mm->membarrier_state));

it might be better to do something like

        if (mm->membarrier_state)
                atomic_or(&t->membarrier_state, mm->membarrier_state);

or something along those lines - I think we've already brought in the
'mm' struct into the cache anyway, and we'd not do the write (and
dirty the destination cacheline) for the common case of no membarrier
usage.

But yes, it would be better still if we can re-use some already dirty
cache state.

I wonder if the easiest model might be to just use a percpu variable
instead for the membarrier stuff? It's not like it has to be in
'struct task_struct' at all, I think. We only care about the current
runqueues, and those are percpu anyway.
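
Just as a sketch (names made up): a plain percpu copy that the
scheduler keeps in sync and that membarrier_global_expedited() reads,
instead of peeking at p->mm at all:

	DEFINE_PER_CPU(int, membarrier_state);

	/* in the membarrier_global_expedited() loop: */
	if (READ_ONCE(per_cpu(membarrier_state, cpu)) &
	    MEMBARRIER_STATE_GLOBAL_EXPEDITED)
		__cpumask_set_cpu(cpu, tmpmask);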

             Linus


* Re: [RFC PATCH 1/2] Fix: sched/membarrier: p->mm->membarrier_state racy load
  2019-09-03 20:24 ` [RFC PATCH 1/2] Fix: sched/membarrier: p->mm->membarrier_state racy load Peter Zijlstra
  2019-09-03 20:36   ` Linus Torvalds
@ 2019-09-03 20:41   ` Mathieu Desnoyers
  2019-09-04 11:28     ` Peter Zijlstra
  1 sibling, 1 reply; 24+ messages in thread
From: Mathieu Desnoyers @ 2019-09-03 20:41 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: paulmck, Linus Torvalds, linux-kernel, Oleg Nesterov,
	Eric W. Biederman, Russell King, ARM Linux, Chris Metcalf,
	Chris Lameter, Kirill Tkhai, Mike Galbraith, Thomas Gleixner,
	Ingo Molnar

----- On Sep 3, 2019, at 4:24 PM, Peter Zijlstra peterz@infradead.org wrote:

> On Tue, Sep 03, 2019 at 04:11:34PM -0400, Mathieu Desnoyers wrote:
> 
>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>> index 9f51932bd543..e24d52a4c37a 100644
>> --- a/include/linux/sched.h
>> +++ b/include/linux/sched.h
>> @@ -1130,6 +1130,10 @@ struct task_struct {
>>  	unsigned long			numa_pages_migrated;
>>  #endif /* CONFIG_NUMA_BALANCING */
>>  
>> +#ifdef CONFIG_MEMBARRIER
>> +	atomic_t membarrier_state;
>> +#endif
>> +
>>  #ifdef CONFIG_RSEQ
>>  	struct rseq __user *rseq;
>>  	u32 rseq_sig;
>> diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
>> index 4a7944078cc3..3577cd7b3dbb 100644
>> --- a/include/linux/sched/mm.h
>> +++ b/include/linux/sched/mm.h
>> @@ -371,7 +371,17 @@ static inline void
>> membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
>>  static inline void membarrier_execve(struct task_struct *t)
>>  {
>>  	atomic_set(&t->mm->membarrier_state, 0);
>> +	atomic_set(&t->membarrier_state, 0);
>>  }
>> +
>> +static inline void membarrier_prepare_task_switch(struct task_struct *t)
>> +{
>> +	if (!t->mm)
>> +		return;
>> +	atomic_set(&t->membarrier_state,
>> +		   atomic_read(&t->mm->membarrier_state));
>> +}
>> +
> 
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index 010d578118d6..8d4f1f20db15 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -3038,6 +3038,7 @@ prepare_task_switch(struct rq *rq, struct task_struct
>> *prev,
>>  	perf_event_task_sched_out(prev, next);
>>  	rseq_preempt(prev);
>>  	fire_sched_out_preempt_notifiers(prev, next);
>> +	membarrier_prepare_task_switch(next);
>>  	prepare_task(next);
>>  	prepare_arch_switch(next);
>>  }
> 
> 
> Yuck yuck yuck..
> 
> so the problem I have with this is that we add yet another cacheline :/
> 
> Why can't we frob this state into a line/word we already have to
> unconditionally touch, like the thread_info::flags word for example.
> 
> The above also does the store unconditionally, even though, in the most
> common case, it won't have to.

This approach would require to reserve TIF flags in each supported
architecture, which I would like to avoid if possible.

As discussed on IRC, one alternative for the multi-threaded case would
be to grab the task list lock and iterate over all existing tasks to
set the bit, so we don't have to touch an extra cache line from the
scheduler.

In order to keep the common single-threaded library constructor case
fast, we simply set the bit in the current task struct, and rely on
clone() propagating the flag to children threads (which it already
does).

Thanks,

Mathieu

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com


* Re: [RFC PATCH 1/2] Fix: sched/membarrier: p->mm->membarrier_state racy load
  2019-09-03 20:27 ` Linus Torvalds
@ 2019-09-03 20:53   ` Mathieu Desnoyers
  0 siblings, 0 replies; 24+ messages in thread
From: Mathieu Desnoyers @ 2019-09-03 20:53 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: paulmck, Peter Zijlstra, linux-kernel, Oleg Nesterov,
	Eric W. Biederman, Russell King, ARM Linux, Chris Metcalf,
	Chris Lameter, Kirill Tkhai, Mike Galbraith, Thomas Gleixner,
	Ingo Molnar

----- On Sep 3, 2019, at 4:27 PM, Linus Torvalds torvalds@linux-foundation.org wrote:

> On Tue, Sep 3, 2019 at 1:11 PM Mathieu Desnoyers
> <mathieu.desnoyers@efficios.com> wrote:
>>
>> +       cpus_read_lock();
>> +       for_each_online_cpu(cpu) {
> 
> This would likely be better off using mm_cpumask(mm) instead of all
> online CPU's.

I've considered using mm_cpumask(mm) in the original implementation of
the membarrier expedited private command, and chose to stick to online
cpu mask instead.

Here was my off-list justification to Peter Zijlstra and Paul E. McKenney:

  If we have an iteration on mm_cpumask in the membarrier code,
  then we additionally need to document that memory barriers are
  required before and/or after all updates to the mm_cpumask, otherwise
  I think we end up in the same situation as with the rq->curr update.
  [...]
  So we'd be sprinkling even more memory barrier comments all over.

Considering the amount of comments that needed to be added around the
scheduler rq->curr update for membarrier, I'm concerned that the amount
of additional analysis, documentation, and design constraints required
to safely use mm_cpumask() from membarrier is not really worth it
compared to iterating on online cpus with cpu hotplug read lock held.

> 
> Plus doing the rcu_read_lock() inside the loop seems pointless. Even
> with a lot of cores, it's not going to loop _that_ many times for RCU
> latency to be an issue.

Good point! I'll keep that in mind for next round if we don't chose an
entirely different way forward.

Thanks,

Mathieu

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com


* Re: [RFC PATCH 1/2] Fix: sched/membarrier: p->mm->membarrier_state racy load
  2019-09-03 20:11 [RFC PATCH 1/2] Fix: sched/membarrier: p->mm->membarrier_state racy load Mathieu Desnoyers
                   ` (2 preceding siblings ...)
  2019-09-03 20:27 ` Linus Torvalds
@ 2019-09-04 10:53 ` Oleg Nesterov
  2019-09-04 11:39   ` Peter Zijlstra
  2019-09-04 15:24   ` Mathieu Desnoyers
  2019-09-04 11:11 ` Oleg Nesterov
  4 siblings, 2 replies; 24+ messages in thread
From: Oleg Nesterov @ 2019-09-04 10:53 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Paul E. McKenney, Peter Zijlstra, Linus Torvalds, linux-kernel,
	Eric W. Biederman, Russell King - ARM Linux admin, Chris Metcalf,
	Christoph Lameter, Kirill Tkhai, Mike Galbraith, Thomas Gleixner,
	Ingo Molnar

On 09/03, Mathieu Desnoyers wrote:
>
> @@ -1130,6 +1130,10 @@ struct task_struct {
>  	unsigned long			numa_pages_migrated;
>  #endif /* CONFIG_NUMA_BALANCING */
>
> +#ifdef CONFIG_MEMBARRIER
> +	atomic_t membarrier_state;
> +#endif

...

> +static inline void membarrier_prepare_task_switch(struct task_struct *t)
> +{
> +	if (!t->mm)
> +		return;
> +	atomic_set(&t->membarrier_state,
> +		   atomic_read(&t->mm->membarrier_state));
> +}

Why not

	rq->membarrier_state = next->mm ? t->mm->membarrier_state : 0;

and

	if (cpu_rq(cpu)->membarrier_state & MEMBARRIER_STATE_GLOBAL_EXPEDITED) {
		...
	}

in membarrier_global_expedited() ? (I removed atomic_ to simplify)

IOW, why this new member has to live in task_struct, not in rq?

Oleg.



* Re: [RFC PATCH 1/2] Fix: sched/membarrier: p->mm->membarrier_state racy load
  2019-09-03 20:11 [RFC PATCH 1/2] Fix: sched/membarrier: p->mm->membarrier_state racy load Mathieu Desnoyers
                   ` (3 preceding siblings ...)
  2019-09-04 10:53 ` Oleg Nesterov
@ 2019-09-04 11:11 ` Oleg Nesterov
  2019-09-04 16:11   ` Mathieu Desnoyers
  2019-09-08 13:46   ` Mathieu Desnoyers
  4 siblings, 2 replies; 24+ messages in thread
From: Oleg Nesterov @ 2019-09-04 11:11 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Paul E. McKenney, Peter Zijlstra, Linus Torvalds, linux-kernel,
	Eric W. Biederman, Russell King - ARM Linux admin, Chris Metcalf,
	Christoph Lameter, Kirill Tkhai, Mike Galbraith, Thomas Gleixner,
	Ingo Molnar

with or without these changes...

Why do membarrier_register_*_expedited() check get_nr_threads() == 1?
This makes no sense to me, atomic_read(mm_users) == 1 should be enough.


And I am not sure I understand membarrier_mm_sync_core_before_usermode().
OK, membarrier_private_expedited() can race with user -> kernel -> user
transition, but we do not care unless both user's above have the same mm?
Shouldn't membarrier_mm_sync_core_before_usermode() do

	if (current->mm != mm)
		return;

at the start to make it more clear and avoid sync_core_before_usermode()
if possible?

Oleg.



* Re: [RFC PATCH 1/2] Fix: sched/membarrier: p->mm->membarrier_state racy load
  2019-09-03 20:41   ` Mathieu Desnoyers
@ 2019-09-04 11:28     ` Peter Zijlstra
  2019-09-04 11:49       ` Peter Zijlstra
  2019-09-04 12:03       ` Oleg Nesterov
  0 siblings, 2 replies; 24+ messages in thread
From: Peter Zijlstra @ 2019-09-04 11:28 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: paulmck, Linus Torvalds, linux-kernel, Oleg Nesterov,
	Eric W. Biederman, Russell King, ARM Linux, Chris Metcalf,
	Chris Lameter, Kirill Tkhai, Mike Galbraith, Thomas Gleixner,
	Ingo Molnar

On Tue, Sep 03, 2019 at 04:41:47PM -0400, Mathieu Desnoyers wrote:
> As discussed on IRC, one alternative for the multi-threaded case would
> be to grab the task list lock and iterate over all existing tasks to
> set the bit, so we don't have to touch an extra cache line from the
> scheduler.
> 
> In order to keep the common single-threaded library constructor case
> fast, we simply set the bit in the current task struct, and rely on
> clone() propagating the flag to children threads (which it already
> does).

Something like the completely untested thing below.

And yes, that do_each_thread/while_each_thread thing is unfortunate and
yuck too, but supposedly that's a slow path not many people are expected
to hit anyway, right?

---
 include/linux/sched.h     |  4 ++++
 kernel/sched/membarrier.c | 20 +++++++++++++++++---
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 33b310a826d7..dbafafb8ef40 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1136,6 +1136,10 @@ struct task_struct {
 	unsigned long			numa_pages_migrated;
 #endif /* CONFIG_NUMA_BALANCING */
 
+#ifdef CONFIG_MEMBARRIER
+	atomic_t			membarrier_state;
+#endif
+
 #ifdef CONFIG_RSEQ
 	struct rseq __user *rseq;
 	u32 rseq_sig;
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index aa8d75804108..961f6affbf38 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -72,8 +72,8 @@ static int membarrier_global_expedited(void)
 
 		rcu_read_lock();
 		p = task_rcu_dereference(&cpu_rq(cpu)->curr);
-		if (p && p->mm && (atomic_read(&p->mm->membarrier_state) &
-				   MEMBARRIER_STATE_GLOBAL_EXPEDITED)) {
+		if (p && (atomic_read(&p->membarrier_state) &
+			  MEMBARRIER_STATE_GLOBAL_EXPEDITED)) {
 			if (!fallback)
 				__cpumask_set_cpu(cpu, tmpmask);
 			else
@@ -185,7 +185,9 @@ static int membarrier_register_global_expedited(void)
 	if (atomic_read(&mm->membarrier_state) &
 	    MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
 		return 0;
+
 	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
+	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &p->membarrier_state);
 	if (atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1) {
 		/*
 		 * For single mm user, single threaded process, we can
@@ -196,6 +198,17 @@ static int membarrier_register_global_expedited(void)
 		 */
 		smp_mb();
 	} else {
+		struct task_struct *g, *t;
+
+		read_lock(&tasklist_lock);
+		do_each_thread(g, t) {
+			if (t->mm == mm) {
+				atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED,
+					  &t->membarrier_state);
+			}
+		} while_each_thread(g, t);
+		read_unlock(&tasklist_lock);
+
 		/*
 		 * For multi-mm user threads, we need to ensure all
 		 * future scheduler executions will observe the new
@@ -229,9 +242,10 @@ static int membarrier_register_private_expedited(int flags)
 	if (atomic_read(&mm->membarrier_state) & state)
 		return 0;
 	atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state);
-	if (flags & MEMBARRIER_FLAG_SYNC_CORE)
+	if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
 		atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE,
 			  &mm->membarrier_state);
+	}
 	if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) {
 		/*
 		 * Ensure all future scheduler executions will observe the


* Re: [RFC PATCH 1/2] Fix: sched/membarrier: p->mm->membarrier_state racy load
  2019-09-04 10:53 ` Oleg Nesterov
@ 2019-09-04 11:39   ` Peter Zijlstra
  2019-09-04 15:24   ` Mathieu Desnoyers
  1 sibling, 0 replies; 24+ messages in thread
From: Peter Zijlstra @ 2019-09-04 11:39 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Mathieu Desnoyers, Paul E. McKenney, Linus Torvalds,
	linux-kernel, Eric W. Biederman, Russell King - ARM Linux admin,
	Chris Metcalf, Christoph Lameter, Kirill Tkhai, Mike Galbraith,
	Thomas Gleixner, Ingo Molnar

On Wed, Sep 04, 2019 at 12:53:49PM +0200, Oleg Nesterov wrote:
> On 09/03, Mathieu Desnoyers wrote:
> >
> > @@ -1130,6 +1130,10 @@ struct task_struct {
> >  	unsigned long			numa_pages_migrated;
> >  #endif /* CONFIG_NUMA_BALANCING */
> >
> > +#ifdef CONFIG_MEMBARRIER
> > +	atomic_t membarrier_state;
> > +#endif
> 
> ...
> 
> > +static inline void membarrier_prepare_task_switch(struct task_struct *t)
> > +{
> > +	if (!t->mm)
> > +		return;
> > +	atomic_set(&t->membarrier_state,
> > +		   atomic_read(&t->mm->membarrier_state));
> > +}
> 
> Why not
> 
> 	rq->membarrier_state = next->mm ? t->mm->membarrier_state : 0;
> 
> and
> 
> 	if (cpu_rq(cpu)->membarrier_state & MEMBARRIER_STATE_GLOBAL_EXPEDITED) {
> 		...
> 	}
> 
> in membarrier_global_expedited() ? (I removed atomic_ to simplify)
> 
> IOW, why this new member has to live in task_struct, not in rq?

It could be like the above; but then we're still reading
mm->membarrier_state in the scheduler path.

The patch I just sent avoids even that, at the cost of doing a
do_each_thread/while_each_thread iteration in the uncommon case where we
register a process after it grows threads.


* Re: [RFC PATCH 1/2] Fix: sched/membarrier: p->mm->membarrier_state racy load
  2019-09-04 11:28     ` Peter Zijlstra
@ 2019-09-04 11:49       ` Peter Zijlstra
  2019-09-04 15:26         ` Mathieu Desnoyers
  2019-09-04 12:03       ` Oleg Nesterov
  1 sibling, 1 reply; 24+ messages in thread
From: Peter Zijlstra @ 2019-09-04 11:49 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: paulmck, Linus Torvalds, linux-kernel, Oleg Nesterov,
	Eric W. Biederman, Russell King, ARM Linux, Chris Metcalf,
	Chris Lameter, Kirill Tkhai, Mike Galbraith, Thomas Gleixner,
	Ingo Molnar

On Wed, Sep 04, 2019 at 01:28:19PM +0200, Peter Zijlstra wrote:
> @@ -196,6 +198,17 @@ static int membarrier_register_global_expedited(void)
>  		 */
>  		smp_mb();
>  	} else {
> +		struct task_struct *g, *t;
> +
> +		read_lock(&tasklist_lock);
> +		do_each_thread(g, t) {
> +			if (t->mm == mm) {
> +				atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED,
> +					  &t->membarrier_state);
> +			}
> +		} while_each_thread(g, t);
> +		read_unlock(&tasklist_lock);
> +
>  		/*
>  		 * For multi-mm user threads, we need to ensure all
>  		 * future scheduler executions will observe the new

Arguably, because this is exposed to unpriv users and is a potential
preemption latency issue, we could do it in 3 passes:

	- RCU, mark all found lacking, count
	- RCU, mark all found lacking, count
	- if count of last pass, tasklist_lock

That way, it becomes much harder to trigger the bad case.
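
Completely untested, but something like:

	struct task_struct *g, *t;
	int missing, pass;

	for (pass = 0; pass < 2; pass++) {
		missing = 0;
		rcu_read_lock();
		for_each_process_thread(g, t) {
			if (t->mm != mm)
				continue;
			if (atomic_read(&t->membarrier_state) &
			    MEMBARRIER_STATE_GLOBAL_EXPEDITED)
				continue;
			atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED,
				  &t->membarrier_state);
			missing++;
		}
		rcu_read_unlock();
	}

	/* still found stragglers? take the lock and do it for real */
	if (missing) {
		read_lock(&tasklist_lock);
		for_each_process_thread(g, t) {
			if (t->mm == mm)
				atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED,
					  &t->membarrier_state);
		}
		read_unlock(&tasklist_lock);
	}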

Do we worry about that?


* Re: [RFC PATCH 1/2] Fix: sched/membarrier: p->mm->membarrier_state racy load
  2019-09-04 11:28     ` Peter Zijlstra
  2019-09-04 11:49       ` Peter Zijlstra
@ 2019-09-04 12:03       ` Oleg Nesterov
  2019-09-04 12:43         ` Peter Zijlstra
  1 sibling, 1 reply; 24+ messages in thread
From: Oleg Nesterov @ 2019-09-04 12:03 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Mathieu Desnoyers, paulmck, Linus Torvalds, linux-kernel,
	Eric W. Biederman, Russell King, ARM Linux, Chris Metcalf,
	Chris Lameter, Kirill Tkhai, Mike Galbraith, Thomas Gleixner,
	Ingo Molnar

On 09/04, Peter Zijlstra wrote:
>
> +		struct task_struct *g, *t;
> +
> +		read_lock(&tasklist_lock);
> +		do_each_thread(g, t) {

for_each_process_thread() looks better

> +			if (t->mm == mm) {
> +				atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED,
> +					  &t->membarrier_state);
> +			}

then you also need to change dup_task_struct(), it should clear
->membarrier_state unless CLONE_VM.

And probably unuse_mm() should clear current->membarrier_state too.

Hmm. And it can race with copy_process() anyway, tasklist_lock can't
really help. So copy_process() needs to do

	write_lock_irq(&tasklist_lock);
	...

	if (clone_flags & CLONE_VM)
		p->membarrier_state = current->membarrier_state;
	else
		p->membarrier_state = 0;

Oleg.



* Re: [RFC PATCH 1/2] Fix: sched/membarrier: p->mm->membarrier_state racy load
  2019-09-04 12:03       ` Oleg Nesterov
@ 2019-09-04 12:43         ` Peter Zijlstra
  2019-09-04 13:17           ` Oleg Nesterov
  0 siblings, 1 reply; 24+ messages in thread
From: Peter Zijlstra @ 2019-09-04 12:43 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: Mathieu Desnoyers, paulmck, Linus Torvalds, linux-kernel,
	Eric W. Biederman, Russell King, ARM Linux, Chris Metcalf,
	Chris Lameter, Kirill Tkhai, Mike Galbraith, Thomas Gleixner,
	Ingo Molnar

On Wed, Sep 04, 2019 at 02:03:37PM +0200, Oleg Nesterov wrote:
> On 09/04, Peter Zijlstra wrote:
> >
> > +		struct task_struct *g, *t;
> > +
> > +		read_lock(&tasklist_lock);
> > +		do_each_thread(g, t) {
> 
> for_each_process_thread() looks better

Argh, I always get confused. Why do we have multiple versions of this
again?

> > +			if (t->mm == mm) {
> > +				atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED,
> > +					  &t->membarrier_state);
> > +			}
> 
> then you also need to change dup_task_struct(), it should clear
> ->membarrier_state unless CLONE_VM.

Or, as you suggest below.

> And probably unuse_mm() should clear current->membarrier_state too.

How about we hard exclude PF_KTHREAD and ignore {,un}use_mm() entirely?

> Hmm. And it can race with copy_process() anyway, tasklist_lock can't
> really help. So copy_process() needs to do
> 
> 	write_lock_irq(&tasklist_lock);
> 	...
> 
> 	if (clone_flags & CLONE_VM)
> 		p->membarrier_state = current->membarrier_state;
> 	else
> 		p->membarrier_state = 0;

Right you are.


* Re: [RFC PATCH 1/2] Fix: sched/membarrier: p->mm->membarrier_state racy load
  2019-09-04 12:43         ` Peter Zijlstra
@ 2019-09-04 13:17           ` Oleg Nesterov
  0 siblings, 0 replies; 24+ messages in thread
From: Oleg Nesterov @ 2019-09-04 13:17 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Mathieu Desnoyers, paulmck, Linus Torvalds, linux-kernel,
	Eric W. Biederman, Russell King, ARM Linux, Chris Metcalf,
	Chris Lameter, Kirill Tkhai, Mike Galbraith, Thomas Gleixner,
	Ingo Molnar

On 09/04, Peter Zijlstra wrote:
>
> On Wed, Sep 04, 2019 at 02:03:37PM +0200, Oleg Nesterov wrote:
> > On 09/04, Peter Zijlstra wrote:
> > >
> > > +		struct task_struct *g, *t;
> > > +
> > > +		read_lock(&tasklist_lock);
> > > +		do_each_thread(g, t) {
> > 
> > for_each_process_thread() looks better
> 
> Argh, I always get confused. Why do we have multiple versions of this
> again?

Because I am lazy ;)

but actually for_each_process_thread() is suboptimal, I think you need

	for_each_process(p) {
		if (p->flags & PF_KTHREAD)
			continue;

		if (p->mm != mm)
			continue;

		for_each_thread(p, t)
			atomic_or(t->membarrier_state, ...);
	}

to avoid an unnecessary each-thread walk when the group leader has another ->mm.

Unfortunately a zombie leader has ->mm == NULL, so perhaps something like

	for_each_process(p) {
		if (p->flags & PF_KTHREAD)
			continue;

		for_each_thread(p, t) {
			if (unlikely(!t->mm))
				continue;
			if (t->mm != mm)
				break;
			atomic_or(t->membarrier_state, ...);
		}
	}

and do we really need the new atomic_t member? Can we use t->atomic_flags
and add PFA_MEMBARRIER_EXPEDITED ?
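
Something like this (bit number made up, assuming one is still free):

	#define PFA_MEMBARRIER_EXPEDITED	8	/* mm registered GLOBAL_EXPEDITED */

	TASK_PFA_TEST(MEMBARRIER_EXPEDITED, membarrier_expedited)
	TASK_PFA_SET(MEMBARRIER_EXPEDITED, membarrier_expedited)

then registration does task_set_membarrier_expedited(t) for every
matching thread, and membarrier_global_expedited() only needs
task_membarrier_expedited(p) instead of a new atomic_t.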

Oleg.



* Re: [RFC PATCH 1/2] Fix: sched/membarrier: p->mm->membarrier_state racy load
  2019-09-03 20:36   ` Linus Torvalds
@ 2019-09-04 15:19     ` Mathieu Desnoyers
  2019-09-04 16:09       ` Peter Zijlstra
  0 siblings, 1 reply; 24+ messages in thread
From: Mathieu Desnoyers @ 2019-09-04 15:19 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Peter Zijlstra, paulmck, linux-kernel, Oleg Nesterov,
	Eric W. Biederman, Russell King, ARM Linux, Chris Metcalf,
	Chris Lameter, Kirill Tkhai, Mike Galbraith, Thomas Gleixner,
	Ingo Molnar

----- On Sep 3, 2019, at 4:36 PM, Linus Torvalds torvalds@linux-foundation.org wrote:

> On Tue, Sep 3, 2019 at 1:25 PM Peter Zijlstra <peterz@infradead.org> wrote:
>>
>> Why can't we frob this state into a line/word we already have to
>> unconditionally touch, like the thread_info::flags word for example.
> 
> I agree, but we don't have any easily used flags left, I think.
> 
> But yes, it would be better to not have membarrier always dirty
> another cacheline in the scheduler. So instead of
> 
>        atomic_set(&t->membarrier_state,
>                   atomic_read(&t->mm->membarrier_state));
> 
> it might be better to do something like
> 
>        if (mm->membarrier_state)
>                atomic_or(&t->membarrier_state, mm->membarrier_state);
> 
> or something along those lines - I think we've already brought in the
> 'mm' struct into the cache anyway, and we'd not do the write (and
> dirty the destination cacheline) for the common case of no membarrier
> usage.
> 
> But yes, it would be better still if we can re-use some already dirty
> cache state.

Considering the alternative proposed by PeterZ, which is to iterate over
all processes/threads from an unprivileged process, I would be tempted
to put some more thought into the mm->membarrier_state cache-line. Do
we expect it to be typically hot ? Is there anything we can do to move
this field into a typically hot mm cacheline ?

I agree with your approach aiming to typically just load that field
(no store in the common case).

> 
> I wonder if the easiest model might be to just use a percpu variable
> instead for the membarrier stuff? It's not like it has to be in
> 'struct task_struct' at all, I think. We only care about the current
> runqueues, and those are percpu anyway.

One issue here is that membarrier iterates over all runqueues without
grabbing any runqueue lock. If we copy that state from mm to rq on
sched switch prepare, we would need to ensure we have the proper
memory barriers between:

prior user-space memory accesses  /  setting the runqueue membarrier state

and

setting the runqueue membarrier state / following user-space memory accesses

Copying the membarrier state into the task struct leverages the fact that
we have documented and guaranteed those barriers around the rq->curr update
in the scheduler.

Thanks,

Mathieu

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com


* Re: [RFC PATCH 1/2] Fix: sched/membarrier: p->mm->membarrier_state racy load
  2019-09-04 10:53 ` Oleg Nesterov
  2019-09-04 11:39   ` Peter Zijlstra
@ 2019-09-04 15:24   ` Mathieu Desnoyers
  1 sibling, 0 replies; 24+ messages in thread
From: Mathieu Desnoyers @ 2019-09-04 15:24 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: paulmck, Peter Zijlstra, Linus Torvalds, linux-kernel,
	Eric W. Biederman, Russell King, ARM Linux, Chris Metcalf,
	Chris Lameter, Kirill Tkhai, Mike Galbraith, Thomas Gleixner,
	Ingo Molnar

----- On Sep 4, 2019, at 6:53 AM, Oleg Nesterov oleg@redhat.com wrote:

> On 09/03, Mathieu Desnoyers wrote:
>>
>> @@ -1130,6 +1130,10 @@ struct task_struct {
>>  	unsigned long			numa_pages_migrated;
>>  #endif /* CONFIG_NUMA_BALANCING */
>>
>> +#ifdef CONFIG_MEMBARRIER
>> +	atomic_t membarrier_state;
>> +#endif
> 
> ...
> 
>> +static inline void membarrier_prepare_task_switch(struct task_struct *t)
>> +{
>> +	if (!t->mm)
>> +		return;
>> +	atomic_set(&t->membarrier_state,
>> +		   atomic_read(&t->mm->membarrier_state));
>> +}
> 
> Why not
> 
>	rq->membarrier_state = next->mm ? t->mm->membarrier_state : 0;
> 
> and
> 
>	if (cpu_rq(cpu)->membarrier_state & MEMBARRIER_STATE_GLOBAL_EXPEDITED) {
>		...
>	}
> 
> in membarrier_global_expedited() ? (I removed atomic_ to simplify)
> 
> IOW, why this new member has to live in task_struct, not in rq?

As replied to Linus, if we copy the membarrier_state into the rq, we'd need
to ensure we have full memory barriers between:

prior user-space memory accesses  /  setting the runqueue membarrier state

and

setting the runqueue membarrier state / following user-space memory accesses

Because membarrier does not take any runqueue lock when it iterates over
runqueues.

I try to avoid putting too many memory barrier constraints on the scheduler
for membarrier, but if it's really the way forward it could be done.

And the basic question remains: is it acceptable performance-wise to load
mm->membarrier_state from sched switch prepare?

Thanks,

Mathieu


-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com


* Re: [RFC PATCH 1/2] Fix: sched/membarrier: p->mm->membarrier_state racy load
  2019-09-04 11:49       ` Peter Zijlstra
@ 2019-09-04 15:26         ` Mathieu Desnoyers
  0 siblings, 0 replies; 24+ messages in thread
From: Mathieu Desnoyers @ 2019-09-04 15:26 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: paulmck, Linus Torvalds, linux-kernel, Oleg Nesterov,
	Eric W. Biederman, Russell King, ARM Linux, Chris Metcalf,
	Chris Lameter, Kirill Tkhai, Mike Galbraith, Thomas Gleixner,
	Ingo Molnar

----- On Sep 4, 2019, at 7:49 AM, Peter Zijlstra peterz@infradead.org wrote:

> On Wed, Sep 04, 2019 at 01:28:19PM +0200, Peter Zijlstra wrote:
>> @@ -196,6 +198,17 @@ static int membarrier_register_global_expedited(void)
>>  		 */
>>  		smp_mb();
>>  	} else {
>> +		struct task_struct *g, *t;
>> +
>> +		read_lock(&tasklist_lock);
>> +		do_each_thread(g, t) {
>> +			if (t->mm == mm) {
>> +				atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED,
>> +					  &t->membarrier_state);
>> +			}
>> +		} while_each_thread(g, t);
>> +		read_unlock(&tasklist_lock);
>> +
>>  		/*
>>  		 * For multi-mm user threads, we need to ensure all
>>  		 * future scheduler executions will observe the new
> 
> Arguably, because this is exposed to unpriv users and a potential
> preemption latency issue, we could do it in 3 passes:
> 
>	- RCU, mark all found lacking, count
>	- RCU, mark all found lacking, count
>	- if count of last pass, tasklist_lock
> 
> That way, it becomes much harder to trigger the bad case.
> 
> Do we worry about that?

Allowing unprivileged processes to iterate over all processes/threads
with the tasklist lock held is something I try to avoid.

Thanks,

Mathieu


-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com


* Re: [RFC PATCH 1/2] Fix: sched/membarrier: p->mm->membarrier_state racy load
  2019-09-04 15:19     ` Mathieu Desnoyers
@ 2019-09-04 16:09       ` Peter Zijlstra
  2019-09-04 17:12         ` Mathieu Desnoyers
  0 siblings, 1 reply; 24+ messages in thread
From: Peter Zijlstra @ 2019-09-04 16:09 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Linus Torvalds, paulmck, linux-kernel, Oleg Nesterov,
	Eric W. Biederman, Russell King, ARM Linux, Chris Metcalf,
	Chris Lameter, Kirill Tkhai, Mike Galbraith, Thomas Gleixner,
	Ingo Molnar

On Wed, Sep 04, 2019 at 11:19:00AM -0400, Mathieu Desnoyers wrote:
> ----- On Sep 3, 2019, at 4:36 PM, Linus Torvalds torvalds@linux-foundation.org wrote:

> > I wonder if the easiest model might be to just use a percpu variable
> > instead for the membarrier stuff? It's not like it has to be in
> > 'struct task_struct' at all, I think. We only care about the current
> > runqueues, and those are percpu anyway.
> 
> One issue here is that membarrier iterates over all runqueues without
> grabbing any runqueue lock. If we copy that state from mm to rq on
> sched switch prepare, we would need to ensure we have the proper
> memory barriers between:
> 
> prior user-space memory accesses  /  setting the runqueue membarrier state
> 
> and
> 
> setting the runqueue membarrier state / following user-space memory accesses
> 
> Copying the membarrier state into the task struct leverages the fact that
> we have documented and guaranteed those barriers around the rq->curr update
> in the scheduler.

Should be the same as the barriers we already rely on for rq->curr, no?
That is, if we put this before switch_mm() then we have
smp_mb__after_spinlock() and switch_mm() itself.

Also, if we place mm->membarrier_state in the same cacheline as mm->pgd
(which switch_mm() is bound to load) then we should be fine, I think.
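
I.e. something like (just a sketch):

	struct mm_struct {
		...
		pgd_t * pgd;
	#ifdef CONFIG_MEMBARRIER
		/*
		 * switch_mm() loads ->pgd anyway, so reading this in the
		 * switch path should then be (nearly) free.
		 */
		atomic_t membarrier_state;
	#endif
		...
	};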


* Re: [RFC PATCH 1/2] Fix: sched/membarrier: p->mm->membarrier_state racy load
  2019-09-04 11:11 ` Oleg Nesterov
@ 2019-09-04 16:11   ` Mathieu Desnoyers
  2019-09-08 13:46   ` Mathieu Desnoyers
  1 sibling, 0 replies; 24+ messages in thread
From: Mathieu Desnoyers @ 2019-09-04 16:11 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: paulmck, Peter Zijlstra, Linus Torvalds, linux-kernel,
	Eric W. Biederman, Russell King, ARM Linux, Chris Metcalf,
	Chris Lameter, Kirill Tkhai, Mike Galbraith, Thomas Gleixner,
	Ingo Molnar

----- On Sep 4, 2019, at 7:11 AM, Oleg Nesterov oleg@redhat.com wrote:

> with or without these changes...
> 
> Why do membarrier_register_*_expedited() check get_nr_threads() == 1?
> This makes no sense to me, atomic_read(mm_users) == 1 should be enough.

Indeed, if every thread within a process hold a mm_users refcount, then
the get_nr_threads() == 1 check becomes redundant.

AFAIR, this check started out as "get_nr_threads() == 1", and then I changed
the code to also cover the multi-process CLONE_VM use-case by adding the
additional check.

> And I am not sure I understand membarrier_mm_sync_core_before_usermode().
> OK, membarrier_private_expedited() can race with user -> kernel -> user
> transition, but we do not care unless both users above have the same mm?
> Shouldn't membarrier_mm_sync_core_before_usermode() do
> 
>	if (current->mm != mm)
>		return;
> 
> at the start to make it more clear and avoid sync_core_before_usermode()
> if possible?

Indeed, if we have taskA -> kernel -> taskB, it implies that we go through
switch_mm() when scheduling taskB, which provides the required core serializing
guarantees.

Moreover, if we look closely at the call to membarrier_mm_sync_core_before_usermode(),
the mm it receives as parameter is the rq->prev_mm. So using the prev_mm membarrier
state to decide whether we need to issue a sync_core before returning to a
different next mm is not really relevant unless the next mm == rq->prev_mm.

Nothing there seems to be actively buggy, but those are indeed nice cleanups.

Thanks,

Mathieu

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com


* Re: [RFC PATCH 1/2] Fix: sched/membarrier: p->mm->membarrier_state racy load
  2019-09-04 16:09       ` Peter Zijlstra
@ 2019-09-04 17:12         ` Mathieu Desnoyers
  2019-09-04 18:26           ` Peter Zijlstra
  0 siblings, 1 reply; 24+ messages in thread
From: Mathieu Desnoyers @ 2019-09-04 17:12 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Linus Torvalds, paulmck, linux-kernel, Oleg Nesterov,
	Eric W. Biederman, Russell King, ARM Linux, Chris Metcalf,
	Chris Lameter, Kirill Tkhai, Mike Galbraith, Thomas Gleixner,
	Ingo Molnar

----- On Sep 4, 2019, at 12:09 PM, Peter Zijlstra peterz@infradead.org wrote:

> On Wed, Sep 04, 2019 at 11:19:00AM -0400, Mathieu Desnoyers wrote:
>> ----- On Sep 3, 2019, at 4:36 PM, Linus Torvalds torvalds@linux-foundation.org
>> wrote:
> 
>> > I wonder if the easiest model might be to just use a percpu variable
>> > instead for the membarrier stuff? It's not like it has to be in
>> > 'struct task_struct' at all, I think. We only care about the current
>> > runqueues, and those are percpu anyway.
>> 
>> One issue here is that membarrier iterates over all runqueues without
>> grabbing any runqueue lock. If we copy that state from mm to rq on
>> sched switch prepare, we would need to ensure we have the proper
>> memory barriers between:
>> 
>> prior user-space memory accesses  /  setting the runqueue membarrier state
>> 
>> and
>> 
>> setting the runqueue membarrier state / following user-space memory accesses
>> 
>> Copying the membarrier state into the task struct leverages the fact that
>> we have documented and guaranteed those barriers around the rq->curr update
>> in the scheduler.
> 
> Should be the same as the barriers we already rely on for rq->curr, no?
> That is, if we put this before switch_mm() then we have
> smp_mb__after_spinlock() and switch_mm() itself.

Yes, I think we can piggy-back on the barriers already documented around the
rq->curr store.

> Also, if we place mm->membarrier_state in the same cacheline as mm->pgd
> (which switch_mm() is bound to load) then we should be fine, I think.

Yes, if we make sure membarrier_prepare_task_switch only updates the
rq->membarrier_state if prev->mm != next->mm, we should be able to avoid
loading next->mm->membarrier_state when switch_mm() is not invoked.
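
Roughly (assuming struct rq grows an int membarrier_state):

	static inline void
	membarrier_prepare_task_switch(struct rq *rq,
				       struct task_struct *prev,
				       struct task_struct *next)
	{
		if (!next->mm || prev->mm == next->mm)
			return;
		WRITE_ONCE(rq->membarrier_state,
			   atomic_read(&next->mm->membarrier_state));
	}

A stale value left behind while a kernel thread runs should at worst
cause a spurious IPI, since membarrier only needs to reach CPUs actually
running user-space threads of the registered mm.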

I'll prepare RFC patch implementing this approach.

Thanks,

Mathieu


-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com


* Re: [RFC PATCH 1/2] Fix: sched/membarrier: p->mm->membarrier_state racy load
  2019-09-04 17:12         ` Mathieu Desnoyers
@ 2019-09-04 18:26           ` Peter Zijlstra
  2019-09-06  0:51             ` Mathieu Desnoyers
  0 siblings, 1 reply; 24+ messages in thread
From: Peter Zijlstra @ 2019-09-04 18:26 UTC (permalink / raw)
  To: Mathieu Desnoyers
  Cc: Linus Torvalds, paulmck, linux-kernel, Oleg Nesterov,
	Eric W. Biederman, Russell King, ARM Linux, Chris Metcalf,
	Chris Lameter, Kirill Tkhai, Mike Galbraith, Thomas Gleixner,
	Ingo Molnar

On Wed, Sep 04, 2019 at 01:12:53PM -0400, Mathieu Desnoyers wrote:
> ----- On Sep 4, 2019, at 12:09 PM, Peter Zijlstra peterz@infradead.org wrote:
> 
> > On Wed, Sep 04, 2019 at 11:19:00AM -0400, Mathieu Desnoyers wrote:
> >> ----- On Sep 3, 2019, at 4:36 PM, Linus Torvalds torvalds@linux-foundation.org
> >> wrote:
> > 
> >> > I wonder if the easiest model might be to just use a percpu variable
> >> > instead for the membarrier stuff? It's not like it has to be in
> >> > 'struct task_struct' at all, I think. We only care about the current
> >> > runqueues, and those are percpu anyway.
> >> 
> >> One issue here is that membarrier iterates over all runqueues without
> >> grabbing any runqueue lock. If we copy that state from mm to rq on
> >> sched switch prepare, we would need to ensure we have the proper
> >> memory barriers between:
> >> 
> >> prior user-space memory accesses  /  setting the runqueue membarrier state
> >> 
> >> and
> >> 
> >> setting the runqueue membarrier state / following user-space memory accesses
> >> 
> >> Copying the membarrier state into the task struct leverages the fact that
> >> we have documented and guaranteed those barriers around the rq->curr update
> >> in the scheduler.
> > 
> > Should be the same as the barriers we already rely on for rq->curr, no?
> > That is, if we put this before switch_mm() then we have
> > smp_mb__after_spinlock() and switch_mm() itself.
> 
> Yes, I think we can piggy-back on the barriers already documented around the
> rq->curr store.
> 
> > Also, if we place mm->membarrier_state in the same cacheline as mm->pgd
> > (which switch_mm() is bound to load) then we should be fine, I think.
> 
> Yes, if we make sure membarrier_prepare_task_switch only updates the
> rq->membarrier_state if prev->mm != next->mm, we should be able to avoid
> loading next->mm->membarrier_state when switch_mm() is not invoked.
> 
> I'll prepare RFC patch implementing this approach.

Thinking about this a bit; switching it 'on' still requires some
thinking. Consider registering on an already threaded process of which
multiple threads are 'current' on multiple CPUs. In that case none of
the rq bits will be set.

Not even synchronize_rcu() is sufficient to force it either, since we
only update on switch_mm() and nothing guarantees we pass that.

One possible approach would be to IPI broadcast (after setting the
->mm->membarrier_state) and having the IPI update the rq state from
'current->mm'.
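
Something like this (untested, and assuming the rq->membarrier_state
copy discussed above):

	static void ipi_sync_rq_state(void *info)
	{
		struct mm_struct *mm = info;

		if (current->mm != mm)
			return;
		WRITE_ONCE(this_rq()->membarrier_state,
			   atomic_read(&mm->membarrier_state));
		/*
		 * Order the rq state store before subsequent user-space
		 * memory accesses on this CPU.
		 */
		smp_mb();
	}

and in the registration path, after updating mm->membarrier_state:

	on_each_cpu(ipi_sync_rq_state, mm, 1);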

But possibly I'm just confusing everyone again. I'm not having a good day
today.


* Re: [RFC PATCH 1/2] Fix: sched/membarrier: p->mm->membarrier_state racy load
  2019-09-04 18:26           ` Peter Zijlstra
@ 2019-09-06  0:51             ` Mathieu Desnoyers
  0 siblings, 0 replies; 24+ messages in thread
From: Mathieu Desnoyers @ 2019-09-06  0:51 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Linus Torvalds, paulmck, linux-kernel, Oleg Nesterov,
	Eric W. Biederman, Russell King, ARM Linux, Chris Metcalf,
	Chris Lameter, Kirill Tkhai, Mike Galbraith, Thomas Gleixner,
	Ingo Molnar

----- On Sep 4, 2019, at 2:26 PM, Peter Zijlstra peterz@infradead.org wrote:

> On Wed, Sep 04, 2019 at 01:12:53PM -0400, Mathieu Desnoyers wrote:
>> ----- On Sep 4, 2019, at 12:09 PM, Peter Zijlstra peterz@infradead.org wrote:
>> 
>> > On Wed, Sep 04, 2019 at 11:19:00AM -0400, Mathieu Desnoyers wrote:
>> >> ----- On Sep 3, 2019, at 4:36 PM, Linus Torvalds torvalds@linux-foundation.org
>> >> wrote:
>> > 
>> >> > I wonder if the easiest model might be to just use a percpu variable
>> >> > instead for the membarrier stuff? It's not like it has to be in
>> >> > 'struct task_struct' at all, I think. We only care about the current
>> >> > runqueues, and those are percpu anyway.
>> >> 
>> >> One issue here is that membarrier iterates over all runqueues without
>> >> grabbing any runqueue lock. If we copy that state from mm to rq on
>> >> sched switch prepare, we would need to ensure we have the proper
>> >> memory barriers between:
>> >> 
>> >> prior user-space memory accesses  /  setting the runqueue membarrier state
>> >> 
>> >> and
>> >> 
>> >> setting the runqueue membarrier state / following user-space memory accesses
>> >> 
>> >> Copying the membarrier state into the task struct leverages the fact that
>> >> we have documented and guaranteed those barriers around the rq->curr update
>> >> in the scheduler.
>> > 
>> > Should be the same as the barriers we already rely on for rq->curr, no?
>> > That is, if we put this before switch_mm() then we have
>> > smp_mb__after_spinlock() and switch_mm() itself.
>> 
>> Yes, I think we can piggy-back on the barriers already documented around the
>> rq->curr store.
>> 
>> > Also, if we place mm->membarrier_state in the same cacheline as mm->pgd
>> > (which switch_mm() is bound to load) then we should be fine, I think.
>> 
>> Yes, if we make sure membarrier_prepare_task_switch only updates the
>> rq->membarrier_state if prev->mm != next->mm, we should be able to avoid
>> loading next->mm->membarrier_state when switch_mm() is not invoked.
>> 
>> I'll prepare RFC patch implementing this approach.
> 
> Thinking about this a bit; switching it 'on' still requires some
> thinking. Consider registering on an already threaded process of which
> multiple threads are 'current' on multiple CPUs. In that case none of
> the rq bits will be set.
> 
> Not even synchronize_rcu() is sufficient to force it either, since we
> only update on switch_mm() and nothing guarantees we pass that.
> 
> One possible approach would be to IPI broadcast (after setting the
> ->mm->membarrier_state) and having the IPI update the rq state from
> 'current->mm'.
> 
> But possibly I'm just confusing everyone again. I'm not having a good day
> today.

Indeed, switching 'on' requires some care. I implemented the IPI-based
approach as per your suggestion.

Thanks,

Mathieu

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com


* Re: [RFC PATCH 1/2] Fix: sched/membarrier: p->mm->membarrier_state racy load
  2019-09-04 11:11 ` Oleg Nesterov
  2019-09-04 16:11   ` Mathieu Desnoyers
@ 2019-09-08 13:46   ` Mathieu Desnoyers
  1 sibling, 0 replies; 24+ messages in thread
From: Mathieu Desnoyers @ 2019-09-08 13:46 UTC (permalink / raw)
  To: Oleg Nesterov
  Cc: paulmck, Peter Zijlstra, Linus Torvalds, linux-kernel,
	Eric W. Biederman, Russell King, ARM Linux, Chris Metcalf,
	Chris Lameter, Kirill Tkhai, Mike Galbraith, Thomas Gleixner,
	Ingo Molnar

----- On Sep 4, 2019, at 12:11 PM, Oleg Nesterov oleg@redhat.com wrote:

> with or without these changes...
> 
> Why do membarrier_register_*_expedited() check get_nr_threads() == 1?
> This makes no sense to me, atomic_read(mm_users) == 1 should be enough.
> 
> 
> And I am not sure I understand membarrier_mm_sync_core_before_usermode().
> OK, membarrier_private_expedited() can race with user -> kernel -> user
> transition, but we do not care unless both users above have the same mm?
> Shouldn't membarrier_mm_sync_core_before_usermode() do
> 
>	if (current->mm != mm)
>		return;
> 
> at the start to make it more clear and avoid sync_core_before_usermode()
> if possible?

I think I missed replying to your email. Indeed, you are right, I've added
2 cleanup patches taking care of this in my latest round.

Thanks,

Mathieu


-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

