* [RFC PATCH] membarrier: expedited private command
@ 2017-07-27 18:59 Mathieu Desnoyers
  2017-07-27 19:55 ` Peter Zijlstra
  2017-07-27 20:06 ` Paul E. McKenney
  0 siblings, 2 replies; 6+ messages in thread
From: Mathieu Desnoyers @ 2017-07-27 18:59 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, Mathieu Desnoyers, Paul E . McKenney, Boqun Feng

Implement MEMBARRIER_CMD_PRIVATE_EXPEDITED with IPIs, using a cpumask built
from all runqueues for which the current thread's mm is the same as our own.

Scheduler-wise, it requires that we add a memory barrier after context
switching between processes (which have different mm).

It would be interesting to benchmark the overhead of this added barrier
on the performance of context switching between processes. If the
preexisting overhead of switching between mm is high enough, the
overhead of adding this extra barrier may be insignificant.

[ Compile-tested only! ]
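
For reference, a minimal user-space sketch of invoking the new command
(illustrative only: it assumes __NR_membarrier from <sys/syscall.h> and
defines the command value locally, since installed uapi headers may
predate this patch):

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Value proposed in this patch; use <linux/membarrier.h> once it is updated. */
#ifndef MEMBARRIER_CMD_PRIVATE_EXPEDITED
#define MEMBARRIER_CMD_PRIVATE_EXPEDITED	(1 << 3)
#endif

static int membarrier(int cmd, int flags)
{
	return syscall(__NR_membarrier, cmd, flags);
}

int main(void)
{
	/* IPI every CPU currently running a thread of this process. */
	if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0)) {
		perror("membarrier");
		return 1;
	}
	return 0;
}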

CC: Peter Zijlstra <peterz@infradead.org>
CC: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
CC: Boqun Feng <boqun.feng@gmail.com>
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
---
 include/uapi/linux/membarrier.h |  8 +++--
 kernel/membarrier.c             | 76 ++++++++++++++++++++++++++++++++++++++++-
 kernel/sched/core.c             | 21 ++++++++++++
 3 files changed, 102 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h
index e0b108bd2624..6a33c5852f6b 100644
--- a/include/uapi/linux/membarrier.h
+++ b/include/uapi/linux/membarrier.h
@@ -40,14 +40,18 @@
  *                          (non-running threads are de facto in such a
  *                          state). This covers threads from all processes
  *                          running on the system. This command returns 0.
+ * TODO: documentation.
  *
  * Command to be passed to the membarrier system call. The commands need to
  * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to
  * the value 0.
  */
 enum membarrier_cmd {
-	MEMBARRIER_CMD_QUERY = 0,
-	MEMBARRIER_CMD_SHARED = (1 << 0),
+	MEMBARRIER_CMD_QUERY			= 0,
+	MEMBARRIER_CMD_SHARED			= (1 << 0),
+	/* reserved for MEMBARRIER_CMD_SHARED_EXPEDITED (1 << 1) */
+	/* reserved for MEMBARRIER_CMD_PRIVATE (1 << 2) */
+	MEMBARRIER_CMD_PRIVATE_EXPEDITED	= (1 << 3),
 };
 
 #endif /* _UAPI_LINUX_MEMBARRIER_H */
diff --git a/kernel/membarrier.c b/kernel/membarrier.c
index 9f9284f37f8d..8c6c0f96f617 100644
--- a/kernel/membarrier.c
+++ b/kernel/membarrier.c
@@ -19,10 +19,81 @@
 #include <linux/tick.h>
 
 /*
+ * XXX For cpu_rq(). Should we rather move
+ * membarrier_private_expedited() to sched/core.c or create
+ * sched/membarrier.c ?
+ */
+#include "sched/sched.h"
+
+/*
  * Bitmask made from a "or" of all commands within enum membarrier_cmd,
  * except MEMBARRIER_CMD_QUERY.
  */
-#define MEMBARRIER_CMD_BITMASK	(MEMBARRIER_CMD_SHARED)
+#define MEMBARRIER_CMD_BITMASK	\
+	(MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED)
+
+static void ipi_mb(void *info)
+{
+	smp_mb();	/* IPIs should be serializing but paranoid. */
+}
+
+static void membarrier_private_expedited_ipi_each(void)
+{
+	int cpu;
+
+	for_each_online_cpu(cpu) {
+		struct task_struct *p;
+
+		rcu_read_lock();
+		p = task_rcu_dereference(&cpu_rq(cpu)->curr);
+		if (p && p->mm == current->mm)
+			smp_call_function_single(cpu, ipi_mb, NULL, 1);
+		rcu_read_unlock();
+	}
+}
+
+static void membarrier_private_expedited(void)
+{
+	int cpu, this_cpu;
+	cpumask_var_t tmpmask;
+
+	if (num_online_cpus() == 1)
+		return;
+
+	/*
+	 * Matches memory barriers around rq->curr modification in
+	 * scheduler.
+	 */
+	smp_mb();	/* system call entry is not a mb. */
+
+	if (!alloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
+		/* Fallback for OOM. */
+		membarrier_private_expedited_ipi_each();
+		goto end;
+	}
+
+	this_cpu = raw_smp_processor_id();
+	for_each_online_cpu(cpu) {
+		struct task_struct *p;
+
+		if (cpu == this_cpu)
+			continue;
+		rcu_read_lock();
+		p = task_rcu_dereference(&cpu_rq(cpu)->curr);
+		if (p && p->mm == current->mm)
+			__cpumask_set_cpu(cpu, tmpmask);
+		rcu_read_unlock();
+	}
+	smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+	free_cpumask_var(tmpmask);
+end:
+	/*
+	* Memory barrier on the caller thread _after_ we finished
+	* waiting for the last IPI. Matches memory barriers around
+	* rq->curr modification in scheduler.
+	*/
+	smp_mb();	/* exit from system call is not a mb */
+}
 
 /**
  * sys_membarrier - issue memory barriers on a set of threads
@@ -64,6 +135,9 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
 		if (num_online_cpus() > 1)
 			synchronize_sched();
 		return 0;
+	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
+		membarrier_private_expedited();
+		return 0;
 	default:
 		return -EINVAL;
 	}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 17c667b427b4..f171d2aaaf82 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2724,6 +2724,26 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
 		put_user(task_pid_vnr(current), current->set_child_tid);
 }
 
+#ifdef CONFIG_MEMBARRIER
+static void membarrier_expedited_mb_after_set_current(struct mm_struct *mm,
+		struct mm_struct *oldmm)
+{
+	if (likely(mm == oldmm))
+		return;		/* Thread context switch, same mm. */
+	/*
+	 * When switching between processes, membarrier expedited
+	 * private requires a memory barrier after we set the current
+	 * task.
+	 */
+	smp_mb();
+}
+#else /* #ifdef CONFIG_MEMBARRIER */
+static void membarrier_expedited_mb_after_set_current(struct mm_struct *mm,
+		struct mm_struct *oldmm)
+{
+}
+#endif /* #else #ifdef CONFIG_MEMBARRIER */
+
 /*
  * context_switch - switch to the new MM and the new thread's register state.
  */
@@ -2737,6 +2757,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 
 	mm = next->mm;
 	oldmm = prev->active_mm;
+	membarrier_expedited_mb_after_set_current(mm, oldmm);
 	/*
 	 * For paravirt, this is coupled with an exit in switch_to to
 	 * combine the page table reload and the switch backend into
-- 
2.11.0

* Re: [RFC PATCH] membarrier: expedited private command
  2017-07-27 18:59 [RFC PATCH] membarrier: expedited private command Mathieu Desnoyers
@ 2017-07-27 19:55 ` Peter Zijlstra
  2017-07-27 20:31   ` Mathieu Desnoyers
  2017-07-27 20:06 ` Paul E. McKenney
  1 sibling, 1 reply; 6+ messages in thread
From: Peter Zijlstra @ 2017-07-27 19:55 UTC (permalink / raw)
  To: Mathieu Desnoyers; +Cc: linux-kernel, Paul E . McKenney, Boqun Feng

On Thu, Jul 27, 2017 at 02:59:43PM -0400, Mathieu Desnoyers wrote:
> diff --git a/kernel/membarrier.c b/kernel/membarrier.c
> index 9f9284f37f8d..8c6c0f96f617 100644
> --- a/kernel/membarrier.c
> +++ b/kernel/membarrier.c
> @@ -19,10 +19,81 @@
>  #include <linux/tick.h>
>  
>  /*
> + * XXX For cpu_rq(). Should we rather move
> + * membarrier_private_expedited() to sched/core.c or create
> + * sched/membarrier.c ?

The latter, perhaps.

> +static void membarrier_private_expedited(void)
> +{
> +	int cpu, this_cpu;
> +	cpumask_var_t tmpmask;
> +
> +	if (num_online_cpus() == 1)
> +		return;
> +
> +	/*
> +	 * Matches memory barriers around rq->curr modification in
> +	 * scheduler.
> +	 */
> +	smp_mb();	/* system call entry is not a mb. */
> +
> +	if (!alloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {

Why GFP_NOWAIT? And the fallback? There seems to be a desire to make this a
nonblocking syscall. Should we document this somewhere?

> +		/* Fallback for OOM. */
> +		membarrier_private_expedited_ipi_each();
> +		goto end;
> +	}
> +
> +	this_cpu = raw_smp_processor_id();

This is a tad dodgy; you might want to put in a comment on how migrating
this thread is ok.

> +	for_each_online_cpu(cpu) {

One would also need cpus_read_lock() if you rely on the online mask.
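
Something along these lines, presumably (sketch only, reusing the loop
from the patch):

	cpus_read_lock();	/* hold off CPU hotplug while using the online mask */
	for_each_online_cpu(cpu) {
		struct task_struct *p;

		if (cpu == this_cpu)
			continue;
		rcu_read_lock();
		p = task_rcu_dereference(&cpu_rq(cpu)->curr);
		if (p && p->mm == current->mm)
			__cpumask_set_cpu(cpu, tmpmask);
		rcu_read_unlock();
	}
	smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
	cpus_read_unlock();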

> +		struct task_struct *p;
> +
> +		if (cpu == this_cpu)
> +			continue;
> +		rcu_read_lock();
> +		p = task_rcu_dereference(&cpu_rq(cpu)->curr);
> +		if (p && p->mm == current->mm)
> +			__cpumask_set_cpu(cpu, tmpmask);
> +		rcu_read_unlock();
> +	}
> +	smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
> +	free_cpumask_var(tmpmask);
> +end:
> +	/*
> +	* Memory barrier on the caller thread _after_ we finished
> +	* waiting for the last IPI. Matches memory barriers around
> +	* rq->curr modification in scheduler.
> +	*/
> +	smp_mb();	/* exit from system call is not a mb */
> +}

> @@ -2737,6 +2757,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
>  
>  	mm = next->mm;
>  	oldmm = prev->active_mm;
> +	membarrier_expedited_mb_after_set_current(mm, oldmm);
>  	/*
>  	 * For paravirt, this is coupled with an exit in switch_to to
>  	 * combine the page table reload and the switch backend into

As said on IRC, we have finish_task_switch()->if (mm)
mmdrop(mm)->atomic_dec_and_test() providing a smp_mb(). We just need to
deal with the !mm case.
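
Roughly like so (a sketch of the idea only, not a tested change):

	/*
	 * In finish_task_switch(), the existing mmdrop(mm) already
	 * implies a full barrier via atomic_dec_and_test(), so an
	 * explicit barrier is only needed when there is no mm to drop.
	 */
	if (mm)
		mmdrop(mm);	/* implies smp_mb() */
	else
		smp_mb();	/* no mmdrop() barrier to rely on */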

* Re: [RFC PATCH] membarrier: expedited private command
  2017-07-27 18:59 [RFC PATCH] membarrier: expedited private command Mathieu Desnoyers
  2017-07-27 19:55 ` Peter Zijlstra
@ 2017-07-27 20:06 ` Paul E. McKenney
  2017-07-27 20:18   ` Mathieu Desnoyers
  1 sibling, 1 reply; 6+ messages in thread
From: Paul E. McKenney @ 2017-07-27 20:06 UTC (permalink / raw)
  To: Mathieu Desnoyers; +Cc: Peter Zijlstra, linux-kernel, Boqun Feng

On Thu, Jul 27, 2017 at 02:59:43PM -0400, Mathieu Desnoyers wrote:
> Implement MEMBARRIER_CMD_PRIVATE_EXPEDITED with IPIs, using a cpumask built
> from all runqueues for which the current thread's mm is the same as our own.
> 
> Scheduler-wise, it requires that we add a memory barrier after context
> switching between processes (which have different mm).
> 
> It would be interesting to benchmark the overhead of this added barrier
> on the performance of context switching between processes. If the
> preexisting overhead of switching between mm is high enough, the
> overhead of adding this extra barrier may be insignificant.

I sent this along to the people asking for faster sys_membarrier(),
CCing you, thank you!

Please see other feedback inline below.

> [ Compile-tested only! ]
> 
> CC: Peter Zijlstra <peterz@infradead.org>
> CC: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> CC: Boqun Feng <boqun.feng@gmail.com>
> Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> ---
>  include/uapi/linux/membarrier.h |  8 +++--
>  kernel/membarrier.c             | 76 ++++++++++++++++++++++++++++++++++++++++-
>  kernel/sched/core.c             | 21 ++++++++++++
>  3 files changed, 102 insertions(+), 3 deletions(-)
> 
> diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h
> index e0b108bd2624..6a33c5852f6b 100644
> --- a/include/uapi/linux/membarrier.h
> +++ b/include/uapi/linux/membarrier.h
> @@ -40,14 +40,18 @@
>   *                          (non-running threads are de facto in such a
>   *                          state). This covers threads from all processes
>   *                          running on the system. This command returns 0.
> + * TODO: documentation.

How about something like this?

 * @MEMBARRIER_CMD_PRIVATE_EXPEDITED:  IPI each CPU running a thread belonging
 *                          to the same process as the current thread, causing
 *                          the recipient CPU to execute a full memory-barrier
 *                          instruction.  The same-process determination is
 *                          made using the task_struct ->mm field:  If some
 *                          other CPU's currently running task has the same
 *                          value in its ->mm field as the requesting thread,
 *                          that CPU is IPIed.

>   *
>   * Command to be passed to the membarrier system call. The commands need to
>   * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to
>   * the value 0.
>   */
>  enum membarrier_cmd {
> -	MEMBARRIER_CMD_QUERY = 0,
> -	MEMBARRIER_CMD_SHARED = (1 << 0),
> +	MEMBARRIER_CMD_QUERY			= 0,
> +	MEMBARRIER_CMD_SHARED			= (1 << 0),
> +	/* reserved for MEMBARRIER_CMD_SHARED_EXPEDITED (1 << 1) */
> +	/* reserved for MEMBARRIER_CMD_PRIVATE (1 << 2) */
> +	MEMBARRIER_CMD_PRIVATE_EXPEDITED	= (1 << 3),
>  };
> 
>  #endif /* _UAPI_LINUX_MEMBARRIER_H */
> diff --git a/kernel/membarrier.c b/kernel/membarrier.c
> index 9f9284f37f8d..8c6c0f96f617 100644
> --- a/kernel/membarrier.c
> +++ b/kernel/membarrier.c
> @@ -19,10 +19,81 @@
>  #include <linux/tick.h>
> 
>  /*
> + * XXX For cpu_rq(). Should we rather move
> + * membarrier_private_expedited() to sched/core.c or create
> + * sched/membarrier.c ?

Any reason not to move the entirety of membarrier.c there?  Assuming that
Peter is OK with this, of course.

> + */
> +#include "sched/sched.h"
> +
> +/*
>   * Bitmask made from a "or" of all commands within enum membarrier_cmd,
>   * except MEMBARRIER_CMD_QUERY.
>   */
> -#define MEMBARRIER_CMD_BITMASK	(MEMBARRIER_CMD_SHARED)
> +#define MEMBARRIER_CMD_BITMASK	\
> +	(MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED)
> +
> +static void ipi_mb(void *info)
> +{
> +	smp_mb();	/* IPIs should be serializing but paranoid. */

I am good with paranoia!  ;-)

> +}
> +
> +static void membarrier_private_expedited_ipi_each(void)
> +{
> +	int cpu;
> +
> +	for_each_online_cpu(cpu) {
> +		struct task_struct *p;
> +
> +		rcu_read_lock();
> +		p = task_rcu_dereference(&cpu_rq(cpu)->curr);
> +		if (p && p->mm == current->mm)
> +			smp_call_function_single(cpu, ipi_mb, NULL, 1);
> +		rcu_read_unlock();
> +	}
> +}
> +
> +static void membarrier_private_expedited(void)
> +{
> +	int cpu, this_cpu;
> +	cpumask_var_t tmpmask;
> +
> +	if (num_online_cpus() == 1)
> +		return;
> +
> +	/*
> +	 * Matches memory barriers around rq->curr modification in
> +	 * scheduler.
> +	 */
> +	smp_mb();	/* system call entry is not a mb. */
> +
> +	if (!alloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
> +		/* Fallback for OOM. */
> +		membarrier_private_expedited_ipi_each();
> +		goto end;
> +	}
> +
> +	this_cpu = raw_smp_processor_id();
> +	for_each_online_cpu(cpu) {
> +		struct task_struct *p;
> +
> +		if (cpu == this_cpu)
> +			continue;
> +		rcu_read_lock();
> +		p = task_rcu_dereference(&cpu_rq(cpu)->curr);
> +		if (p && p->mm == current->mm)
> +			__cpumask_set_cpu(cpu, tmpmask);
> +		rcu_read_unlock();
> +	}
> +	smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
> +	free_cpumask_var(tmpmask);
> +end:
> +	/*
> +	* Memory barrier on the caller thread _after_ we finished
> +	* waiting for the last IPI. Matches memory barriers around
> +	* rq->curr modification in scheduler.
> +	*/
> +	smp_mb();	/* exit from system call is not a mb */
> +}
> 
>  /**
>   * sys_membarrier - issue memory barriers on a set of threads
> @@ -64,6 +135,9 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
>  		if (num_online_cpus() > 1)
>  			synchronize_sched();
>  		return 0;
> +	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
> +		membarrier_private_expedited();
> +		return 0;
>  	default:
>  		return -EINVAL;
>  	}
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 17c667b427b4..f171d2aaaf82 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2724,6 +2724,26 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
>  		put_user(task_pid_vnr(current), current->set_child_tid);
>  }
> 
> +#ifdef CONFIG_MEMBARRIER
> +static void membarrier_expedited_mb_after_set_current(struct mm_struct *mm,
> +		struct mm_struct *oldmm)
> +{
> +	if (likely(mm == oldmm))
> +		return;		/* Thread context switch, same mm. */
> +	/*
> +	 * When switching between processes, membarrier expedited
> +	 * private requires a memory barrier after we set the current
> +	 * task.
> +	 */
> +	smp_mb();
> +}
> +#else /* #ifdef CONFIG_MEMBARRIER */
> +static void membarrier_expedited_mb_after_set_current(struct mm_struct *mm,
> +		struct mm_struct *oldmm)
> +{
> +}
> +#endif /* #else #ifdef CONFIG_MEMBARRIER */

Why not something like this?  Shorter, easier to read, and should generate
the same code.

static void membarrier_expedited_mb_after_set_current(struct mm_struct *mm,
		struct mm_struct *oldmm)
{
	if (!IS_ENABLED(CONFIG_MEMBARRIER))
		return;
	if (likely(mm == oldmm))
		return;		/* Thread context switch, same mm. */
	/*
	 * When switching between processes, membarrier expedited
	 * private requires a memory barrier after we set the current
	 * task.
	 */
	smp_mb();
}

> +
>  /*
>   * context_switch - switch to the new MM and the new thread's register state.
>   */
> @@ -2737,6 +2757,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
> 
>  	mm = next->mm;
>  	oldmm = prev->active_mm;
> +	membarrier_expedited_mb_after_set_current(mm, oldmm);
>  	/*
>  	 * For paravirt, this is coupled with an exit in switch_to to
>  	 * combine the page table reload and the switch backend into
> -- 
> 2.11.0
> 

* Re: [RFC PATCH] membarrier: expedited private command
  2017-07-27 20:06 ` Paul E. McKenney
@ 2017-07-27 20:18   ` Mathieu Desnoyers
  2017-07-27 20:26     ` Paul E. McKenney
  0 siblings, 1 reply; 6+ messages in thread
From: Mathieu Desnoyers @ 2017-07-27 20:18 UTC (permalink / raw)
  To: Paul E. McKenney; +Cc: Peter Zijlstra, linux-kernel, Boqun Feng


----- On Jul 27, 2017, at 4:06 PM, Paul E. McKenney paulmck@linux.vnet.ibm.com wrote:

> On Thu, Jul 27, 2017 at 02:59:43PM -0400, Mathieu Desnoyers wrote:
>> Implement MEMBARRIER_CMD_PRIVATE_EXPEDITED with IPIs, using a cpumask built
>> from all runqueues for which the current thread's mm is the same as our own.
>> 
>> Scheduler-wise, it requires that we add a memory barrier after context
>> switching between processes (which have different mm).
>> 
>> It would be interesting to benchmark the overhead of this added barrier
>> on the performance of context switching between processes. If the
>> preexisting overhead of switching between mm is high enough, the
>> overhead of adding this extra barrier may be insignificant.
> 
> I sent this along to the people asking for faster sys_membarrier(),
> CCing you, thank you!

Thanks! More below,

> 
> Please see other feedback inline below.
> 
>> [ Compile-tested only! ]
>> 
>> CC: Peter Zijlstra <peterz@infradead.org>
>> CC: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
>> CC: Boqun Feng <boqun.feng@gmail.com>
>> Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
>> ---
>>  include/uapi/linux/membarrier.h |  8 +++--
>>  kernel/membarrier.c             | 76 ++++++++++++++++++++++++++++++++++++++++-
>>  kernel/sched/core.c             | 21 ++++++++++++
>>  3 files changed, 102 insertions(+), 3 deletions(-)
>> 
>> diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h
>> index e0b108bd2624..6a33c5852f6b 100644
>> --- a/include/uapi/linux/membarrier.h
>> +++ b/include/uapi/linux/membarrier.h
>> @@ -40,14 +40,18 @@
>>   *                          (non-running threads are de facto in such a
>>   *                          state). This covers threads from all processes
>>   *                          running on the system. This command returns 0.
>> + * TODO: documentation.
> 
> How about something like this?
> 
> * @MEMBARRIER_CMD_PRIVATE_EXPEDITED:  IPI each CPU running a thread belonging
> *                          to the same process as the current thread, causing
> *                          the recipient CPU to execute a full memory-barrier
> *                          instruction.  The same-process determination is
> *                          made using the task_struct ->mm field:  If some
> *                          other CPU's currently running task has the same
> *                          value in its ->mm field as the requesting thread,
> *                          that CPU is IPIed.
> 

I would prefer:

 * @MEMBARRIER_CMD_PRIVATE_EXPEDITED:
 *                          Execute a memory barrier on each running
 *                          thread belonging to the same process as the current
 *                          thread. Upon return from system call, the
 *                          caller thread is ensured that all its running
 *                          thread siblings have passed through a state
 *                          where all memory accesses to user-space
 *                          addresses match program order between entry
 *                          to and return from the system call
 *                          (non-running threads are de facto in such a
 *                          state). This only covers threads from the
 *                          same process as the caller thread. This
 *                          command returns 0.

mainly because this is a uapi/ header, installed into distribution headers. So
I want to remove any implementation reference from that text.


>>   *
>>   * Command to be passed to the membarrier system call. The commands need to
>>   * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to
>>   * the value 0.
>>   */
>>  enum membarrier_cmd {
>> -	MEMBARRIER_CMD_QUERY = 0,
>> -	MEMBARRIER_CMD_SHARED = (1 << 0),
>> +	MEMBARRIER_CMD_QUERY			= 0,
>> +	MEMBARRIER_CMD_SHARED			= (1 << 0),
>> +	/* reserved for MEMBARRIER_CMD_SHARED_EXPEDITED (1 << 1) */
>> +	/* reserved for MEMBARRIER_CMD_PRIVATE (1 << 2) */
>> +	MEMBARRIER_CMD_PRIVATE_EXPEDITED	= (1 << 3),
>>  };
>> 
>>  #endif /* _UAPI_LINUX_MEMBARRIER_H */
>> diff --git a/kernel/membarrier.c b/kernel/membarrier.c
>> index 9f9284f37f8d..8c6c0f96f617 100644
>> --- a/kernel/membarrier.c
>> +++ b/kernel/membarrier.c
>> @@ -19,10 +19,81 @@
>>  #include <linux/tick.h>
>> 
>>  /*
>> + * XXX For cpu_rq(). Should we rather move
>> + * membarrier_private_expedited() to sched/core.c or create
>> + * sched/membarrier.c ?
> 
> Any reason not to move the entirety of membarrier.c there?  Assuming that
> Peter is OK with this, of course.

I'm planning to do exactly this for v2.

> 
>> + */
>> +#include "sched/sched.h"
>> +
>> +/*
>>   * Bitmask made from a "or" of all commands within enum membarrier_cmd,
>>   * except MEMBARRIER_CMD_QUERY.
>>   */
>> -#define MEMBARRIER_CMD_BITMASK	(MEMBARRIER_CMD_SHARED)
>> +#define MEMBARRIER_CMD_BITMASK	\
>> +	(MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED)
>> +
>> +static void ipi_mb(void *info)
>> +{
>> +	smp_mb();	/* IPIs should be serializing but paranoid. */
> 
> I am good with paranoia!  ;-)
> 
>> +}
>> +
>> +static void membarrier_private_expedited_ipi_each(void)
>> +{
>> +	int cpu;
>> +
>> +	for_each_online_cpu(cpu) {
>> +		struct task_struct *p;
>> +
>> +		rcu_read_lock();
>> +		p = task_rcu_dereference(&cpu_rq(cpu)->curr);
>> +		if (p && p->mm == current->mm)
>> +			smp_call_function_single(cpu, ipi_mb, NULL, 1);
>> +		rcu_read_unlock();
>> +	}
>> +}
>> +
>> +static void membarrier_private_expedited(void)
>> +{
>> +	int cpu, this_cpu;
>> +	cpumask_var_t tmpmask;
>> +
>> +	if (num_online_cpus() == 1)
>> +		return;
>> +
>> +	/*
>> +	 * Matches memory barriers around rq->curr modification in
>> +	 * scheduler.
>> +	 */
>> +	smp_mb();	/* system call entry is not a mb. */
>> +
>> +	if (!alloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
>> +		/* Fallback for OOM. */
>> +		membarrier_private_expedited_ipi_each();
>> +		goto end;
>> +	}
>> +
>> +	this_cpu = raw_smp_processor_id();
>> +	for_each_online_cpu(cpu) {
>> +		struct task_struct *p;
>> +
>> +		if (cpu == this_cpu)
>> +			continue;
>> +		rcu_read_lock();
>> +		p = task_rcu_dereference(&cpu_rq(cpu)->curr);
>> +		if (p && p->mm == current->mm)
>> +			__cpumask_set_cpu(cpu, tmpmask);
>> +		rcu_read_unlock();
>> +	}
>> +	smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
>> +	free_cpumask_var(tmpmask);
>> +end:
>> +	/*
>> +	* Memory barrier on the caller thread _after_ we finished
>> +	* waiting for the last IPI. Matches memory barriers around
>> +	* rq->curr modification in scheduler.
>> +	*/
>> +	smp_mb();	/* exit from system call is not a mb */
>> +}
>> 
>>  /**
>>   * sys_membarrier - issue memory barriers on a set of threads
>> @@ -64,6 +135,9 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
>>  		if (num_online_cpus() > 1)
>>  			synchronize_sched();
>>  		return 0;
>> +	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
>> +		membarrier_private_expedited();
>> +		return 0;
>>  	default:
>>  		return -EINVAL;
>>  	}
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index 17c667b427b4..f171d2aaaf82 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -2724,6 +2724,26 @@ asmlinkage __visible void schedule_tail(struct
>> task_struct *prev)
>>  		put_user(task_pid_vnr(current), current->set_child_tid);
>>  }
>> 
>> +#ifdef CONFIG_MEMBARRIER
>> +static void membarrier_expedited_mb_after_set_current(struct mm_struct *mm,
>> +		struct mm_struct *oldmm)
>> +{
>> +	if (likely(mm == oldmm))
>> +		return;		/* Thread context switch, same mm. */
>> +	/*
>> +	 * When switching between processes, membarrier expedited
>> +	 * private requires a memory barrier after we set the current
>> +	 * task.
>> +	 */
>> +	smp_mb();
>> +}
>> +#else /* #ifdef CONFIG_MEMBARRIER */
>> +static void membarrier_expedited_mb_after_set_current(struct mm_struct *mm,
>> +		struct mm_struct *oldmm)
>> +{
>> +}
>> +#endif /* #else #ifdef CONFIG_MEMBARRIER */
> 
> Why not something like this?  Shorter, easier to read, and should generate
> the same code.

Sure, I'll use this in my v2.

Thanks!

Mathieu

> 
> static void membarrier_expedited_mb_after_set_current(struct mm_struct *mm,
>		struct mm_struct *oldmm)
> {
>	if (!IS_ENABLED(CONFIG_MEMBARRIER))
>		return;
>	if (likely(mm == oldmm))
>		return;		/* Thread context switch, same mm. */
>	/*
>	 * When switching between processes, membarrier expedited
>	 * private requires a memory barrier after we set the current
>	 * task.
>	 */
>	smp_mb();
> }
> 
>> +
>>  /*
>>   * context_switch - switch to the new MM and the new thread's register state.
>>   */
>> @@ -2737,6 +2757,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
>> 
>>  	mm = next->mm;
>>  	oldmm = prev->active_mm;
>> +	membarrier_expedited_mb_after_set_current(mm, oldmm);
>>  	/*
>>  	 * For paravirt, this is coupled with an exit in switch_to to
>>  	 * combine the page table reload and the switch backend into
>> --
>> 2.11.0

-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com

* Re: [RFC PATCH] membarrier: expedited private command
  2017-07-27 20:18   ` Mathieu Desnoyers
@ 2017-07-27 20:26     ` Paul E. McKenney
  0 siblings, 0 replies; 6+ messages in thread
From: Paul E. McKenney @ 2017-07-27 20:26 UTC (permalink / raw)
  To: Mathieu Desnoyers; +Cc: Peter Zijlstra, linux-kernel, Boqun Feng

On Thu, Jul 27, 2017 at 08:18:47PM +0000, Mathieu Desnoyers wrote:
> 
> ----- On Jul 27, 2017, at 4:06 PM, Paul E. McKenney paulmck@linux.vnet.ibm.com wrote:
> 
> > On Thu, Jul 27, 2017 at 02:59:43PM -0400, Mathieu Desnoyers wrote:
> >> Implement MEMBARRIER_CMD_PRIVATE_EXPEDITED with IPIs, using a cpumask built
> >> from all runqueues for which the current thread's mm is the same as our own.
> >> 
> >> Scheduler-wise, it requires that we add a memory barrier after context
> >> switching between processes (which have different mm).
> >> 
> >> It would be interesting to benchmark the overhead of this added barrier
> >> on the performance of context switching between processes. If the
> >> preexisting overhead of switching between mm is high enough, the
> >> overhead of adding this extra barrier may be insignificant.
> > 
> > I sent this along to the people asking for faster sys_membarrier(),
> > CCing you, thank you!
> 
> Thanks! More below,

At least one had it pass their initial testing.

> > Please see other feedback inline below.
> > 
> >> [ Compile-tested only! ]
> >> 
> >> CC: Peter Zijlstra <peterz@infradead.org>
> >> CC: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> >> CC: Boqun Feng <boqun.feng@gmail.com>
> >> Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> >> ---
> >>  include/uapi/linux/membarrier.h |  8 +++--
> >>  kernel/membarrier.c             | 76 ++++++++++++++++++++++++++++++++++++++++-
> >>  kernel/sched/core.c             | 21 ++++++++++++
> >>  3 files changed, 102 insertions(+), 3 deletions(-)
> >> 
> >> diff --git a/include/uapi/linux/membarrier.h b/include/uapi/linux/membarrier.h
> >> index e0b108bd2624..6a33c5852f6b 100644
> >> --- a/include/uapi/linux/membarrier.h
> >> +++ b/include/uapi/linux/membarrier.h
> >> @@ -40,14 +40,18 @@
> >>   *                          (non-running threads are de facto in such a
> >>   *                          state). This covers threads from all processes
> >>   *                          running on the system. This command returns 0.
> >> + * TODO: documentation.
> > 
> > How about something like this?
> > 
> > * @MEMBARRIER_CMD_PRIVATE_EXPEDITED:  IPI each CPU running a thread belonging
> > *                          to the same process as the current thread, causing
> > *                          the recipient CPU to execute a full memory-barrier
> > *                          instruction.  The same-process determination is
> > *                          made using the task_struct ->mm field:  If some
> > *                          other CPU's currently running task has the same
> > *                          value in its ->mm field as the requesting thread,
> > *                          that CPU is IPIed.
> > 
> 
> I would prefer:
> 
>  * @MEMBARRIER_CMD_PRIVATE_EXPEDITED:
>  *                          Execute a memory barrier on each running
>  *                          thread belonging to the same process as the current
>  *                          thread. Upon return from system call, the
>  *                          caller thread is ensured that all its running
>  *                          thread siblings have passed through a state
>  *                          where all memory accesses to user-space
>  *                          addresses match program order between entry
>  *                          to and return from the system call
>  *                          (non-running threads are de facto in such a
>  *                          state). This only covers threads from the
>  *                          same process as the caller thread. This
>  *                          command returns 0.
> 
> mainly because this is a uapi/ header, installed into distribution headers. So
> I want to remove any implementation reference from that text.

Fine by me, as long as there is documentation.  ;-)

> >>   *
> >>   * Command to be passed to the membarrier system call. The commands need to
> >>   * be a single bit each, except for MEMBARRIER_CMD_QUERY which is assigned to
> >>   * the value 0.
> >>   */
> >>  enum membarrier_cmd {
> >> -	MEMBARRIER_CMD_QUERY = 0,
> >> -	MEMBARRIER_CMD_SHARED = (1 << 0),
> >> +	MEMBARRIER_CMD_QUERY			= 0,
> >> +	MEMBARRIER_CMD_SHARED			= (1 << 0),
> >> +	/* reserved for MEMBARRIER_CMD_SHARED_EXPEDITED (1 << 1) */
> >> +	/* reserved for MEMBARRIER_CMD_PRIVATE (1 << 2) */
> >> +	MEMBARRIER_CMD_PRIVATE_EXPEDITED	= (1 << 3),
> >>  };
> >> 
> >>  #endif /* _UAPI_LINUX_MEMBARRIER_H */
> >> diff --git a/kernel/membarrier.c b/kernel/membarrier.c
> >> index 9f9284f37f8d..8c6c0f96f617 100644
> >> --- a/kernel/membarrier.c
> >> +++ b/kernel/membarrier.c
> >> @@ -19,10 +19,81 @@
> >>  #include <linux/tick.h>
> >> 
> >>  /*
> >> + * XXX For cpu_rq(). Should we rather move
> >> + * membarrier_private_expedited() to sched/core.c or create
> >> + * sched/membarrier.c ?
> > 
> > Any reason not to move the entirety of membarrier.c there?  Assuming that
> > Peter is OK with this, of course.
> 
> I'm planning to do exactly this for v2.

Fair enough.

> >> + */
> >> +#include "sched/sched.h"
> >> +
> >> +/*
> >>   * Bitmask made from a "or" of all commands within enum membarrier_cmd,
> >>   * except MEMBARRIER_CMD_QUERY.
> >>   */
> >> -#define MEMBARRIER_CMD_BITMASK	(MEMBARRIER_CMD_SHARED)
> >> +#define MEMBARRIER_CMD_BITMASK	\
> >> +	(MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED)
> >> +
> >> +static void ipi_mb(void *info)
> >> +{
> >> +	smp_mb();	/* IPIs should be serializing but paranoid. */
> > 
> > I am good with paranoia!  ;-)
> > 
> >> +}
> >> +
> >> +static void membarrier_private_expedited_ipi_each(void)
> >> +{
> >> +	int cpu;
> >> +
> >> +	for_each_online_cpu(cpu) {
> >> +		struct task_struct *p;
> >> +
> >> +		rcu_read_lock();
> >> +		p = task_rcu_dereference(&cpu_rq(cpu)->curr);
> >> +		if (p && p->mm == current->mm)
> >> +			smp_call_function_single(cpu, ipi_mb, NULL, 1);
> >> +		rcu_read_unlock();
> >> +	}
> >> +}
> >> +
> >> +static void membarrier_private_expedited(void)
> >> +{
> >> +	int cpu, this_cpu;
> >> +	cpumask_var_t tmpmask;
> >> +
> >> +	if (num_online_cpus() == 1)
> >> +		return;
> >> +
> >> +	/*
> >> +	 * Matches memory barriers around rq->curr modification in
> >> +	 * scheduler.
> >> +	 */
> >> +	smp_mb();	/* system call entry is not a mb. */
> >> +
> >> +	if (!alloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
> >> +		/* Fallback for OOM. */
> >> +		membarrier_private_expedited_ipi_each();
> >> +		goto end;
> >> +	}
> >> +
> >> +	this_cpu = raw_smp_processor_id();
> >> +	for_each_online_cpu(cpu) {
> >> +		struct task_struct *p;
> >> +
> >> +		if (cpu == this_cpu)
> >> +			continue;
> >> +		rcu_read_lock();
> >> +		p = task_rcu_dereference(&cpu_rq(cpu)->curr);
> >> +		if (p && p->mm == current->mm)
> >> +			__cpumask_set_cpu(cpu, tmpmask);
> >> +		rcu_read_unlock();
> >> +	}
> >> +	smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
> >> +	free_cpumask_var(tmpmask);
> >> +end:
> >> +	/*
> >> +	* Memory barrier on the caller thread _after_ we finished
> >> +	* waiting for the last IPI. Matches memory barriers around
> >> +	* rq->curr modification in scheduler.
> >> +	*/
> >> +	smp_mb();	/* exit from system call is not a mb */
> >> +}
> >> 
> >>  /**
> >>   * sys_membarrier - issue memory barriers on a set of threads
> >> @@ -64,6 +135,9 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
> >>  		if (num_online_cpus() > 1)
> >>  			synchronize_sched();
> >>  		return 0;
> >> +	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
> >> +		membarrier_private_expedited();
> >> +		return 0;
> >>  	default:
> >>  		return -EINVAL;
> >>  	}
> >> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> >> index 17c667b427b4..f171d2aaaf82 100644
> >> --- a/kernel/sched/core.c
> >> +++ b/kernel/sched/core.c
> >> @@ -2724,6 +2724,26 @@ asmlinkage __visible void schedule_tail(struct
> >> task_struct *prev)
> >>  		put_user(task_pid_vnr(current), current->set_child_tid);
> >>  }
> >> 
> >> +#ifdef CONFIG_MEMBARRIER
> >> +static void membarrier_expedited_mb_after_set_current(struct mm_struct *mm,
> >> +		struct mm_struct *oldmm)
> >> +{
> >> +	if (likely(mm == oldmm))
> >> +		return;		/* Thread context switch, same mm. */
> >> +	/*
> >> +	 * When switching between processes, membarrier expedited
> >> +	 * private requires a memory barrier after we set the current
> >> +	 * task.
> >> +	 */
> >> +	smp_mb();
> >> +}
> >> +#else /* #ifdef CONFIG_MEMBARRIER */
> >> +static void membarrier_expedited_mb_after_set_current(struct mm_struct *mm,
> >> +		struct mm_struct *oldmm)
> >> +{
> >> +}
> >> +#endif /* #else #ifdef CONFIG_MEMBARRIER */
> > 
> > Why not something like this?  Shorter, easier to read, and should generate
> > the same code.
> 
> Sure, I'll use this in my v2.

Sounds good!

							Thanx, Paul

> Thanks!
> 
> Mathieu
> 
> > 
> > static void membarrier_expedited_mb_after_set_current(struct mm_struct *mm,
> >		struct mm_struct *oldmm)
> > {
> >	if (!IS_ENABLED(CONFIG_MEMBARRIER))
> >		return;
> >	if (likely(mm == oldmm))
> >		return;		/* Thread context switch, same mm. */
> >	/*
> >	 * When switching between processes, membarrier expedited
> >	 * private requires a memory barrier after we set the current
> >	 * task.
> >	 */
> >	smp_mb();
> > }
> > 
> >> +
> >>  /*
> >>   * context_switch - switch to the new MM and the new thread's register state.
> >>   */
> >> @@ -2737,6 +2757,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
> >> 
> >>  	mm = next->mm;
> >>  	oldmm = prev->active_mm;
> >> +	membarrier_expedited_mb_after_set_current(mm, oldmm);
> >>  	/*
> >>  	 * For paravirt, this is coupled with an exit in switch_to to
> >>  	 * combine the page table reload and the switch backend into
> >> --
> >> 2.11.0
> 
> -- 
> Mathieu Desnoyers
> EfficiOS Inc.
> http://www.efficios.com
> 

* Re: [RFC PATCH] membarrier: expedited private command
  2017-07-27 19:55 ` Peter Zijlstra
@ 2017-07-27 20:31   ` Mathieu Desnoyers
  0 siblings, 0 replies; 6+ messages in thread
From: Mathieu Desnoyers @ 2017-07-27 20:31 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: linux-kernel, Paul E. McKenney, Boqun Feng

----- On Jul 27, 2017, at 3:55 PM, Peter Zijlstra peterz@infradead.org wrote:

> On Thu, Jul 27, 2017 at 02:59:43PM -0400, Mathieu Desnoyers wrote:
>> diff --git a/kernel/membarrier.c b/kernel/membarrier.c
>> index 9f9284f37f8d..8c6c0f96f617 100644
>> --- a/kernel/membarrier.c
>> +++ b/kernel/membarrier.c
>> @@ -19,10 +19,81 @@
>>  #include <linux/tick.h>
>>  
>>  /*
>> + * XXX For cpu_rq(). Should we rather move
>> + * membarrier_private_expedited() to sched/core.c or create
>> + * sched/membarrier.c ?
> 
> The latter, perhaps.

Allright, will do that in v2.

> 
>> +static void membarrier_private_expedited(void)
>> +{
>> +	int cpu, this_cpu;
>> +	cpumask_var_t tmpmask;
>> +
>> +	if (num_online_cpus() == 1)
>> +		return;
>> +
>> +	/*
>> +	 * Matches memory barriers around rq->curr modification in
>> +	 * scheduler.
>> +	 */
>> +	smp_mb();	/* system call entry is not a mb. */
>> +
>> +	if (!alloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
> 
> Why GFP_NOWAIT? And the fallback? There seems to be a desire to make this a
> nonblocking syscall. Should we document this somewhere?

Blocking a synchronization system call due to memory allocation
pressure seemed like an unwanted effect back in 2010, so I kept
the same approach. Perhaps we could state that all the "expedited"
commands should be non-blocking?

> 
>> +		/* Fallback for OOM. */
>> +		membarrier_private_expedited_ipi_each();
>> +		goto end;
>> +	}
>> +
>> +	this_cpu = raw_smp_processor_id();
> 
> This is a tad dodgy; you might want to put in a comment on how migrating
> this thread is ok.

How about this ?

        /*
         * Skipping the current CPU is OK even though we can be
         * migrated at any point. The current CPU, at the point where we
         * read raw_smp_processor_id(), is ensured to be in program
         * order with respect to the caller thread. Therefore, we can
         * skip this CPU from the iteration.
         */

> 
>> +	for_each_online_cpu(cpu) {
> 
> One would also need cpus_read_lock() if you rely on the online mask.

OK.

> 
>> +		struct task_struct *p;
>> +
>> +		if (cpu == this_cpu)
>> +			continue;
>> +		rcu_read_lock();
>> +		p = task_rcu_dereference(&cpu_rq(cpu)->curr);
>> +		if (p && p->mm == current->mm)
>> +			__cpumask_set_cpu(cpu, tmpmask);
>> +		rcu_read_unlock();
>> +	}
>> +	smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
>> +	free_cpumask_var(tmpmask);
>> +end:
>> +	/*
>> +	* Memory barrier on the caller thread _after_ we finished
>> +	* waiting for the last IPI. Matches memory barriers around
>> +	* rq->curr modification in scheduler.
>> +	*/
>> +	smp_mb();	/* exit from system call is not a mb */
>> +}
> 
>> @@ -2737,6 +2757,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
>>  
>>  	mm = next->mm;
>>  	oldmm = prev->active_mm;
>> +	membarrier_expedited_mb_after_set_current(mm, oldmm);
>>  	/*
>>  	 * For paravirt, this is coupled with an exit in switch_to to
>>  	 * combine the page table reload and the switch backend into
> 
> As said on IRC, we have finish_task_switch()->if (mm)
> mmdrop(mm)->atomic_dec_and_test() providing a smp_mb(). We just need to
> deal with the !mm case.

Yes, I have a v2 brewing that includes this change :)

Thanks!

Mathieu


-- 
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com
