* [patch v8 03/10] task isolation: sync vmstats on return to userspace
@ 2021-12-08 16:09 Marcelo Tosatti
  2022-01-21 12:06 ` Frederic Weisbecker
  0 siblings, 1 reply; 4+ messages in thread
From: Marcelo Tosatti @ 2021-12-08 16:09 UTC (permalink / raw)
  To: linux-kernel
  Cc: Nitesh Lal, Nicolas Saenz Julienne, Frederic Weisbecker,
	Christoph Lameter, Juri Lelli, Peter Zijlstra, Alex Belits,
	Peter Xu, Thomas Gleixner, Daniel Bristot de Oliveira,
	Marcelo Tosatti

The logic that disables the vmstat worker thread when entering
nohz_full does not cover all scenarios. For example, the following
sequence is possible:

1) enter nohz_full, which calls refresh_cpu_vm_stats, syncing the stats.
2) app runs mlock, which increases counters for mlock'ed pages.
3) start -RT loop

Since the refresh_cpu_vm_stats call from the nohz_full logic can happen
_before_ the mlock, the vmstat shepherd can restart the vmstat worker
thread on the CPU in question.

To fix this, use the task isolation prctl interface to quiesce 
deferred actions when returning to userspace.

Keep task_isol_has_work returning 0 until all elements
are in place.
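
For illustration, an isolated application is then expected to configure
and activate quiescing before entering its latency-sensitive loop,
roughly as sketched below. The PR_ISOL_*/ISOL_F_* names, values and the
prctl argument layout are placeholders standing in for the interface
added earlier in this series, not a stable ABI:

#include <stdlib.h>
#include <sys/mman.h>
#include <sys/prctl.h>

/* Placeholder definitions; the real ones come from the uapi header
 * introduced earlier in this series. */
#define PR_ISOL_CFG_SET			63
#define PR_ISOL_ACTIVATE_SET		65
#define I_CFG_FEAT			1
#define ISOL_F_QUIESCE			(1UL << 0)
#define ISOL_F_QUIESCE_VMSTATS		(1UL << 0)

int main(void)
{
	unsigned long quiesce_mask = ISOL_F_QUIESCE_VMSTATS;
	unsigned long feat_mask = ISOL_F_QUIESCE;

	/* step 2 above: dirty the per-CPU vmstat counters */
	if (mlockall(MCL_CURRENT | MCL_FUTURE))
		exit(1);

	/* select which deferred work to quiesce on return to userspace */
	if (prctl(PR_ISOL_CFG_SET, I_CFG_FEAT, ISOL_F_QUIESCE,
		  &quiesce_mask, 0) == -1)
		exit(1);

	/* activate: vmstats are synced on the return from this syscall */
	if (prctl(PR_ISOL_ACTIVATE_SET, &feat_mask, 0, 0, 0) == -1)
		exit(1);

	/* step 3: -RT loop, with no vmstat_update work rearmed behind it */
	for (;;)
		;
}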

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

---
v6: modify exit_to_user_mode_loop to cover exceptions and interrupts
v5: no changes
v4: add oneshot mode support

 include/linux/task_isolation.h |   16 ++++++++++++++++
 include/linux/vmstat.h         |    8 ++++++++
 kernel/entry/common.c          |   15 +++++++++++----
 kernel/task_isolation.c        |   21 +++++++++++++++++++++
 mm/vmstat.c                    |   21 +++++++++++++++++++++
 5 files changed, 77 insertions(+), 4 deletions(-)

Index: linux-2.6/include/linux/task_isolation.h
===================================================================
--- linux-2.6.orig/include/linux/task_isolation.h
+++ linux-2.6/include/linux/task_isolation.h
@@ -40,8 +40,19 @@ int prctl_task_isolation_activate_set(un
 
 int __copy_task_isolation(struct task_struct *tsk);
 
+void isolation_exit_to_user_mode(void);
+
+static inline int task_isol_has_work(void)
+{
+	return 0;
+}
+
 #else
 
+static inline void isolation_exit_to_user_mode(void)
+{
+}
+
 static inline void tsk_isol_free(struct task_struct *tsk)
 {
 }
@@ -86,6 +97,11 @@ static inline int prctl_task_isolation_a
 	return -EOPNOTSUPP;
 }
 
+static inline int task_isol_has_work(void)
+{
+	return 0;
+}
+
 #endif /* CONFIG_CPU_ISOLATION */
 
 #endif /* __LINUX_TASK_ISOL_H */
Index: linux-2.6/include/linux/vmstat.h
===================================================================
--- linux-2.6.orig/include/linux/vmstat.h
+++ linux-2.6/include/linux/vmstat.h
@@ -21,6 +21,14 @@ int sysctl_vm_numa_stat_handler(struct c
 		void *buffer, size_t *length, loff_t *ppos);
 #endif
 
+#ifdef CONFIG_SMP
+void sync_vmstat(void);
+#else
+static inline void sync_vmstat(void)
+{
+}
+#endif
+
 struct reclaim_stat {
 	unsigned nr_dirty;
 	unsigned nr_unqueued_dirty;
Index: linux-2.6/kernel/entry/common.c
===================================================================
--- linux-2.6.orig/kernel/entry/common.c
+++ linux-2.6/kernel/entry/common.c
@@ -6,6 +6,7 @@
 #include <linux/livepatch.h>
 #include <linux/audit.h>
 #include <linux/tick.h>
+#include <linux/task_isolation.h>
 
 #include "common.h"
 
@@ -149,13 +150,14 @@ static void handle_signal_work(struct pt
 }
 
 static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
-					    unsigned long ti_work)
+					    unsigned long ti_work,
+					    unsigned long tsk_isol_work)
 {
 	/*
 	 * Before returning to user space ensure that all pending work
 	 * items have been completed.
 	 */
-	while (ti_work & EXIT_TO_USER_MODE_WORK) {
+	while ((ti_work & EXIT_TO_USER_MODE_WORK) || tsk_isol_work) {
 
 		local_irq_enable_exit_to_user(ti_work);
 
@@ -177,6 +179,9 @@ static unsigned long exit_to_user_mode_l
 		/* Architecture specific TIF work */
 		arch_exit_to_user_mode_work(regs, ti_work);
 
+		if (tsk_isol_work)
+			isolation_exit_to_user_mode();
+
 		/*
 		 * Disable interrupts and reevaluate the work flags as they
 		 * might have changed while interrupts and preemption was
@@ -188,6 +193,7 @@ static unsigned long exit_to_user_mode_l
 		tick_nohz_user_enter_prepare();
 
 		ti_work = READ_ONCE(current_thread_info()->flags);
+		tsk_isol_work = task_isol_has_work();
 	}
 
 	/* Return the latest work state for arch_exit_to_user_mode() */
@@ -197,14 +203,15 @@ static unsigned long exit_to_user_mode_l
 static void exit_to_user_mode_prepare(struct pt_regs *regs)
 {
 	unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
+	unsigned long tsk_isol_work = task_isol_has_work();
 
 	lockdep_assert_irqs_disabled();
 
 	/* Flush pending rcuog wakeup before the last need_resched() check */
 	tick_nohz_user_enter_prepare();
 
-	if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK))
-		ti_work = exit_to_user_mode_loop(regs, ti_work);
+	if (unlikely((ti_work & EXIT_TO_USER_MODE_WORK) || tsk_isol_work))
+		ti_work = exit_to_user_mode_loop(regs, ti_work, tsk_isol_work);
 
 	arch_exit_to_user_mode_prepare(regs, ti_work);
 
Index: linux-2.6/kernel/task_isolation.c
===================================================================
--- linux-2.6.orig/kernel/task_isolation.c
+++ linux-2.6/kernel/task_isolation.c
@@ -18,6 +18,8 @@
 #include <linux/sysfs.h>
 #include <linux/init.h>
 #include <linux/sched/task.h>
+#include <linux/mm.h>
+#include <linux/vmstat.h>
 
 void __tsk_isol_free(struct task_struct *tsk)
 {
@@ -348,3 +350,22 @@ int prctl_task_isolation_activate_get(un
 
 	return 0;
 }
+
+void isolation_exit_to_user_mode(void)
+{
+	struct isol_info *i;
+
+	i = current->isol_info;
+	if (!i)
+		return;
+
+	if (i->active_mask != ISOL_F_QUIESCE)
+		return;
+
+	if (i->quiesce_mask & ISOL_F_QUIESCE_VMSTATS) {
+		sync_vmstat();
+		if (i->oneshot_mask & ISOL_F_QUIESCE_VMSTATS)
+			i->active_mask &= ~ISOL_F_QUIESCE_VMSTATS;
+	}
+}
+EXPORT_SYMBOL_GPL(isolation_exit_to_user_mode);
Index: linux-2.6/mm/vmstat.c
===================================================================
--- linux-2.6.orig/mm/vmstat.c
+++ linux-2.6/mm/vmstat.c
@@ -2015,6 +2015,27 @@ static void vmstat_shepherd(struct work_
 		round_jiffies_relative(sysctl_stat_interval));
 }
 
+void sync_vmstat(void)
+{
+	int cpu;
+
+	cpu = get_cpu();
+
+	refresh_cpu_vm_stats(false);
+	put_cpu();
+
+	/*
+	 * If the task is migrated to another CPU between put_cpu
+	 * and cancel_delayed_work_sync, the code below might
+	 * cancel the vmstat_update work for a different CPU
+	 * than the one whose vmstats were just flushed.
+	 *
+	 * However, the vmstat shepherd will re-enable it later,
+	 * so it's harmless.
+	 */
+	cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
+}
+
 static void __init start_shepherd_timer(void)
 {
 	int cpu;




* Re: [patch v8 03/10] task isolation: sync vmstats on return to userspace
  2021-12-08 16:09 [patch v8 03/10] task isolation: sync vmstats on return to userspace Marcelo Tosatti
@ 2022-01-21 12:06 ` Frederic Weisbecker
  2022-01-27 16:47   ` Marcelo Tosatti
  0 siblings, 1 reply; 4+ messages in thread
From: Frederic Weisbecker @ 2022-01-21 12:06 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: linux-kernel, Nitesh Lal, Nicolas Saenz Julienne,
	Christoph Lameter, Juri Lelli, Peter Zijlstra, Alex Belits,
	Peter Xu, Thomas Gleixner, Daniel Bristot de Oliveira

On Wed, Dec 08, 2021 at 01:09:09PM -0300, Marcelo Tosatti wrote:
> Index: linux-2.6/include/linux/task_isolation.h
> ===================================================================
> --- linux-2.6.orig/include/linux/task_isolation.h
> +++ linux-2.6/include/linux/task_isolation.h
> @@ -40,8 +40,19 @@ int prctl_task_isolation_activate_set(un
>  
>  int __copy_task_isolation(struct task_struct *tsk);
>  
> +void isolation_exit_to_user_mode(void);
> +
> +static inline int task_isol_has_work(void)
> +{
> +	return 0;
> +}
> +
>  #else
>  
> +static inline void isolation_exit_to_user_mode(void)
> +{
> +}
> +
>  static inline void tsk_isol_free(struct task_struct *tsk)
>  {
>  }
> @@ -86,6 +97,11 @@ static inline int prctl_task_isolation_a
>  	return -EOPNOTSUPP;
>  }
>  
> +static inline int task_isol_has_work(void)
> +{
> +	return 0;
> +}
> +

It would be nice to have a coherent greppable task_isol_*() namespace instead
of random scattered tsk_*(), isolation_*() stuff...

task_isol_exit_to_user_mode()
task_isol_free()
task_isol_copy_process()
task_isol_has_work()
...

> @@ -149,13 +150,14 @@ static void handle_signal_work(struct pt
>  }
>  
>  static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
> -					    unsigned long ti_work)
> +					    unsigned long ti_work,
> +					    unsigned long tsk_isol_work)
>  {
>  	/*
>  	 * Before returning to user space ensure that all pending work
>  	 * items have been completed.
>  	 */
> -	while (ti_work & EXIT_TO_USER_MODE_WORK) {
> +	while ((ti_work & EXIT_TO_USER_MODE_WORK) || tsk_isol_work) {

So there is a dependency on CONFIG_GENERIC_ENTRY. Then you need to split that
from CONFIG_CPU_ISOLATION:

config TASK_ISOLATION
       bool "Task isolation prctl()"
       depends on GENERIC_ENTRY
       help "...."

>  
>  		local_irq_enable_exit_to_user(ti_work);
>  
> @@ -177,6 +179,9 @@ static unsigned long exit_to_user_mode_l
>  		/* Architecture specific TIF work */
>  		arch_exit_to_user_mode_work(regs, ti_work);
>  
> +		if (tsk_isol_work)
> +			isolation_exit_to_user_mode();
> +
>  		/*
>  		 * Disable interrupts and reevaluate the work flags as they
>  		 * might have changed while interrupts and preemption was
> @@ -188,6 +193,7 @@ static unsigned long exit_to_user_mode_l
>  		tick_nohz_user_enter_prepare();
>  
>  		ti_work = READ_ONCE(current_thread_info()->flags);
> +		tsk_isol_work = task_isol_has_work();

Shouldn't it be a TIF_FLAG part of EXIT_TO_USER_MODE_WORK instead?
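
i.e. something like the below? (flag name, bit number and the mask layout
are just for illustration):

/* <arch>/include/asm/thread_info.h: a thread flag for pending
 * task isolation quiesce work */
#define TIF_TASK_ISOL		11
#define _TIF_TASK_ISOL		(1 << TIF_TASK_ISOL)

/* include/linux/entry-common.h: fold it into the common work mask */
#define EXIT_TO_USER_MODE_WORK						\
	(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE |		\
	 _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL |	\
	 _TIF_TASK_ISOL | ARCH_EXIT_TO_USER_MODE_WORK)

/* kernel/entry/common.c, inside exit_to_user_mode_loop(): handled like
 * any other work bit, so no extra loop argument is needed */
	if (ti_work & _TIF_TASK_ISOL)
		isolation_exit_to_user_mode();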

Thanks.


* Re: [patch v8 03/10] task isolation: sync vmstats on return to userspace
  2022-01-21 12:06 ` Frederic Weisbecker
@ 2022-01-27 16:47   ` Marcelo Tosatti
  2022-01-27 18:01     ` Frederic Weisbecker
  0 siblings, 1 reply; 4+ messages in thread
From: Marcelo Tosatti @ 2022-01-27 16:47 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: linux-kernel, Nitesh Lal, Nicolas Saenz Julienne,
	Christoph Lameter, Juri Lelli, Peter Zijlstra, Alex Belits,
	Peter Xu, Thomas Gleixner, Daniel Bristot de Oliveira

On Fri, Jan 21, 2022 at 01:06:10PM +0100, Frederic Weisbecker wrote:
> On Wed, Dec 08, 2021 at 01:09:09PM -0300, Marcelo Tosatti wrote:
> > Index: linux-2.6/include/linux/task_isolation.h
> > ===================================================================
> > --- linux-2.6.orig/include/linux/task_isolation.h
> > +++ linux-2.6/include/linux/task_isolation.h
> > @@ -40,8 +40,19 @@ int prctl_task_isolation_activate_set(un
> >  
> >  int __copy_task_isolation(struct task_struct *tsk);
> >  
> > +void isolation_exit_to_user_mode(void);
> > +
> > +static inline int task_isol_has_work(void)
> > +{
> > +	return 0;
> > +}
> > +
> >  #else
> >  
> > +static inline void isolation_exit_to_user_mode(void)
> > +{
> > +}
> > +
> >  static inline void tsk_isol_free(struct task_struct *tsk)
> >  {
> >  }
> > @@ -86,6 +97,11 @@ static inline int prctl_task_isolation_a
> >  	return -EOPNOTSUPP;
> >  }
> >  
> > +static inline int task_isol_has_work(void)
> > +{
> > +	return 0;
> > +}
> > +
> 
> It would be nice to have a coherent greppable task_isol_*() namespace instead
> of random scattered tsk_*(), isolation_*() stuff...
> 
> task_isol_exit_to_user_mode()
> task_isol_free()
> task_isol_copy_process()
> task_isol_has_work()
> ...
> 
> > @@ -149,13 +150,14 @@ static void handle_signal_work(struct pt
> >  }
> >  
> >  static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
> > -					    unsigned long ti_work)
> > +					    unsigned long ti_work,
> > +					    unsigned long tsk_isol_work)
> >  {
> >  	/*
> >  	 * Before returning to user space ensure that all pending work
> >  	 * items have been completed.
> >  	 */
> > -	while (ti_work & EXIT_TO_USER_MODE_WORK) {
> > +	while ((ti_work & EXIT_TO_USER_MODE_WORK) || tsk_isol_work) {
> 
> So there is a dependency on CONFIG_GENERIC_ENTRY. Then you need to split that
> from CONFIG_CPU_ISOLATION:
> 
> config TASK_ISOLATION
>        bool "Task isolation prctl()"
>        depends on GENERIC_ENTRY
>        help "...."
> 
> >  
> >  		local_irq_enable_exit_to_user(ti_work);
> >  
> > @@ -177,6 +179,9 @@ static unsigned long exit_to_user_mode_l
> >  		/* Architecture specific TIF work */
> >  		arch_exit_to_user_mode_work(regs, ti_work);
> >  
> > +		if (tsk_isol_work)
> > +			isolation_exit_to_user_mode();
> > +
> >  		/*
> >  		 * Disable interrupts and reevaluate the work flags as they
> >  		 * might have changed while interrupts and preemption was
> > @@ -188,6 +193,7 @@ static unsigned long exit_to_user_mode_l
> >  		tick_nohz_user_enter_prepare();
> >  
> >  		ti_work = READ_ONCE(current_thread_info()->flags);
> > +		tsk_isol_work = task_isol_has_work();
> 
> Shouldn't it be a TIF_FLAG part of EXIT_TO_USER_MODE_WORK instead?
> 
> Thanks.

static inline int task_isol_has_work(void)
{
       int cpu, ret;
       struct isol_info *i;

       if (likely(current->task_isol_info == NULL))
               return 0;

       i = current->task_isol_info;
       if (i->active_mask != ISOL_F_QUIESCE)
               return 0;

       if (!(i->quiesce_mask & ISOL_F_QUIESCE_VMSTATS))
               return 0;

       cpu = get_cpu();
       ret = per_cpu(vmstat_dirty, cpu);
       put_cpu();

       return ret;
}

Well, whether it's necessary to call task_isol_exit_to_user_mode depends
on the state of the enabled/disabled masks _and_ on the vmstat dirty bit
information.

It seems awkward, to me, to condense all that information into a single bit.

Addressed all other comments, thanks.



* Re: [patch v8 03/10] task isolation: sync vmstats on return to userspace
  2022-01-27 16:47   ` Marcelo Tosatti
@ 2022-01-27 18:01     ` Frederic Weisbecker
  0 siblings, 0 replies; 4+ messages in thread
From: Frederic Weisbecker @ 2022-01-27 18:01 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: linux-kernel, Nitesh Lal, Nicolas Saenz Julienne,
	Christoph Lameter, Juri Lelli, Peter Zijlstra, Alex Belits,
	Peter Xu, Thomas Gleixner, Daniel Bristot de Oliveira

On Thu, Jan 27, 2022 at 01:47:07PM -0300, Marcelo Tosatti wrote:
> > > @@ -177,6 +179,9 @@ static unsigned long exit_to_user_mode_l
> > >  		/* Architecture specific TIF work */
> > >  		arch_exit_to_user_mode_work(regs, ti_work);
> > >  
> > > +		if (tsk_isol_work)
> > > +			isolation_exit_to_user_mode();
> > > +
> > >  		/*
> > >  		 * Disable interrupts and reevaluate the work flags as they
> > >  		 * might have changed while interrupts and preemption was
> > > @@ -188,6 +193,7 @@ static unsigned long exit_to_user_mode_l
> > >  		tick_nohz_user_enter_prepare();
> > >  
> > >  		ti_work = READ_ONCE(current_thread_info()->flags);
> > > +		tsk_isol_work = task_isol_has_work();
> > 
> > Shouldn't it be a TIF_FLAG part of EXIT_TO_USER_MODE_WORK instead?
> > 
> > Thanks.
> 
> static inline int task_isol_has_work(void)
> {
>        int cpu, ret;
>        struct isol_info *i;
> 
>        if (likely(current->task_isol_info == NULL))
>                return 0;
> 
>        i = current->task_isol_info;
>        if (i->active_mask != ISOL_F_QUIESCE)
>                return 0;
> 
>        if (!(i->quiesce_mask & ISOL_F_QUIESCE_VMSTATS))
>                return 0;
> 
>        cpu = get_cpu();
>        ret = per_cpu(vmstat_dirty, cpu);
>        put_cpu();
> 
>        return ret;
> }
> 
> Well, whether it's necessary to call task_isol_exit_to_user_mode depends
> on the state of the enabled/disabled masks _and_ on the vmstat dirty bit
> information.
> 
> It seems awkward, to me, to condense all that information into a single bit.
> 
> Addressed all other comments, thanks.

You're unconditionally adding overhead to the syscall fastpath when it would
be so easy to set a TIF_FLAG as long as (current->task_isol_info->quiesce_mask
!= 0). vmstat_dirty can be checked afterward.
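
Roughly (flag name again illustrative):

/* at activation time, e.g. prctl(PR_ISOL_ACTIVATE_SET): */
	if (current->task_isol_info->quiesce_mask)
		set_thread_flag(TIF_TASK_ISOL);

/* on exit to user mode, behind the single EXIT_TO_USER_MODE_WORK test: */
	if (ti_work & _TIF_TASK_ISOL) {
		if (this_cpu_read(vmstat_dirty))
			sync_vmstat();
	}

That way the fast path only pays for the flags test it already does.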

I suspect your patchset will sell much better if you join the common slowpath
behind the single EXIT_TO_USER_MODE_WORK condition.

Thanks.

