linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/1] cgroup: make per-cgroup pressure stall tracking configurable
@ 2021-05-13 17:53 Suren Baghdasaryan
  2021-05-14 11:41 ` Peter Zijlstra
  0 siblings, 1 reply; 10+ messages in thread
From: Suren Baghdasaryan @ 2021-05-13 17:53 UTC (permalink / raw)
  To: tj
  Cc: hannes, lizefan.x, mingo, peterz, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, minchan, corbet,
	bristot, paulmck, rdunlap, akpm, tglx, macro, viresh.kumar,
	mike.kravetz, linux-doc, linux-kernel, cgroups, surenb,
	kernel-team

PSI accounts stalls for each cgroup separately and aggregates it at each
level of the hierarchy. This causes additional overhead with psi_avgs_work
being called for each cgroup in the hierarchy. psi_avgs_work has been
highly optimized, however on systems with large number of cgroups the
overhead becomes noticeable.
Systems which use PSI only at the system level could avoid this overhead
if PSI can be configured to skip per-cgroup stall accounting.
Add "cgroup_disable=pressure" kernel command-line option to allow
requesting system-wide only pressure stall accounting. When set, it
keeps system-wide accounting under /proc/pressure/ but skips accounting
for individual cgroups and does not expose PSI nodes in cgroup hierarchy.

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 .../admin-guide/kernel-parameters.txt         |  9 +++-
 include/linux/cgroup-defs.h                   |  1 +
 include/linux/cgroup.h                        |  7 +++
 kernel/cgroup/cgroup.c                        | 46 +++++++++++++++++++
 kernel/sched/psi.c                            |  8 +++-
 5 files changed, 67 insertions(+), 4 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index cb89dbdedc46..653c62142f07 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -497,16 +497,21 @@
 	ccw_timeout_log	[S390]
 			See Documentation/s390/common_io.rst for details.
 
-	cgroup_disable=	[KNL] Disable a particular controller
-			Format: {name of the controller(s) to disable}
+	cgroup_disable=	[KNL] Disable a particular controller or optional feature
+			Format: {name of the controller(s) or feature(s) to disable}
 			The effects of cgroup_disable=foo are:
 			- foo isn't auto-mounted if you mount all cgroups in
 			  a single hierarchy
 			- foo isn't visible as an individually mountable
 			  subsystem
+			- if foo is an optional feature then the feature is
+			  disabled and corresponding cgroup files are not
+			  created
 			{Currently only "memory" controller deal with this and
 			cut the overhead, others just disable the usage. So
 			only cgroup_disable=memory is actually worthy}
+			Specifying "pressure" disables per-cgroup pressure
+			stall information accounting feature
 
 	cgroup_no_v1=	[KNL] Disable cgroup controllers and named hierarchies in v1
 			Format: { { controller | "all" | "named" }
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 559ee05f86b2..671f55cac0f0 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -110,6 +110,7 @@ enum {
 	CFTYPE_NO_PREFIX	= (1 << 3),	/* (DON'T USE FOR NEW FILES) no subsys prefix */
 	CFTYPE_WORLD_WRITABLE	= (1 << 4),	/* (DON'T USE FOR NEW FILES) S_IWUGO */
 	CFTYPE_DEBUG		= (1 << 5),	/* create when cgroup_debug */
+	CFTYPE_PRESSURE		= (1 << 6),	/* only if pressure feature is enabled */
 
 	/* internal flags, do not use outside cgroup core proper */
 	__CFTYPE_ONLY_ON_DFL	= (1 << 16),	/* only on default hierarchy */
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 4f2f79de083e..b929f589968b 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -676,6 +676,8 @@ static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
 	return &cgrp->psi;
 }
 
+bool cgroup_psi_enabled(void);
+
 static inline void cgroup_init_kthreadd(void)
 {
 	/*
@@ -735,6 +737,11 @@ static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
 	return NULL;
 }
 
+static inline bool cgroup_psi_enabled(void)
+{
+	return false;
+}
+
 static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
 					       struct cgroup *ancestor)
 {
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index e049edd66776..c4b16c82e199 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -209,6 +209,22 @@ struct cgroup_namespace init_cgroup_ns = {
 static struct file_system_type cgroup2_fs_type;
 static struct cftype cgroup_base_files[];
 
+/* cgroup optional features */
+enum cgroup_opt_features {
+#ifdef CONFIG_PSI
+	OPT_FEATURE_PRESSURE,
+#endif
+	OPT_FEATURE_COUNT
+};
+
+static const char *cgroup_opt_feature_names[OPT_FEATURE_COUNT] = {
+#ifdef CONFIG_PSI
+	"pressure",
+#endif
+};
+
+static u16 cgroup_feature_disable_mask __read_mostly;
+
 static int cgroup_apply_control(struct cgroup *cgrp);
 static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
 static void css_task_iter_skip(struct css_task_iter *it,
@@ -3631,6 +3647,18 @@ static void cgroup_pressure_release(struct kernfs_open_file *of)
 {
 	psi_trigger_replace(&of->priv, NULL);
 }
+
+bool cgroup_psi_enabled(void)
+{
+	return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;
+}
+
+#else /* CONFIG_PSI */
+bool cgroup_psi_enabled(void)
+{
+	return false;
+}
+
 #endif /* CONFIG_PSI */
 
 static int cgroup_freeze_show(struct seq_file *seq, void *v)
@@ -3881,6 +3909,8 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
 restart:
 	for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
 		/* does cft->flags tell us to skip this file on @cgrp? */
+		if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
+			continue;
 		if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
 			continue;
 		if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
@@ -3958,6 +3988,9 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
 
 		WARN_ON(cft->ss || cft->kf_ops);
 
+		if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
+			continue;
+
 		if (cft->seq_start)
 			kf_ops = &cgroup_kf_ops;
 		else
@@ -4866,6 +4899,7 @@ static struct cftype cgroup_base_files[] = {
 #ifdef CONFIG_PSI
 	{
 		.name = "io.pressure",
+		.flags = CFTYPE_PRESSURE,
 		.seq_show = cgroup_io_pressure_show,
 		.write = cgroup_io_pressure_write,
 		.poll = cgroup_pressure_poll,
@@ -4873,6 +4907,7 @@ static struct cftype cgroup_base_files[] = {
 	},
 	{
 		.name = "memory.pressure",
+		.flags = CFTYPE_PRESSURE,
 		.seq_show = cgroup_memory_pressure_show,
 		.write = cgroup_memory_pressure_write,
 		.poll = cgroup_pressure_poll,
@@ -4880,6 +4915,7 @@ static struct cftype cgroup_base_files[] = {
 	},
 	{
 		.name = "cpu.pressure",
+		.flags = CFTYPE_PRESSURE,
 		.seq_show = cgroup_cpu_pressure_show,
 		.write = cgroup_cpu_pressure_write,
 		.poll = cgroup_pressure_poll,
@@ -6216,6 +6252,13 @@ static int __init cgroup_disable(char *str)
 				continue;
 			cgroup_disable_mask |= 1 << i;
 		}
+
+		for (i = 0; i < OPT_FEATURE_COUNT; i++) {
+			if (strcmp(token, cgroup_opt_feature_names[i]))
+				continue;
+			cgroup_feature_disable_mask |= 1 << i;
+			break;
+		}
 	}
 	return 1;
 }
@@ -6514,6 +6557,9 @@ static ssize_t show_delegatable_files(struct cftype *files, char *buf,
 		if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
 			continue;
 
+		if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
+			continue;
+
 		if (prefix)
 			ret += snprintf(buf + ret, size - ret, "%s.", prefix);
 
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index cc25a3cff41f..c73efd7d4fba 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -747,9 +747,12 @@ static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
 #ifdef CONFIG_CGROUPS
 	struct cgroup *cgroup = NULL;
 
-	if (!*iter)
+	if (!*iter) {
+		/* Skip to psi_system if per-cgroup accounting is disabled */
+		if (!cgroup_psi_enabled())
+			goto update_sys;
 		cgroup = task->cgroups->dfl_cgrp;
-	else if (*iter == &psi_system)
+	} else if (*iter == &psi_system)
 		return NULL;
 	else
 		cgroup = cgroup_parent(*iter);
@@ -758,6 +761,7 @@ static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
 		*iter = cgroup;
 		return cgroup_psi(cgroup);
 	}
+update_sys:
 #else
 	if (*iter)
 		return NULL;
-- 
2.31.1.607.g51e8a6a459-goog


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/1] cgroup: make per-cgroup pressure stall tracking configurable
  2021-05-13 17:53 [PATCH 1/1] cgroup: make per-cgroup pressure stall tracking configurable Suren Baghdasaryan
@ 2021-05-14 11:41 ` Peter Zijlstra
  2021-05-14 15:54   ` Suren Baghdasaryan
  0 siblings, 1 reply; 10+ messages in thread
From: Peter Zijlstra @ 2021-05-14 11:41 UTC (permalink / raw)
  To: Suren Baghdasaryan
  Cc: tj, hannes, lizefan.x, mingo, juri.lelli, vincent.guittot,
	dietmar.eggemann, rostedt, bsegall, mgorman, minchan, corbet,
	bristot, paulmck, rdunlap, akpm, tglx, macro, viresh.kumar,
	mike.kravetz, linux-doc, linux-kernel, cgroups, kernel-team

On Thu, May 13, 2021 at 10:53:49AM -0700, Suren Baghdasaryan wrote:

> +bool cgroup_psi_enabled(void)
> +{
> +	return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;
> +}

> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> index cc25a3cff41f..c73efd7d4fba 100644
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -747,9 +747,12 @@ static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
>  #ifdef CONFIG_CGROUPS
>  	struct cgroup *cgroup = NULL;
>  
> -	if (!*iter)
> +	if (!*iter) {
> +		/* Skip to psi_system if per-cgroup accounting is disabled */
> +		if (!cgroup_psi_enabled())
> +			goto update_sys;
>  		cgroup = task->cgroups->dfl_cgrp;
> -	else if (*iter == &psi_system)
> +	} else if (*iter == &psi_system)
>  		return NULL;
>  	else
>  		cgroup = cgroup_parent(*iter);
> @@ -758,6 +761,7 @@ static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
>  		*iter = cgroup;
>  		return cgroup_psi(cgroup);
>  	}
> +update_sys:
>  #else
>  	if (*iter)
>  		return NULL;

I'm confused; shouldn't that do the same as that #else branch? Also, can
you pretty please make cgroup_psi_enabled() a static_key ?

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/1] cgroup: make per-cgroup pressure stall tracking configurable
  2021-05-14 11:41 ` Peter Zijlstra
@ 2021-05-14 15:54   ` Suren Baghdasaryan
  2021-05-14 17:52     ` Peter Zijlstra
  0 siblings, 1 reply; 10+ messages in thread
From: Suren Baghdasaryan @ 2021-05-14 15:54 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Tejun Heo, Johannes Weiner, lizefan.x, Ingo Molnar, Juri Lelli,
	Vincent Guittot, Dietmar Eggemann, Steven Rostedt,
	Benjamin Segall, mgorman, Minchan Kim, Jonathan Corbet, bristot,
	Paul E . McKenney, rdunlap, Andrew Morton, Thomas Gleixner,
	macro, Viresh Kumar, mike.kravetz, linux-doc, LKML,
	cgroups mailinglist, kernel-team

On Fri, May 14, 2021 at 4:42 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Thu, May 13, 2021 at 10:53:49AM -0700, Suren Baghdasaryan wrote:
>
> > +bool cgroup_psi_enabled(void)
> > +{
> > +     return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;
> > +}
>
> > diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> > index cc25a3cff41f..c73efd7d4fba 100644
> > --- a/kernel/sched/psi.c
> > +++ b/kernel/sched/psi.c
> > @@ -747,9 +747,12 @@ static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
> >  #ifdef CONFIG_CGROUPS
> >       struct cgroup *cgroup = NULL;
> >
> > -     if (!*iter)
> > +     if (!*iter) {
> > +             /* Skip to psi_system if per-cgroup accounting is disabled */
> > +             if (!cgroup_psi_enabled())
> > +                     goto update_sys;
> >               cgroup = task->cgroups->dfl_cgrp;
> > -     else if (*iter == &psi_system)
> > +     } else if (*iter == &psi_system)
> >               return NULL;
> >       else
> >               cgroup = cgroup_parent(*iter);
> > @@ -758,6 +761,7 @@ static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
> >               *iter = cgroup;
> >               return cgroup_psi(cgroup);
> >       }
> > +update_sys:
> >  #else
> >       if (*iter)
> >               return NULL;
>
> I'm confused; shouldn't that do the same as that #else branch?

Correct, for this function CONFIG_CGROUPS=n and
cgroup_disable=pressure are treated the same. True, from the code it's
not very obvious. Do you have some refactoring in mind that would make
it more explicit?

>Also, can you pretty please make cgroup_psi_enabled() a static_key ?

Certainly, will post an update on Monday.
Thanks for the feedback, Peter!

>
> --
> To unsubscribe from this group and stop receiving emails from it, send an email to kernel-team+unsubscribe@android.com.
>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/1] cgroup: make per-cgroup pressure stall tracking configurable
  2021-05-14 15:54   ` Suren Baghdasaryan
@ 2021-05-14 17:52     ` Peter Zijlstra
  2021-05-14 18:20       ` Suren Baghdasaryan
  0 siblings, 1 reply; 10+ messages in thread
From: Peter Zijlstra @ 2021-05-14 17:52 UTC (permalink / raw)
  To: Suren Baghdasaryan
  Cc: Tejun Heo, Johannes Weiner, lizefan.x, Ingo Molnar, Juri Lelli,
	Vincent Guittot, Dietmar Eggemann, Steven Rostedt,
	Benjamin Segall, mgorman, Minchan Kim, Jonathan Corbet, bristot,
	Paul E . McKenney, rdunlap, Andrew Morton, Thomas Gleixner,
	macro, Viresh Kumar, mike.kravetz, linux-doc, LKML,
	cgroups mailinglist, kernel-team

On Fri, May 14, 2021 at 08:54:47AM -0700, Suren Baghdasaryan wrote:

> Correct, for this function CONFIG_CGROUPS=n and
> cgroup_disable=pressure are treated the same. True, from the code it's
> not very obvious. Do you have some refactoring in mind that would make
> it more explicit?

Does this make sense?

--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -744,24 +744,26 @@ static void psi_group_change(struct psi_
 
 static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
 {
+	if (cgroup_psi_enabled()) {
 #ifdef CONFIG_CGROUPS
-	struct cgroup *cgroup = NULL;
+		struct cgroup *cgroup = NULL;
 
-	if (!*iter)
-		cgroup = task->cgroups->dfl_cgrp;
-	else if (*iter == &psi_system)
-		return NULL;
-	else
-		cgroup = cgroup_parent(*iter);
+		if (!*iter)
+			cgroup = task->cgroups->dfl_cgrp;
+		else if (*iter == &psi_system)
+			return NULL;
+		else
+			cgroup = cgroup_parent(*iter);
 
-	if (cgroup && cgroup_parent(cgroup)) {
-		*iter = cgroup;
-		return cgroup_psi(cgroup);
-	}
-#else
-	if (*iter)
-		return NULL;
+		if (cgroup && cgroup_parent(cgroup)) {
+			*iter = cgroup;
+			return cgroup_psi(cgroup);
+		}
 #endif
+	} else {
+		if (*iter)
+			return NULL;
+	}
 	*iter = &psi_system;
 	return &psi_system;
 }

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/1] cgroup: make per-cgroup pressure stall tracking configurable
  2021-05-14 17:52     ` Peter Zijlstra
@ 2021-05-14 18:20       ` Suren Baghdasaryan
  2021-05-14 18:50         ` Suren Baghdasaryan
  0 siblings, 1 reply; 10+ messages in thread
From: Suren Baghdasaryan @ 2021-05-14 18:20 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Tejun Heo, Johannes Weiner, lizefan.x, Ingo Molnar, Juri Lelli,
	Vincent Guittot, Dietmar Eggemann, Steven Rostedt,
	Benjamin Segall, mgorman, Minchan Kim, Jonathan Corbet, bristot,
	Paul E . McKenney, rdunlap, Andrew Morton, Thomas Gleixner,
	macro, Viresh Kumar, mike.kravetz, linux-doc, LKML,
	cgroups mailinglist, kernel-team

On Fri, May 14, 2021 at 10:52 AM Peter Zijlstra <peterz@infradead.org> wrote:
>
> On Fri, May 14, 2021 at 08:54:47AM -0700, Suren Baghdasaryan wrote:
>
> > Correct, for this function CONFIG_CGROUPS=n and
> > cgroup_disable=pressure are treated the same. True, from the code it's
> > not very obvious. Do you have some refactoring in mind that would make
> > it more explicit?
>
> Does this make sense?
>
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -744,24 +744,26 @@ static void psi_group_change(struct psi_
>
>  static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
>  {
> +       if (cgroup_psi_enabled()) {
>  #ifdef CONFIG_CGROUPS
> -       struct cgroup *cgroup = NULL;
> +               struct cgroup *cgroup = NULL;
>
> -       if (!*iter)
> -               cgroup = task->cgroups->dfl_cgrp;
> -       else if (*iter == &psi_system)
> -               return NULL;
> -       else
> -               cgroup = cgroup_parent(*iter);
> +               if (!*iter)
> +                       cgroup = task->cgroups->dfl_cgrp;
> +               else if (*iter == &psi_system)
> +                       return NULL;
> +               else
> +                       cgroup = cgroup_parent(*iter);
>
> -       if (cgroup && cgroup_parent(cgroup)) {
> -               *iter = cgroup;
> -               return cgroup_psi(cgroup);
> -       }
> -#else
> -       if (*iter)
> -               return NULL;
> +               if (cgroup && cgroup_parent(cgroup)) {
> +                       *iter = cgroup;
> +                       return cgroup_psi(cgroup);
> +               }
>  #endif
> +       } else {
> +               if (*iter)
> +                       return NULL;
> +       }
>         *iter = &psi_system;
>         return &psi_system;
>  }

Hmm. Looks like the case when cgroup_psi_enabled()==true and
CONFIG_CGROUPS=n would miss the "if (*iter) return NULL;" condition.
Effectively with CONFIG_CGROUPS=n this becomes:

       if (cgroup_psi_enabled()) {           <== assume this is true
#ifdef CONFIG_CGROUPS                <== compiled out
#endif
       } else {
               if (*iter)                                  <== this
statement will never execute
                       return NULL;
       }
       *iter = &psi_system;
        return &psi_system;

>
> --
> To unsubscribe from this group and stop receiving emails from it, send an email to kernel-team+unsubscribe@android.com.
>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/1] cgroup: make per-cgroup pressure stall tracking configurable
  2021-05-14 18:20       ` Suren Baghdasaryan
@ 2021-05-14 18:50         ` Suren Baghdasaryan
  2021-05-16 19:52           ` Suren Baghdasaryan
  0 siblings, 1 reply; 10+ messages in thread
From: Suren Baghdasaryan @ 2021-05-14 18:50 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Tejun Heo, Johannes Weiner, lizefan.x, Ingo Molnar, Juri Lelli,
	Vincent Guittot, Dietmar Eggemann, Steven Rostedt,
	Benjamin Segall, mgorman, Minchan Kim, Jonathan Corbet, bristot,
	Paul E . McKenney, rdunlap, Andrew Morton, Thomas Gleixner,
	macro, Viresh Kumar, mike.kravetz, linux-doc, LKML,
	cgroups mailinglist, kernel-team

On Fri, May 14, 2021 at 11:20 AM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Fri, May 14, 2021 at 10:52 AM Peter Zijlstra <peterz@infradead.org> wrote:
> >
> > On Fri, May 14, 2021 at 08:54:47AM -0700, Suren Baghdasaryan wrote:
> >
> > > Correct, for this function CONFIG_CGROUPS=n and
> > > cgroup_disable=pressure are treated the same. True, from the code it's
> > > not very obvious. Do you have some refactoring in mind that would make
> > > it more explicit?
> >
> > Does this make sense?
> >
> > --- a/kernel/sched/psi.c
> > +++ b/kernel/sched/psi.c
> > @@ -744,24 +744,26 @@ static void psi_group_change(struct psi_
> >
> >  static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
> >  {
> > +       if (cgroup_psi_enabled()) {
> >  #ifdef CONFIG_CGROUPS
> > -       struct cgroup *cgroup = NULL;
> > +               struct cgroup *cgroup = NULL;
> >
> > -       if (!*iter)
> > -               cgroup = task->cgroups->dfl_cgrp;
> > -       else if (*iter == &psi_system)
> > -               return NULL;
> > -       else
> > -               cgroup = cgroup_parent(*iter);
> > +               if (!*iter)
> > +                       cgroup = task->cgroups->dfl_cgrp;
> > +               else if (*iter == &psi_system)
> > +                       return NULL;
> > +               else
> > +                       cgroup = cgroup_parent(*iter);
> >
> > -       if (cgroup && cgroup_parent(cgroup)) {
> > -               *iter = cgroup;
> > -               return cgroup_psi(cgroup);
> > -       }
> > -#else
> > -       if (*iter)
> > -               return NULL;
> > +               if (cgroup && cgroup_parent(cgroup)) {
> > +                       *iter = cgroup;
> > +                       return cgroup_psi(cgroup);
> > +               }
> >  #endif
> > +       } else {
> > +               if (*iter)
> > +                       return NULL;
> > +       }
> >         *iter = &psi_system;
> >         return &psi_system;
> >  }
>
> Hmm. Looks like the case when cgroup_psi_enabled()==true and
> CONFIG_CGROUPS=n would miss the "if (*iter) return NULL;" condition.
> Effectively with CONFIG_CGROUPS=n this becomes:
>
>        if (cgroup_psi_enabled()) {           <== assume this is true
> #ifdef CONFIG_CGROUPS                <== compiled out
> #endif
>        } else {
>                if (*iter)                                  <== this
> statement will never execute
>                        return NULL;
>        }
>        *iter = &psi_system;
>         return &psi_system;
>

Ah, sorry. I forgot that CONFIG_CGROUPS=n would force
cgroup_psi_enabled()==false (the way function is defined in cgroup.h),
so (CONFIG_CGROUPS=n && cgroup_psi_enabled()==true) is an invalid
configuration. I think adding a comment to your suggestion would make
it more clear.
So your suggestion seems to work. I'll test it and include it in the
next revision. Thanks!


> >
> > --
> > To unsubscribe from this group and stop receiving emails from it, send an email to kernel-team+unsubscribe@android.com.
> >

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/1] cgroup: make per-cgroup pressure stall tracking configurable
  2021-05-14 18:50         ` Suren Baghdasaryan
@ 2021-05-16 19:52           ` Suren Baghdasaryan
  2021-05-17 18:31             ` Johannes Weiner
  0 siblings, 1 reply; 10+ messages in thread
From: Suren Baghdasaryan @ 2021-05-16 19:52 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Tejun Heo, Johannes Weiner, lizefan.x, Ingo Molnar, Juri Lelli,
	Vincent Guittot, Dietmar Eggemann, Steven Rostedt,
	Benjamin Segall, mgorman, Minchan Kim, Jonathan Corbet, bristot,
	Paul E . McKenney, rdunlap, Andrew Morton, Thomas Gleixner,
	macro, Viresh Kumar, mike.kravetz, linux-doc, LKML,
	cgroups mailinglist, kernel-team

On Fri, May 14, 2021 at 11:50 AM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Fri, May 14, 2021 at 11:20 AM Suren Baghdasaryan <surenb@google.com> wrote:
> >
> > On Fri, May 14, 2021 at 10:52 AM Peter Zijlstra <peterz@infradead.org> wrote:
> > >
> > > On Fri, May 14, 2021 at 08:54:47AM -0700, Suren Baghdasaryan wrote:
> > >
> > > > Correct, for this function CONFIG_CGROUPS=n and
> > > > cgroup_disable=pressure are treated the same. True, from the code it's
> > > > not very obvious. Do you have some refactoring in mind that would make
> > > > it more explicit?
> > >
> > > Does this make sense?
> > >
> > > --- a/kernel/sched/psi.c
> > > +++ b/kernel/sched/psi.c
> > > @@ -744,24 +744,26 @@ static void psi_group_change(struct psi_
> > >
> > >  static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
> > >  {
> > > +       if (cgroup_psi_enabled()) {
> > >  #ifdef CONFIG_CGROUPS
> > > -       struct cgroup *cgroup = NULL;
> > > +               struct cgroup *cgroup = NULL;
> > >
> > > -       if (!*iter)
> > > -               cgroup = task->cgroups->dfl_cgrp;
> > > -       else if (*iter == &psi_system)
> > > -               return NULL;
> > > -       else
> > > -               cgroup = cgroup_parent(*iter);
> > > +               if (!*iter)
> > > +                       cgroup = task->cgroups->dfl_cgrp;
> > > +               else if (*iter == &psi_system)
> > > +                       return NULL;
> > > +               else
> > > +                       cgroup = cgroup_parent(*iter);
> > >
> > > -       if (cgroup && cgroup_parent(cgroup)) {
> > > -               *iter = cgroup;
> > > -               return cgroup_psi(cgroup);
> > > -       }
> > > -#else
> > > -       if (*iter)
> > > -               return NULL;
> > > +               if (cgroup && cgroup_parent(cgroup)) {
> > > +                       *iter = cgroup;
> > > +                       return cgroup_psi(cgroup);
> > > +               }
> > >  #endif
> > > +       } else {
> > > +               if (*iter)
> > > +                       return NULL;
> > > +       }
> > >         *iter = &psi_system;
> > >         return &psi_system;
> > >  }
> >
> > Hmm. Looks like the case when cgroup_psi_enabled()==true and
> > CONFIG_CGROUPS=n would miss the "if (*iter) return NULL;" condition.
> > Effectively with CONFIG_CGROUPS=n this becomes:
> >
> >        if (cgroup_psi_enabled()) {           <== assume this is true
> > #ifdef CONFIG_CGROUPS                <== compiled out
> > #endif
> >        } else {
> >                if (*iter)                                  <== this
> > statement will never execute
> >                        return NULL;
> >        }
> >        *iter = &psi_system;
> >         return &psi_system;
> >
>
> Ah, sorry. I forgot that CONFIG_CGROUPS=n would force
> cgroup_psi_enabled()==false (the way function is defined in cgroup.h),
> so (CONFIG_CGROUPS=n && cgroup_psi_enabled()==true) is an invalid
> configuration. I think adding a comment to your suggestion would make
> it more clear.
> So your suggestion seems to work. I'll test it and include it in the
> next revision. Thanks!

After reworking the code to add a static key I had to expand the
#ifdef CONFIG_CGROUPS section, so I think a code refactoring below
would make sense. It localizes config-specific code and it has the
same exact code for CONFIG_CGROUPS=n and for
cgroup_psi_enabled()==false. WDYT?:

--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -181,6 +181,7 @@ struct psi_group psi_system = {
 };

 static void psi_avgs_work(struct work_struct *work);
+static void cgroup_iterator_init(void);

 static void group_init(struct psi_group *group)
 {
@@ -211,6 +212,8 @@ void __init psi_init(void)
                 return;
         }

+        cgroup_iterator_init();
+
         psi_period = jiffies_to_nsecs(PSI_FREQ);
         group_init(&psi_system);
 }
@@ -742,11 +745,31 @@ static void psi_group_change(struct psi_group
*group, int cpu,
                 schedule_delayed_work(&group->avgs_work, PSI_FREQ);
 }

-static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
+static inline struct psi_group *sys_group_iterator(struct task_struct *task,
+                                                   void **iter)
 {
+        *iter = &psi_system;
+        return &psi_system;
+}
+
 #ifdef CONFIG_CGROUPS
+
+DEFINE_STATIC_KEY_FALSE(psi_cgroups_disabled);
+
+static void cgroup_iterator_init(void)
+{
+        if (!cgroup_psi_enabled())
+                static_branch_enable(&psi_cgroups_disabled);
+}
+
+static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
+{
         struct cgroup *cgroup = NULL;

+        /* Skip to psi_system if per-cgroup accounting is disabled */
+        if (static_branch_unlikely(&psi_cgroups_disabled))
+                return *iter ? NULL : sys_group_iterator(task, iter);
+
         if (!*iter)
                 cgroup = task->cgroups->dfl_cgrp;
         else if (*iter == &psi_system)
@@ -758,14 +781,20 @@ static struct psi_group *iterate_groups(struct
task_struct *task, void **iter)
                 *iter = cgroup;
                 return cgroup_psi(cgroup);
         }
-#else
-        if (*iter)
-                return NULL;
-#endif
-        *iter = &psi_system;
-        return &psi_system;
+
+        return sys_group_iterator(task, iter);
 }

+#else /* CONFIG_CGROUPS */
+static inline void cgroup_iterator_init(void) {}
+
+static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
+{
+        return *iter ? NULL : sys_group_iterator(task, iter);
+}
+
+#endif /* CONFIG_CGROUPS */
+


>
>
> > >
> > > --
> > > To unsubscribe from this group and stop receiving emails from it, send an email to kernel-team+unsubscribe@android.com.
> > >

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/1] cgroup: make per-cgroup pressure stall tracking configurable
  2021-05-16 19:52           ` Suren Baghdasaryan
@ 2021-05-17 18:31             ` Johannes Weiner
  2021-05-17 20:02               ` Suren Baghdasaryan
  0 siblings, 1 reply; 10+ messages in thread
From: Johannes Weiner @ 2021-05-17 18:31 UTC (permalink / raw)
  To: Suren Baghdasaryan
  Cc: Peter Zijlstra, Tejun Heo, lizefan.x, Ingo Molnar, Juri Lelli,
	Vincent Guittot, Dietmar Eggemann, Steven Rostedt,
	Benjamin Segall, mgorman, Minchan Kim, Jonathan Corbet, bristot,
	Paul E . McKenney, rdunlap, Andrew Morton, Thomas Gleixner,
	macro, Viresh Kumar, mike.kravetz, linux-doc, LKML,
	cgroups mailinglist, kernel-team

On Sun, May 16, 2021 at 12:52:32PM -0700, Suren Baghdasaryan wrote:
> After reworking the code to add a static key I had to expand the
> #ifdef CONFIG_CGROUPS section, so I think a code refactoring below
> would make sense. It localizes config-specific code and it has the
> same exact code for CONFIG_CGROUPS=n and for
> cgroup_psi_enabled()==false. WDYT?:
> 
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -181,6 +181,7 @@ struct psi_group psi_system = {
>  };
> 
>  static void psi_avgs_work(struct work_struct *work);
> +static void cgroup_iterator_init(void);
> 
>  static void group_init(struct psi_group *group)
>  {
> @@ -211,6 +212,8 @@ void __init psi_init(void)
>                  return;
>          }
> 
> +        cgroup_iterator_init();
> +
>          psi_period = jiffies_to_nsecs(PSI_FREQ);
>          group_init(&psi_system);
>  }
> @@ -742,11 +745,31 @@ static void psi_group_change(struct psi_group
> *group, int cpu,
>                  schedule_delayed_work(&group->avgs_work, PSI_FREQ);
>  }
> 
> -static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
> +static inline struct psi_group *sys_group_iterator(struct task_struct *task,
> +                                                   void **iter)
>  {
> +        *iter = &psi_system;
> +        return &psi_system;
> +}
> +
>  #ifdef CONFIG_CGROUPS
> +
> +DEFINE_STATIC_KEY_FALSE(psi_cgroups_disabled);
> +
> +static void cgroup_iterator_init(void)
> +{
> +        if (!cgroup_psi_enabled())
> +                static_branch_enable(&psi_cgroups_disabled);
> +}
> +
> +static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
> +{
>          struct cgroup *cgroup = NULL;
> 
> +        /* Skip to psi_system if per-cgroup accounting is disabled */
> +        if (static_branch_unlikely(&psi_cgroups_disabled))
> +                return *iter ? NULL : sys_group_iterator(task, iter);
> +
>          if (!*iter)
>                  cgroup = task->cgroups->dfl_cgrp;

That looks over-engineered. You have to check iter whether cgroups are
enabled or not. Pulling the jump label check up doesn't save anything,
but it ends up duplicating code.

What you had in the beginning was better, it just had the system label
in an unexpected place where it would check iter twice in a row.

The (*iter == &psi_system) check inside the cgroups branch has the
same purpose as the (*iter) check in the else branch. We could
consolidate that by pulling it up front.

If we wrap the entire cgroup iteration block into the static branch,
IMO it becomes a bit clearer as well.

How about this?

static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
{
	if (*iter == &psi_system)
		return NULL;

#ifdef CONFIG_CGROUPS
	if (!static_branch_likely(&psi_cgroups_disabled)) {
		struct cgroup *cgroup = NULL;

		if (!*iter)
			cgroup = task->cgroups->dfl_cgrp;
		else
			cgroup = cgroup_parent(*iter);

		if (cgroup && cgroup_parent(cgroup)) {
			*iter = cgroup;
			return cgroup_psi(cgroup);
		}
	}
#endif

	*iter = &psi_system;
	return &psi_system;
}

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/1] cgroup: make per-cgroup pressure stall tracking configurable
  2021-05-17 18:31             ` Johannes Weiner
@ 2021-05-17 20:02               ` Suren Baghdasaryan
  2021-05-18  2:05                 ` Suren Baghdasaryan
  0 siblings, 1 reply; 10+ messages in thread
From: Suren Baghdasaryan @ 2021-05-17 20:02 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Peter Zijlstra, Tejun Heo, lizefan.x, Ingo Molnar, Juri Lelli,
	Vincent Guittot, Dietmar Eggemann, Steven Rostedt,
	Benjamin Segall, mgorman, Minchan Kim, Jonathan Corbet, bristot,
	Paul E . McKenney, rdunlap, Andrew Morton, Thomas Gleixner,
	macro, Viresh Kumar, mike.kravetz, linux-doc, LKML,
	cgroups mailinglist, kernel-team

On Mon, May 17, 2021 at 11:31 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> On Sun, May 16, 2021 at 12:52:32PM -0700, Suren Baghdasaryan wrote:
> > After reworking the code to add a static key I had to expand the
> > #ifdef CONFIG_CGROUPS section, so I think a code refactoring below
> > would make sense. It localizes config-specific code and it has the
> > same exact code for CONFIG_CGROUPS=n and for
> > cgroup_psi_enabled()==false. WDYT?:
> >
> > --- a/kernel/sched/psi.c
> > +++ b/kernel/sched/psi.c
> > @@ -181,6 +181,7 @@ struct psi_group psi_system = {
> >  };
> >
> >  static void psi_avgs_work(struct work_struct *work);
> > +static void cgroup_iterator_init(void);
> >
> >  static void group_init(struct psi_group *group)
> >  {
> > @@ -211,6 +212,8 @@ void __init psi_init(void)
> >                  return;
> >          }
> >
> > +        cgroup_iterator_init();
> > +
> >          psi_period = jiffies_to_nsecs(PSI_FREQ);
> >          group_init(&psi_system);
> >  }
> > @@ -742,11 +745,31 @@ static void psi_group_change(struct psi_group
> > *group, int cpu,
> >                  schedule_delayed_work(&group->avgs_work, PSI_FREQ);
> >  }
> >
> > -static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
> > +static inline struct psi_group *sys_group_iterator(struct task_struct *task,
> > +                                                   void **iter)
> >  {
> > +        *iter = &psi_system;
> > +        return &psi_system;
> > +}
> > +
> >  #ifdef CONFIG_CGROUPS
> > +
> > +DEFINE_STATIC_KEY_FALSE(psi_cgroups_disabled);
> > +
> > +static void cgroup_iterator_init(void)
> > +{
> > +        if (!cgroup_psi_enabled())
> > +                static_branch_enable(&psi_cgroups_disabled);
> > +}
> > +
> > +static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
> > +{
> >          struct cgroup *cgroup = NULL;
> >
> > +        /* Skip to psi_system if per-cgroup accounting is disabled */
> > +        if (static_branch_unlikely(&psi_cgroups_disabled))
> > +                return *iter ? NULL : sys_group_iterator(task, iter);
> > +
> >          if (!*iter)
> >                  cgroup = task->cgroups->dfl_cgrp;
>
> That looks over-engineered. You have to check iter whether cgroups are
> enabled or not. Pulling the jump label check up doesn't save anything,
> but it ends up duplicating code.
>
> What you had in the beginning was better, it just had the system label
> in an unexpected place where it would check iter twice in a row.
>
> The (*iter == &psi_system) check inside the cgroups branch has the
> same purpose as the (*iter) check in the else branch. We could
> consolidate that by pulling it up front.
>
> If we wrap the entire cgroup iteration block into the static branch,
> IMO it becomes a bit clearer as well.
>
> How about this?
>
> static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
> {
>         if (*iter == &psi_system)
>                 return NULL;
>
> #ifdef CONFIG_CGROUPS
>         if (!static_branch_likely(&psi_cgroups_disabled)) {
>                 struct cgroup *cgroup = NULL;
>
>                 if (!*iter)
>                         cgroup = task->cgroups->dfl_cgrp;
>                 else
>                         cgroup = cgroup_parent(*iter);
>
>                 if (cgroup && cgroup_parent(cgroup)) {
>                         *iter = cgroup;
>                         return cgroup_psi(cgroup);
>                 }
>         }
> #endif
>
>         *iter = &psi_system;
>         return &psi_system;
> }

This looks great to me. Will use it in the next version. Thanks!

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/1] cgroup: make per-cgroup pressure stall tracking configurable
  2021-05-17 20:02               ` Suren Baghdasaryan
@ 2021-05-18  2:05                 ` Suren Baghdasaryan
  0 siblings, 0 replies; 10+ messages in thread
From: Suren Baghdasaryan @ 2021-05-18  2:05 UTC (permalink / raw)
  To: Johannes Weiner
  Cc: Peter Zijlstra, Tejun Heo, lizefan.x, Ingo Molnar, Juri Lelli,
	Vincent Guittot, Dietmar Eggemann, Steven Rostedt,
	Benjamin Segall, mgorman, Minchan Kim, Jonathan Corbet, bristot,
	Paul E . McKenney, rdunlap, Andrew Morton, Thomas Gleixner,
	macro, Viresh Kumar, mike.kravetz, linux-doc, LKML,
	cgroups mailinglist, kernel-team

On Mon, May 17, 2021 at 1:02 PM Suren Baghdasaryan <surenb@google.com> wrote:
>
> On Mon, May 17, 2021 at 11:31 AM Johannes Weiner <hannes@cmpxchg.org> wrote:
> >
> > On Sun, May 16, 2021 at 12:52:32PM -0700, Suren Baghdasaryan wrote:
> > > After reworking the code to add a static key I had to expand the
> > > #ifdef CONFIG_CGROUPS section, so I think a code refactoring below
> > > would make sense. It localizes config-specific code and it has the
> > > same exact code for CONFIG_CGROUPS=n and for
> > > cgroup_psi_enabled()==false. WDYT?:
> > >
> > > --- a/kernel/sched/psi.c
> > > +++ b/kernel/sched/psi.c
> > > @@ -181,6 +181,7 @@ struct psi_group psi_system = {
> > >  };
> > >
> > >  static void psi_avgs_work(struct work_struct *work);
> > > +static void cgroup_iterator_init(void);
> > >
> > >  static void group_init(struct psi_group *group)
> > >  {
> > > @@ -211,6 +212,8 @@ void __init psi_init(void)
> > >                  return;
> > >          }
> > >
> > > +        cgroup_iterator_init();
> > > +
> > >          psi_period = jiffies_to_nsecs(PSI_FREQ);
> > >          group_init(&psi_system);
> > >  }
> > > @@ -742,11 +745,31 @@ static void psi_group_change(struct psi_group
> > > *group, int cpu,
> > >                  schedule_delayed_work(&group->avgs_work, PSI_FREQ);
> > >  }
> > >
> > > -static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
> > > +static inline struct psi_group *sys_group_iterator(struct task_struct *task,
> > > +                                                   void **iter)
> > >  {
> > > +        *iter = &psi_system;
> > > +        return &psi_system;
> > > +}
> > > +
> > >  #ifdef CONFIG_CGROUPS
> > > +
> > > +DEFINE_STATIC_KEY_FALSE(psi_cgroups_disabled);
> > > +
> > > +static void cgroup_iterator_init(void)
> > > +{
> > > +        if (!cgroup_psi_enabled())
> > > +                static_branch_enable(&psi_cgroups_disabled);
> > > +}
> > > +
> > > +static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
> > > +{
> > >          struct cgroup *cgroup = NULL;
> > >
> > > +        /* Skip to psi_system if per-cgroup accounting is disabled */
> > > +        if (static_branch_unlikely(&psi_cgroups_disabled))
> > > +                return *iter ? NULL : sys_group_iterator(task, iter);
> > > +
> > >          if (!*iter)
> > >                  cgroup = task->cgroups->dfl_cgrp;
> >
> > That looks over-engineered. You have to check iter whether cgroups are
> > enabled or not. Pulling the jump label check up doesn't save anything,
> > but it ends up duplicating code.
> >
> > What you had in the beginning was better, it just had the system label
> > in an unexpected place where it would check iter twice in a row.
> >
> > The (*iter == &psi_system) check inside the cgroups branch has the
> > same purpose as the (*iter) check in the else branch. We could
> > consolidate that by pulling it up front.
> >
> > If we wrap the entire cgroup iteration block into the static branch,
> > IMO it becomes a bit clearer as well.
> >
> > How about this?
> >
> > static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
> > {
> >         if (*iter == &psi_system)
> >                 return NULL;
> >
> > #ifdef CONFIG_CGROUPS
> >         if (!static_branch_likely(&psi_cgroups_disabled)) {
> >                 struct cgroup *cgroup = NULL;
> >
> >                 if (!*iter)
> >                         cgroup = task->cgroups->dfl_cgrp;
> >                 else
> >                         cgroup = cgroup_parent(*iter);
> >
> >                 if (cgroup && cgroup_parent(cgroup)) {
> >                         *iter = cgroup;
> >                         return cgroup_psi(cgroup);
> >                 }
> >         }
> > #endif
> >
> >         *iter = &psi_system;
> >         return &psi_system;
> > }
>
> This looks great to me. Will use it in the next version. Thanks!

V2 is posted at https://lore.kernel.org/patchwork/patch/1430980

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2021-05-18  2:05 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-05-13 17:53 [PATCH 1/1] cgroup: make per-cgroup pressure stall tracking configurable Suren Baghdasaryan
2021-05-14 11:41 ` Peter Zijlstra
2021-05-14 15:54   ` Suren Baghdasaryan
2021-05-14 17:52     ` Peter Zijlstra
2021-05-14 18:20       ` Suren Baghdasaryan
2021-05-14 18:50         ` Suren Baghdasaryan
2021-05-16 19:52           ` Suren Baghdasaryan
2021-05-17 18:31             ` Johannes Weiner
2021-05-17 20:02               ` Suren Baghdasaryan
2021-05-18  2:05                 ` Suren Baghdasaryan

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).