* [PATCH v2] sched/fair: Fix util_est UTIL_AVG_UNCHANGED handling
@ 2021-06-02 14:58 Dietmar Eggemann
  2021-06-03  3:01 ` Xuewen Yan
                   ` (3 more replies)
  0 siblings, 4 replies; 5+ messages in thread
From: Dietmar Eggemann @ 2021-06-02 14:58 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Xuewen Yan, Vincent Donnefort
  Cc: Juri Lelli, Vincent Guittot, Steven Rostedt, Patrick Bellasi,
	Quentin Perret, linux-kernel

The util_est internal UTIL_AVG_UNCHANGED flag which is used to prevent
unnecessary util_est updates uses the LSB of util_est.enqueued. It is
exposed via _task_util_est() (and task_util_est()).

Commit 92a801e5d5b7 ("sched/fair: Mask UTIL_AVG_UNCHANGED usages")
mentions that the LSB is lost for util_est resolution but
find_energy_efficient_cpu() checks if task_util_est() returns 0 to
return prev_cpu early.

_task_util_est() returns the max value of util_est.ewma and
util_est.enqueued or'ed w/ UTIL_AVG_UNCHANGED.
So task_util_est() returning the max of task_util() and
_task_util_est() will never return 0 under the default
SCHED_FEAT(UTIL_EST, true).
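
For illustration only, here is a minimal user-space sketch of the old
LSB-based behaviour (not kernel code; simplified types, invented values;
max() and the 0x1 flag value mimic the pre-patch code):

  #include <stdio.h>

  #define UTIL_AVG_UNCHANGED_OLD  0x1   /* old LSB flag */
  #define max(a, b)  ((a) > (b) ? (a) : (b))

  int main(void)
  {
          unsigned int ewma = 0, enqueued = 0;  /* fully decayed task */

          /* old _task_util_est(): flag or'ed into the return value */
          unsigned int est = max(ewma, enqueued) | UTIL_AVG_UNCHANGED_OLD;

          printf("%u\n", est);  /* prints 1: _task_util_est() is never 0 */
          return 0;
  }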

To fix this use the MSB of util_est.enqueued instead and keep the flag
util_est internal, i.e. don't export it via _task_util_est().

The maximal possible util_avg value for a task is 1024 so the MSB of
'unsigned int util_est.enqueued' isn't used to store a util value.

As a caveat, the code behind the util_est_se trace point has to filter
out UTIL_AVG_UNCHANGED to see the real util_est.enqueued value, which
should be easy to do.
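
For example, a consumer attached to that bare trace point could mask the
flag itself. A hedged sketch (hypothetical probe name; signature assumed
from the sched_util_est_se_tp declaration; registration omitted):

  /* hypothetical probe: strip the flag before using the value */
  static void probe_util_est_se(void *data, struct sched_entity *se)
  {
          unsigned int enqueued = READ_ONCE(se->avg.util_est.enqueued);

          enqueued &= ~UTIL_AVG_UNCHANGED;  /* filter out the MSB flag */
          /* ... report 'enqueued' as the real util_est value ... */
  }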

This also fixes an issue reported by Xuewen Yan that util_est_update()
only used UTIL_AVG_UNCHANGED for the subtrahend of the equation:

  last_enqueued_diff = ue.enqueued - (task_util() | UTIL_AVG_UNCHANGED)
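
A hypothetical example of the resulting off-by-one (values invented): the
minuend's LSB may already have been cleared by cfs_se_util_change() while
the subtrahend's LSB is forced to 1:

  ue.enqueued                      = 200
  task_util() | UTIL_AVG_UNCHANGED = 200 | 0x1 = 201
  last_enqueued_diff               = 200 - 201 = -1   (instead of 0)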

Fixes: b89997aa88f0b ("sched/pelt: Fix task util_est update filtering")
Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
---

v1->v2:

1) Move UTIL_AVG_UNCHANGED definition into struct util_est.
2) Hide UTIL_AVG_UNCHANGED in proc_sched_show_task().

 include/linux/sched.h |  8 ++++++++
 kernel/sched/debug.c  |  3 ++-
 kernel/sched/fair.c   |  5 +++--
 kernel/sched/pelt.h   | 11 +----------
 4 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index c7e7d50e2fdc..ac5a7d29fd4f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -350,11 +350,19 @@ struct load_weight {
  * Only for tasks we track a moving average of the past instantaneous
  * estimated utilization. This allows to absorb sporadic drops in utilization
  * of an otherwise almost periodic task.
+ *
+ * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
+ * updates. When a task is dequeued, its util_est should not be updated if its
+ * util_avg has not been updated in the meantime.
+ * This information is mapped into the MSB bit of util_est.enqueued at dequeue
+ * time. Since max value of util_est.enqueued for a task is 1024 (PELT util_avg
+ * for a task) it is safe to use MSB.
  */
 struct util_est {
 	unsigned int			enqueued;
 	unsigned int			ewma;
 #define UTIL_EST_WEIGHT_SHIFT		2
+#define UTIL_AVG_UNCHANGED		0x80000000
 } __attribute__((__aligned__(sizeof(u64))));
 
 /*
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 3bdee5fd7d29..0c5ec2776ddf 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -885,6 +885,7 @@ static const struct seq_operations sched_debug_sops = {
 #define __PS(S, F) SEQ_printf(m, "%-45s:%21Ld\n", S, (long long)(F))
 #define __P(F) __PS(#F, F)
 #define   P(F) __PS(#F, p->F)
+#define   PM(F, M) __PS(#F, p->F & (M))
 #define __PSN(S, F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", S, SPLIT_NS((long long)(F)))
 #define __PN(F) __PSN(#F, F)
 #define   PN(F) __PSN(#F, p->F)
@@ -1011,7 +1012,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
 	P(se.avg.util_avg);
 	P(se.avg.last_update_time);
 	P(se.avg.util_est.ewma);
-	P(se.avg.util_est.enqueued);
+	PM(se.avg.util_est.enqueued, ~UTIL_AVG_UNCHANGED);
 #endif
 #ifdef CONFIG_UCLAMP_TASK
 	__PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a2c30e52de76..56747df3f796 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3856,7 +3856,7 @@ static inline unsigned long _task_util_est(struct task_struct *p)
 {
 	struct util_est ue = READ_ONCE(p->se.avg.util_est);
 
-	return (max(ue.ewma, ue.enqueued) | UTIL_AVG_UNCHANGED);
+	return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
 }
 
 static inline unsigned long task_util_est(struct task_struct *p)
@@ -3956,7 +3956,7 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
 	 * Reset EWMA on utilization increases, the moving average is used only
 	 * to smooth utilization decreases.
 	 */
-	ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
+	ue.enqueued = task_util(p);
 	if (sched_feat(UTIL_EST_FASTUP)) {
 		if (ue.ewma < ue.enqueued) {
 			ue.ewma = ue.enqueued;
@@ -4005,6 +4005,7 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
 	ue.ewma  += last_ewma_diff;
 	ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
 done:
+	ue.enqueued |= UTIL_AVG_UNCHANGED;
 	WRITE_ONCE(p->se.avg.util_est, ue);
 
 	trace_sched_util_est_se_tp(&p->se);
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 9ed6d8c414ad..e06071bf3472 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -42,15 +42,6 @@ static inline u32 get_pelt_divider(struct sched_avg *avg)
 	return LOAD_AVG_MAX - 1024 + avg->period_contrib;
 }
 
-/*
- * When a task is dequeued, its estimated utilization should not be update if
- * its util_avg has not been updated at least once.
- * This flag is used to synchronize util_avg updates with util_est updates.
- * We map this information into the LSB bit of the utilization saved at
- * dequeue time (i.e. util_est.dequeued).
- */
-#define UTIL_AVG_UNCHANGED 0x1
-
 static inline void cfs_se_util_change(struct sched_avg *avg)
 {
 	unsigned int enqueued;
@@ -58,7 +49,7 @@ static inline void cfs_se_util_change(struct sched_avg *avg)
 	if (!sched_feat(UTIL_EST))
 		return;
 
-	/* Avoid store if the flag has been already set */
+	/* Avoid store if the flag has been already reset */
 	enqueued = avg->util_est.enqueued;
 	if (!(enqueued & UTIL_AVG_UNCHANGED))
 		return;
-- 
2.25.1



* Re: [PATCH v2] sched/fair: Fix util_est UTIL_AVG_UNCHANGED handling
  2021-06-02 14:58 [PATCH v2] sched/fair: Fix util_est UTIL_AVG_UNCHANGED handling Dietmar Eggemann
@ 2021-06-03  3:01 ` Xuewen Yan
  2021-06-03  8:29 ` Vincent Donnefort
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 5+ messages in thread
From: Xuewen Yan @ 2021-06-03  3:01 UTC (permalink / raw)
  To: Dietmar Eggemann
  Cc: Ingo Molnar, Peter Zijlstra, Vincent Donnefort, Juri Lelli,
	Vincent Guittot, Steven Rostedt, Patrick Bellasi, Quentin Perret,
	linux-kernel

Hi


On Wed, Jun 2, 2021 at 10:58 PM Dietmar Eggemann
<dietmar.eggemann@arm.com> wrote:
>
> The util_est internal UTIL_AVG_UNCHANGED flag which is used to prevent
> unnecessary util_est updates uses the LSB of util_est.enqueued. It is
> exposed via _task_util_est() (and task_util_est()).
>
> Commit 92a801e5d5b7 ("sched/fair: Mask UTIL_AVG_UNCHANGED usages")
> mentions that the LSB is lost for util_est resolution but
> find_energy_efficient_cpu() checks if task_util_est() returns 0 to
> return prev_cpu early.
>
> _task_util_est() returns the max value of util_est.ewma and
> util_est.enqueued or'ed w/ UTIL_AVG_UNCHANGED.
> So task_util_est() returning the max of task_util() and
> _task_util_est() will never return 0 under the default
> SCHED_FEAT(UTIL_EST, true).
>
> To fix this use the MSB of util_est.enqueued instead and keep the flag
> util_est internal, i.e. don't export it via _task_util_est().
>
> The maximal possible util_avg value for a task is 1024 so the MSB of
> 'unsigned int util_est.enqueued' isn't used to store a util value.
>
> As a caveat, the code behind the util_est_se trace point has to filter
> out UTIL_AVG_UNCHANGED to see the real util_est.enqueued value, which
> should be easy to do.
>
> This also fixes an issue reported by Xuewen Yan that util_est_update()
> only used UTIL_AVG_UNCHANGED for the subtrahend of the equation:
>
>   last_enqueued_diff = ue.enqueued - (task_util() | UTIL_AVG_UNCHANGED)

This makes sense for the issue.

>
> Fixes: b89997aa88f0b ("sched/pelt: Fix task util_est update filtering")
> Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>

Reviewed-by: Xuewen Yan <xuewen.yan@unisoc.com>

> ---
>
> v1->v2:
>
> 1) Move UTIL_AVG_UNCHANGED definition into struct util_est.
> 2) Hide UTIL_AVG_UNCHANGED in proc_sched_show_task().
>
>  include/linux/sched.h |  8 ++++++++
>  kernel/sched/debug.c  |  3 ++-
>  kernel/sched/fair.c   |  5 +++--
>  kernel/sched/pelt.h   | 11 +----------
>  4 files changed, 14 insertions(+), 13 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index c7e7d50e2fdc..ac5a7d29fd4f 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -350,11 +350,19 @@ struct load_weight {
>   * Only for tasks we track a moving average of the past instantaneous
>   * estimated utilization. This allows to absorb sporadic drops in utilization
>   * of an otherwise almost periodic task.
> + *
> + * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
> + * updates. When a task is dequeued, its util_est should not be updated if its
> + * util_avg has not been updated in the meantime.
> + * This information is mapped into the MSB bit of util_est.enqueued at dequeue
> + * time. Since max value of util_est.enqueued for a task is 1024 (PELT util_avg
> + * for a task) it is safe to use MSB.
>   */
>  struct util_est {
>         unsigned int                    enqueued;
>         unsigned int                    ewma;
>  #define UTIL_EST_WEIGHT_SHIFT          2
> +#define UTIL_AVG_UNCHANGED             0x80000000
>  } __attribute__((__aligned__(sizeof(u64))));
>
>  /*
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index 3bdee5fd7d29..0c5ec2776ddf 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -885,6 +885,7 @@ static const struct seq_operations sched_debug_sops = {
>  #define __PS(S, F) SEQ_printf(m, "%-45s:%21Ld\n", S, (long long)(F))
>  #define __P(F) __PS(#F, F)
>  #define   P(F) __PS(#F, p->F)
> +#define   PM(F, M) __PS(#F, p->F & (M))
>  #define __PSN(S, F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", S, SPLIT_NS((long long)(F)))
>  #define __PN(F) __PSN(#F, F)
>  #define   PN(F) __PSN(#F, p->F)
> @@ -1011,7 +1012,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
>         P(se.avg.util_avg);
>         P(se.avg.last_update_time);
>         P(se.avg.util_est.ewma);
> -       P(se.avg.util_est.enqueued);
> +       PM(se.avg.util_est.enqueued, ~UTIL_AVG_UNCHANGED);
>  #endif
>  #ifdef CONFIG_UCLAMP_TASK
>         __PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value);
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index a2c30e52de76..56747df3f796 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3856,7 +3856,7 @@ static inline unsigned long _task_util_est(struct task_struct *p)
>  {
>         struct util_est ue = READ_ONCE(p->se.avg.util_est);
>
> -       return (max(ue.ewma, ue.enqueued) | UTIL_AVG_UNCHANGED);
> +       return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
>  }
>
>  static inline unsigned long task_util_est(struct task_struct *p)
> @@ -3956,7 +3956,7 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
>          * Reset EWMA on utilization increases, the moving average is used only
>          * to smooth utilization decreases.
>          */
> -       ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
> +       ue.enqueued = task_util(p);
>         if (sched_feat(UTIL_EST_FASTUP)) {
>                 if (ue.ewma < ue.enqueued) {
>                         ue.ewma = ue.enqueued;
> @@ -4005,6 +4005,7 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
>         ue.ewma  += last_ewma_diff;
>         ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
>  done:
> +       ue.enqueued |= UTIL_AVG_UNCHANGED;
>         WRITE_ONCE(p->se.avg.util_est, ue);
>
>         trace_sched_util_est_se_tp(&p->se);
> diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
> index 9ed6d8c414ad..e06071bf3472 100644
> --- a/kernel/sched/pelt.h
> +++ b/kernel/sched/pelt.h
> @@ -42,15 +42,6 @@ static inline u32 get_pelt_divider(struct sched_avg *avg)
>         return LOAD_AVG_MAX - 1024 + avg->period_contrib;
>  }
>
> -/*
> - * When a task is dequeued, its estimated utilization should not be update if
> - * its util_avg has not been updated at least once.
> - * This flag is used to synchronize util_avg updates with util_est updates.
> - * We map this information into the LSB bit of the utilization saved at
> - * dequeue time (i.e. util_est.dequeued).
> - */
> -#define UTIL_AVG_UNCHANGED 0x1
> -
>  static inline void cfs_se_util_change(struct sched_avg *avg)
>  {
>         unsigned int enqueued;
> @@ -58,7 +49,7 @@ static inline void cfs_se_util_change(struct sched_avg *avg)
>         if (!sched_feat(UTIL_EST))
>                 return;
>
> -       /* Avoid store if the flag has been already set */
> +       /* Avoid store if the flag has been already reset */
>         enqueued = avg->util_est.enqueued;
>         if (!(enqueued & UTIL_AVG_UNCHANGED))
>                 return;
> --
> 2.25.1
>


* Re: [PATCH v2] sched/fair: Fix util_est UTIL_AVG_UNCHANGED handling
  2021-06-02 14:58 [PATCH v2] sched/fair: Fix util_est UTIL_AVG_UNCHANGED handling Dietmar Eggemann
  2021-06-03  3:01 ` Xuewen Yan
@ 2021-06-03  8:29 ` Vincent Donnefort
  2021-06-03 12:48 ` Vincent Guittot
  2021-06-04 13:38 ` [tip: sched/urgent] " tip-bot2 for Dietmar Eggemann
  3 siblings, 0 replies; 5+ messages in thread
From: Vincent Donnefort @ 2021-06-03  8:29 UTC (permalink / raw)
  To: Dietmar Eggemann
  Cc: Ingo Molnar, Peter Zijlstra, Xuewen Yan, Juri Lelli,
	Vincent Guittot, Steven Rostedt, Patrick Bellasi, Quentin Perret,
	linux-kernel

On Wed, Jun 02, 2021 at 04:58:08PM +0200, Dietmar Eggemann wrote:
> The util_est internal UTIL_AVG_UNCHANGED flag which is used to prevent
> unnecessary util_est updates uses the LSB of util_est.enqueued. It is
> exposed via _task_util_est() (and task_util_est()).
> 
> Commit 92a801e5d5b7 ("sched/fair: Mask UTIL_AVG_UNCHANGED usages")
> mentions that the LSB is lost for util_est resolution but
> find_energy_efficient_cpu() checks if task_util_est() returns 0 to
> return prev_cpu early.
> 
> _task_util_est() returns the max value of util_est.ewma and
> util_est.enqueued or'ed w/ UTIL_AVG_UNCHANGED.
> So task_util_est() returning the max of task_util() and
> _task_util_est() will never return 0 under the default
> SCHED_FEAT(UTIL_EST, true).
> 
> To fix this use the MSB of util_est.enqueued instead and keep the flag
> util_est internal, i.e. don't export it via _task_util_est().
> 
> The maximal possible util_avg value for a task is 1024 so the MSB of
> 'unsigned int util_est.enqueued' isn't used to store a util value.
> 
> As a caveat, the code behind the util_est_se trace point has to filter
> out UTIL_AVG_UNCHANGED to see the real util_est.enqueued value, which
> should be easy to do.
> 
> This also fixes an issue reported by Xuewen Yan that util_est_update()
> only used UTIL_AVG_UNCHANGED for the subtrahend of the equation:
> 
>   last_enqueued_diff = ue.enqueued - (task_util() | UTIL_AVG_UNCHANGED)
> 
> Fixes: b89997aa88f0b ("sched/pelt: Fix task util_est update filtering")
> Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>

Reviewed-by: Vincent Donnefort <vincent.donnefort@arm.com>

> ---
> 
> v1->v2:
> 
> 1) Move UTIL_AVG_UNCHANGED definition into struct util_est.
> 2) Hide UTIL_AVG_UNCHANGED in proc_sched_show_task().
> 
>  include/linux/sched.h |  8 ++++++++
>  kernel/sched/debug.c  |  3 ++-
>  kernel/sched/fair.c   |  5 +++--
>  kernel/sched/pelt.h   | 11 +----------
>  4 files changed, 14 insertions(+), 13 deletions(-)
> 
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index c7e7d50e2fdc..ac5a7d29fd4f 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -350,11 +350,19 @@ struct load_weight {
>   * Only for tasks we track a moving average of the past instantaneous
>   * estimated utilization. This allows to absorb sporadic drops in utilization
>   * of an otherwise almost periodic task.
> + *
> + * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
> + * updates. When a task is dequeued, its util_est should not be updated if its
> + * util_avg has not been updated in the meantime.
> + * This information is mapped into the MSB bit of util_est.enqueued at dequeue
> + * time. Since max value of util_est.enqueued for a task is 1024 (PELT util_avg
> + * for a task) it is safe to use MSB.
>   */
>  struct util_est {
>  	unsigned int			enqueued;
>  	unsigned int			ewma;
>  #define UTIL_EST_WEIGHT_SHIFT		2
> +#define UTIL_AVG_UNCHANGED		0x80000000
>  } __attribute__((__aligned__(sizeof(u64))));
>  
>  /*
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index 3bdee5fd7d29..0c5ec2776ddf 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -885,6 +885,7 @@ static const struct seq_operations sched_debug_sops = {
>  #define __PS(S, F) SEQ_printf(m, "%-45s:%21Ld\n", S, (long long)(F))
>  #define __P(F) __PS(#F, F)
>  #define   P(F) __PS(#F, p->F)
> +#define   PM(F, M) __PS(#F, p->F & (M))
>  #define __PSN(S, F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", S, SPLIT_NS((long long)(F)))
>  #define __PN(F) __PSN(#F, F)
>  #define   PN(F) __PSN(#F, p->F)
> @@ -1011,7 +1012,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
>  	P(se.avg.util_avg);
>  	P(se.avg.last_update_time);
>  	P(se.avg.util_est.ewma);
> -	P(se.avg.util_est.enqueued);
> +	PM(se.avg.util_est.enqueued, ~UTIL_AVG_UNCHANGED);
>  #endif
>  #ifdef CONFIG_UCLAMP_TASK
>  	__PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value);
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index a2c30e52de76..56747df3f796 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3856,7 +3856,7 @@ static inline unsigned long _task_util_est(struct task_struct *p)
>  {
>  	struct util_est ue = READ_ONCE(p->se.avg.util_est);
>  
> -	return (max(ue.ewma, ue.enqueued) | UTIL_AVG_UNCHANGED);
> +	return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
>  }
>  
>  static inline unsigned long task_util_est(struct task_struct *p)
> @@ -3956,7 +3956,7 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
>  	 * Reset EWMA on utilization increases, the moving average is used only
>  	 * to smooth utilization decreases.
>  	 */
> -	ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
> +	ue.enqueued = task_util(p);
>  	if (sched_feat(UTIL_EST_FASTUP)) {
>  		if (ue.ewma < ue.enqueued) {
>  			ue.ewma = ue.enqueued;
> @@ -4005,6 +4005,7 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
>  	ue.ewma  += last_ewma_diff;
>  	ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
>  done:
> +	ue.enqueued |= UTIL_AVG_UNCHANGED;
>  	WRITE_ONCE(p->se.avg.util_est, ue);
>  
>  	trace_sched_util_est_se_tp(&p->se);
> diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
> index 9ed6d8c414ad..e06071bf3472 100644
> --- a/kernel/sched/pelt.h
> +++ b/kernel/sched/pelt.h
> @@ -42,15 +42,6 @@ static inline u32 get_pelt_divider(struct sched_avg *avg)
>  	return LOAD_AVG_MAX - 1024 + avg->period_contrib;
>  }
>  
> -/*
> - * When a task is dequeued, its estimated utilization should not be update if
> - * its util_avg has not been updated at least once.
> - * This flag is used to synchronize util_avg updates with util_est updates.
> - * We map this information into the LSB bit of the utilization saved at
> - * dequeue time (i.e. util_est.dequeued).
> - */
> -#define UTIL_AVG_UNCHANGED 0x1
> -
>  static inline void cfs_se_util_change(struct sched_avg *avg)
>  {
>  	unsigned int enqueued;
> @@ -58,7 +49,7 @@ static inline void cfs_se_util_change(struct sched_avg *avg)
>  	if (!sched_feat(UTIL_EST))
>  		return;
>  
> -	/* Avoid store if the flag has been already set */
> +	/* Avoid store if the flag has been already reset */
>  	enqueued = avg->util_est.enqueued;
>  	if (!(enqueued & UTIL_AVG_UNCHANGED))
>  		return;
> -- 
> 2.25.1
> 


* Re: [PATCH v2] sched/fair: Fix util_est UTIL_AVG_UNCHANGED handling
  2021-06-02 14:58 [PATCH v2] sched/fair: Fix util_est UTIL_AVG_UNCHANGED handling Dietmar Eggemann
  2021-06-03  3:01 ` Xuewen Yan
  2021-06-03  8:29 ` Vincent Donnefort
@ 2021-06-03 12:48 ` Vincent Guittot
  2021-06-04 13:38 ` [tip: sched/urgent] " tip-bot2 for Dietmar Eggemann
  3 siblings, 0 replies; 5+ messages in thread
From: Vincent Guittot @ 2021-06-03 12:48 UTC (permalink / raw)
  To: Dietmar Eggemann
  Cc: Ingo Molnar, Peter Zijlstra, Xuewen Yan, Vincent Donnefort,
	Juri Lelli, Steven Rostedt, Patrick Bellasi, Quentin Perret,
	linux-kernel

On Wed, 2 Jun 2021 at 16:58, Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:
>
> The util_est internal UTIL_AVG_UNCHANGED flag which is used to prevent
> unnecessary util_est updates uses the LSB of util_est.enqueued. It is
> exposed via _task_util_est() (and task_util_est()).
>
> Commit 92a801e5d5b7 ("sched/fair: Mask UTIL_AVG_UNCHANGED usages")
> mentions that the LSB is lost for util_est resolution but
> find_energy_efficient_cpu() checks if task_util_est() returns 0 to
> return prev_cpu early.
>
> _task_util_est() returns the max value of util_est.ewma and
> util_est.enqueued or'ed w/ UTIL_AVG_UNCHANGED.
> So task_util_est() returning the max of task_util() and
> _task_util_est() will never return 0 under the default
> SCHED_FEAT(UTIL_EST, true).
>
> To fix this use the MSB of util_est.enqueued instead and keep the flag
> util_est internal, i.e. don't export it via _task_util_est().
>
> The maximal possible util_avg value for a task is 1024 so the MSB of
> 'unsigned int util_est.enqueued' isn't used to store a util value.
>
> As a caveat, the code behind the util_est_se trace point has to filter
> out UTIL_AVG_UNCHANGED to see the real util_est.enqueued value, which
> should be easy to do.
>
> This also fixes an issue reported by Xuewen Yan that util_est_update()
> only used UTIL_AVG_UNCHANGED for the subtrahend of the equation:
>
>   last_enqueued_diff = ue.enqueued - (task_util() | UTIL_AVG_UNCHANGED)
>
> Fixes: b89997aa88f0b ("sched/pelt: Fix task util_est update filtering")
> Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>

Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>

> ---
>
> v1->v2:
>
> 1) Move UTIL_AVG_UNCHANGED definition into struct util_est.
> 2) Hide UTIL_AVG_UNCHANGED in proc_sched_show_task().
>
>  include/linux/sched.h |  8 ++++++++
>  kernel/sched/debug.c  |  3 ++-
>  kernel/sched/fair.c   |  5 +++--
>  kernel/sched/pelt.h   | 11 +----------
>  4 files changed, 14 insertions(+), 13 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index c7e7d50e2fdc..ac5a7d29fd4f 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -350,11 +350,19 @@ struct load_weight {
>   * Only for tasks we track a moving average of the past instantaneous
>   * estimated utilization. This allows to absorb sporadic drops in utilization
>   * of an otherwise almost periodic task.
> + *
> + * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
> + * updates. When a task is dequeued, its util_est should not be updated if its
> + * util_avg has not been updated in the meantime.
> + * This information is mapped into the MSB bit of util_est.enqueued at dequeue
> + * time. Since max value of util_est.enqueued for a task is 1024 (PELT util_avg
> + * for a task) it is safe to use MSB.
>   */
>  struct util_est {
>         unsigned int                    enqueued;
>         unsigned int                    ewma;
>  #define UTIL_EST_WEIGHT_SHIFT          2
> +#define UTIL_AVG_UNCHANGED             0x80000000
>  } __attribute__((__aligned__(sizeof(u64))));
>
>  /*
> diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
> index 3bdee5fd7d29..0c5ec2776ddf 100644
> --- a/kernel/sched/debug.c
> +++ b/kernel/sched/debug.c
> @@ -885,6 +885,7 @@ static const struct seq_operations sched_debug_sops = {
>  #define __PS(S, F) SEQ_printf(m, "%-45s:%21Ld\n", S, (long long)(F))
>  #define __P(F) __PS(#F, F)
>  #define   P(F) __PS(#F, p->F)
> +#define   PM(F, M) __PS(#F, p->F & (M))
>  #define __PSN(S, F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", S, SPLIT_NS((long long)(F)))
>  #define __PN(F) __PSN(#F, F)
>  #define   PN(F) __PSN(#F, p->F)
> @@ -1011,7 +1012,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
>         P(se.avg.util_avg);
>         P(se.avg.last_update_time);
>         P(se.avg.util_est.ewma);
> -       P(se.avg.util_est.enqueued);
> +       PM(se.avg.util_est.enqueued, ~UTIL_AVG_UNCHANGED);
>  #endif
>  #ifdef CONFIG_UCLAMP_TASK
>         __PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value);
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index a2c30e52de76..56747df3f796 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3856,7 +3856,7 @@ static inline unsigned long _task_util_est(struct task_struct *p)
>  {
>         struct util_est ue = READ_ONCE(p->se.avg.util_est);
>
> -       return (max(ue.ewma, ue.enqueued) | UTIL_AVG_UNCHANGED);
> +       return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
>  }
>
>  static inline unsigned long task_util_est(struct task_struct *p)
> @@ -3956,7 +3956,7 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
>          * Reset EWMA on utilization increases, the moving average is used only
>          * to smooth utilization decreases.
>          */
> -       ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
> +       ue.enqueued = task_util(p);
>         if (sched_feat(UTIL_EST_FASTUP)) {
>                 if (ue.ewma < ue.enqueued) {
>                         ue.ewma = ue.enqueued;
> @@ -4005,6 +4005,7 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
>         ue.ewma  += last_ewma_diff;
>         ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
>  done:
> +       ue.enqueued |= UTIL_AVG_UNCHANGED;
>         WRITE_ONCE(p->se.avg.util_est, ue);
>
>         trace_sched_util_est_se_tp(&p->se);
> diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
> index 9ed6d8c414ad..e06071bf3472 100644
> --- a/kernel/sched/pelt.h
> +++ b/kernel/sched/pelt.h
> @@ -42,15 +42,6 @@ static inline u32 get_pelt_divider(struct sched_avg *avg)
>         return LOAD_AVG_MAX - 1024 + avg->period_contrib;
>  }
>
> -/*
> - * When a task is dequeued, its estimated utilization should not be update if
> - * its util_avg has not been updated at least once.
> - * This flag is used to synchronize util_avg updates with util_est updates.
> - * We map this information into the LSB bit of the utilization saved at
> - * dequeue time (i.e. util_est.dequeued).
> - */
> -#define UTIL_AVG_UNCHANGED 0x1
> -
>  static inline void cfs_se_util_change(struct sched_avg *avg)
>  {
>         unsigned int enqueued;
> @@ -58,7 +49,7 @@ static inline void cfs_se_util_change(struct sched_avg *avg)
>         if (!sched_feat(UTIL_EST))
>                 return;
>
> -       /* Avoid store if the flag has been already set */
> +       /* Avoid store if the flag has been already reset */
>         enqueued = avg->util_est.enqueued;
>         if (!(enqueued & UTIL_AVG_UNCHANGED))
>                 return;
> --
> 2.25.1
>


* [tip: sched/urgent] sched/fair: Fix util_est UTIL_AVG_UNCHANGED handling
  2021-06-02 14:58 [PATCH v2] sched/fair: Fix util_est UTIL_AVG_UNCHANGED handling Dietmar Eggemann
                   ` (2 preceding siblings ...)
  2021-06-03 12:48 ` Vincent Guittot
@ 2021-06-04 13:38 ` tip-bot2 for Dietmar Eggemann
  3 siblings, 0 replies; 5+ messages in thread
From: tip-bot2 for Dietmar Eggemann @ 2021-06-04 13:38 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: Dietmar Eggemann, Peter Zijlstra (Intel),
	Xuewen Yan, Vincent Donnefort, Vincent Guittot, x86,
	linux-kernel

The following commit has been merged into the sched/urgent branch of tip:

Commit-ID:     68d7a190682aa4eb02db477328088ebad15acc83
Gitweb:        https://git.kernel.org/tip/68d7a190682aa4eb02db477328088ebad15acc83
Author:        Dietmar Eggemann <dietmar.eggemann@arm.com>
AuthorDate:    Wed, 02 Jun 2021 16:58:08 +02:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Thu, 03 Jun 2021 15:47:23 +02:00

sched/fair: Fix util_est UTIL_AVG_UNCHANGED handling

The util_est internal UTIL_AVG_UNCHANGED flag which is used to prevent
unnecessary util_est updates uses the LSB of util_est.enqueued. It is
exposed via _task_util_est() (and task_util_est()).

Commit 92a801e5d5b7 ("sched/fair: Mask UTIL_AVG_UNCHANGED usages")
mentions that the LSB is lost for util_est resolution but
find_energy_efficient_cpu() checks if task_util_est() returns 0 to
return prev_cpu early.

_task_util_est() returns the max value of util_est.ewma and
util_est.enqueued or'ed w/ UTIL_AVG_UNCHANGED.
So task_util_est() returning the max of task_util() and
_task_util_est() will never return 0 under the default
SCHED_FEAT(UTIL_EST, true).

To fix this use the MSB of util_est.enqueued instead and keep the flag
util_est internal, i.e. don't export it via _task_util_est().

The maximal possible util_avg value for a task is 1024 so the MSB of
'unsigned int util_est.enqueued' isn't used to store a util value.

As a caveat, the code behind the util_est_se trace point has to filter
out UTIL_AVG_UNCHANGED to see the real util_est.enqueued value, which
should be easy to do.

This also fixes an issue reported by Xuewen Yan that util_est_update()
only used UTIL_AVG_UNCHANGED for the subtrahend of the equation:

  last_enqueued_diff = ue.enqueued - (task_util() | UTIL_AVG_UNCHANGED)

Fixes: b89997aa88f0b ("sched/pelt: Fix task util_est update filtering")
Signed-off-by: Dietmar Eggemann <dietmar.eggemann@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Xuewen Yan <xuewen.yan@unisoc.com>
Reviewed-by: Vincent Donnefort <vincent.donnefort@arm.com>
Reviewed-by: Vincent Guittot <vincent.guittot@linaro.org>
Link: https://lore.kernel.org/r/20210602145808.1562603-1-dietmar.eggemann@arm.com
---
 include/linux/sched.h |  8 ++++++++
 kernel/sched/debug.c  |  3 ++-
 kernel/sched/fair.c   |  5 +++--
 kernel/sched/pelt.h   | 11 +----------
 4 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d2c8813..28a98fc 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -350,11 +350,19 @@ struct load_weight {
  * Only for tasks we track a moving average of the past instantaneous
  * estimated utilization. This allows to absorb sporadic drops in utilization
  * of an otherwise almost periodic task.
+ *
+ * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg
+ * updates. When a task is dequeued, its util_est should not be updated if its
+ * util_avg has not been updated in the meantime.
+ * This information is mapped into the MSB bit of util_est.enqueued at dequeue
+ * time. Since max value of util_est.enqueued for a task is 1024 (PELT util_avg
+ * for a task) it is safe to use MSB.
  */
 struct util_est {
 	unsigned int			enqueued;
 	unsigned int			ewma;
 #define UTIL_EST_WEIGHT_SHIFT		2
+#define UTIL_AVG_UNCHANGED		0x80000000
 } __attribute__((__aligned__(sizeof(u64))));
 
 /*
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 9c882f2..c5aacbd 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -885,6 +885,7 @@ static const struct seq_operations sched_debug_sops = {
 #define __PS(S, F) SEQ_printf(m, "%-45s:%21Ld\n", S, (long long)(F))
 #define __P(F) __PS(#F, F)
 #define   P(F) __PS(#F, p->F)
+#define   PM(F, M) __PS(#F, p->F & (M))
 #define __PSN(S, F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", S, SPLIT_NS((long long)(F)))
 #define __PN(F) __PSN(#F, F)
 #define   PN(F) __PSN(#F, p->F)
@@ -1011,7 +1012,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
 	P(se.avg.util_avg);
 	P(se.avg.last_update_time);
 	P(se.avg.util_est.ewma);
-	P(se.avg.util_est.enqueued);
+	PM(se.avg.util_est.enqueued, ~UTIL_AVG_UNCHANGED);
 #endif
 #ifdef CONFIG_UCLAMP_TASK
 	__PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7b98fb3..2c8a935 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3907,7 +3907,7 @@ static inline unsigned long _task_util_est(struct task_struct *p)
 {
 	struct util_est ue = READ_ONCE(p->se.avg.util_est);
 
-	return (max(ue.ewma, ue.enqueued) | UTIL_AVG_UNCHANGED);
+	return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
 }
 
 static inline unsigned long task_util_est(struct task_struct *p)
@@ -4007,7 +4007,7 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
 	 * Reset EWMA on utilization increases, the moving average is used only
 	 * to smooth utilization decreases.
 	 */
-	ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
+	ue.enqueued = task_util(p);
 	if (sched_feat(UTIL_EST_FASTUP)) {
 		if (ue.ewma < ue.enqueued) {
 			ue.ewma = ue.enqueued;
@@ -4056,6 +4056,7 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
 	ue.ewma  += last_ewma_diff;
 	ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
 done:
+	ue.enqueued |= UTIL_AVG_UNCHANGED;
 	WRITE_ONCE(p->se.avg.util_est, ue);
 
 	trace_sched_util_est_se_tp(&p->se);
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 1462846..cfe94ff 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -42,15 +42,6 @@ static inline u32 get_pelt_divider(struct sched_avg *avg)
 	return LOAD_AVG_MAX - 1024 + avg->period_contrib;
 }
 
-/*
- * When a task is dequeued, its estimated utilization should not be update if
- * its util_avg has not been updated at least once.
- * This flag is used to synchronize util_avg updates with util_est updates.
- * We map this information into the LSB bit of the utilization saved at
- * dequeue time (i.e. util_est.dequeued).
- */
-#define UTIL_AVG_UNCHANGED 0x1
-
 static inline void cfs_se_util_change(struct sched_avg *avg)
 {
 	unsigned int enqueued;
@@ -58,7 +49,7 @@ static inline void cfs_se_util_change(struct sched_avg *avg)
 	if (!sched_feat(UTIL_EST))
 		return;
 
-	/* Avoid store if the flag has been already set */
+	/* Avoid store if the flag has been already reset */
 	enqueued = avg->util_est.enqueued;
 	if (!(enqueued & UTIL_AVG_UNCHANGED))
 		return;

