From: Wanpeng Li <kernellwp@gmail.com>
To: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>,
	Ingo Molnar <mingo@kernel.org>,
	"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
	Yuyang Du <yuyang.du@intel.com>,
	Morten Rasmussen <Morten.Rasmussen@arm.com>,
	"linaro-kernel@lists.linaro.org" <linaro-kernel@lists.linaro.org>,
	Dietmar Eggemann <dietmar.eggemann@arm.com>,
	Paul Turner <pjt@google.com>,
	Benjamin Segall <bsegall@google.com>
Subject: Re: [PATCH 4/7 v3] sched: propagate load during synchronous attach/detach
Date: Mon, 19 Sep 2016 11:19:17 +0800	[thread overview]
Message-ID: <CANRm+Cxc+xmGksgo4mB+ygNqEAXCPib3jFsP+s6+JpupCjmtkA@mail.gmail.com> (raw)
In-Reply-To: <1473666472-13749-5-git-send-email-vincent.guittot@linaro.org>

2016-09-12 15:47 GMT+08:00 Vincent Guittot <vincent.guittot@linaro.org>:
> When a task moves from/to a cfs_rq, we set a flag which is then used to
> propagate the change at parent level (sched_entity and cfs_rq) during
> next update. If the cfs_rq is throttled, the flag will stay pending until
> the cfs_rq is unthrottled.
>
> For propagating the utilization, we copy the utilization of child cfs_rq to
> the sched_entity.
>
> For propagating the load, we have to take into account the load of the
> whole task group in order to evaluate the load of the sched_entity.
> Similarly to what was done before the rewrite of PELT, we add a correction
> factor in case the task group's load is less than its share so it will
> contribute the same load as a task of equal weight.
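
A quick sanity check of that correction term with made-up numbers (none
of these come from the patch): with tg->shares scaled down to 1024 and
the group running only on this cfs_rq (child load_avg = tg->load_avg =
tg_load_avg_contrib = 300), tg_load works out to 301, the first scaling
gives 300 * 1024 / 301 ~= 1020, and the correction brings it back to
1020 * 301 / 1024 ~= 299, so the group entity indeed contributes about
the same load as a task of equal weight.
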
>
> Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
> ---
>  kernel/sched/fair.c  | 170 ++++++++++++++++++++++++++++++++++++++++++++++++++-
>  kernel/sched/sched.h |   1 +
>  2 files changed, 170 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 0aa1d7d..e4015f6 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3017,6 +3017,132 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
>         }
>  }
>
> +#ifdef CONFIG_FAIR_GROUP_SCHED
> +/* Take into account change of utilization of a child task group */
> +static inline void
> +update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
> +{
> +       struct cfs_rq *gcfs_rq =  group_cfs_rq(se);
> +       long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
> +
> +       /* Nothing to update */
> +       if (!delta)
> +               return;
> +
> +       /* Set new sched_entity's utilizaton */

s/utilizaton/utilization

> +       se->avg.util_avg = gcfs_rq->avg.util_avg;
> +       se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
> +
> +       /* Update parent cfs_rq utilization */
> +       cfs_rq->avg.util_avg =  max_t(long, cfs_rq->avg.util_avg + delta, 0);
> +       cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
> +}
> +
> +/* Take into account change of load of a child task group */
> +static inline void
> +update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
> +{
> +       struct cfs_rq *gcfs_rq = group_cfs_rq(se);
> +       long delta, load = gcfs_rq->avg.load_avg;
> +
> +       /* If the load of group cfs_rq is null, the load of the
> +        * sched_entity will also be null so we can skip the formula
> +        */
> +       if (load) {
> +               long tg_load;
> +
> +               /* Get tg's load and ensure tg_load > 0 */
> +               tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
> +
> +               /* Ensure tg_load >= load and updated with current load*/
> +               tg_load -= gcfs_rq->tg_load_avg_contrib;
> +               tg_load += load;
> +
> +               /* scale gcfs_rq's load into tg's shares*/
> +               load *= scale_load_down(gcfs_rq->tg->shares);
> +               load /= tg_load;
> +
> +               /*
> +                * we need to compute a correction term in the case that the
> +                * task group is consuming <1 cpu so that we would contribute
> +                * the same load as a task of equal weight.
> +               */
> +               if (tg_load < scale_load_down(gcfs_rq->tg->shares)) {
> +                       load *= tg_load;
> +                       load /= scale_load_down(gcfs_rq->tg->shares);
> +               }
> +       }
> +
> +       delta = load - se->avg.load_avg;
> +
> +       /* Nothing to update */
> +       if (!delta)
> +               return;
> +
> +       /* Set new sched_entity's load */
> +       se->avg.load_avg = load;
> +       se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
> +
> +       /* Update parent cfs_rq load */
> +       cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg + delta, 0);
> +       cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
> +}
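
Just to double-check the scaling above, I traced it with a small
standalone userspace sketch; scale_group_load() and the fixed 1024
shares value are my own stand-ins for illustration, not anything the
patch adds:

    #include <stdio.h>

    /* made-up stand-in for scale_load_down(tg->shares) */
    #define TG_SHARES 1024L

    /*
     * Same arithmetic as update_tg_cfs_load(): scale the child cfs_rq's
     * load_avg into the group's shares, with the <1 CPU correction.
     */
    static long scale_group_load(long gcfs_load, long tg_load_avg,
                                 long tg_contrib)
    {
        long load = gcfs_load;
        long tg_load;

        if (!load)
            return 0;

        tg_load = tg_load_avg + 1;   /* keep tg_load > 0 */
        tg_load -= tg_contrib;       /* drop the stale contribution ... */
        tg_load += load;             /* ... and add the current child load */

        load *= TG_SHARES;
        load /= tg_load;

        /* group consumes less than one CPU: don't inflate its load */
        if (tg_load < TG_SHARES) {
            load *= tg_load;
            load /= TG_SHARES;
        }
        return load;
    }

    int main(void)
    {
        /* group runs only on this cfs_rq: se gets ~the child load (511) */
        printf("%ld\n", scale_group_load(512, 512, 512));
        /* group spread over 4 CPUs at 512 each: ~shares / 4 (255) */
        printf("%ld\n", scale_group_load(512, 2048, 512));
        return 0;
    }

So a group using less than one CPU contributes about the same load as a
task of equal weight, while a busier group has its shares split across
the per-CPU group entities, which matches the changelog.
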
> +
> +static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
> +{
> +       /* set cfs_rq's flag */
> +       cfs_rq->propagate_avg = 1;
> +}
> +
> +static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
> +{
> +       /* Get my cfs_rq */
> +       struct cfs_rq *cfs_rq = group_cfs_rq(se);
> +
> +       /* Nothing to propagate */
> +       if (!cfs_rq->propagate_avg)
> +               return 0;
> +
> +       /* Clear my cfs_rq's flag */
> +       cfs_rq->propagate_avg = 0;
> +
> +       return 1;
> +}
> +
> +/* Update task and its cfs_rq load average */
> +static inline int propagate_entity_load_avg(struct sched_entity *se)
> +{
> +       struct cfs_rq *cfs_rq;
> +
> +       if (entity_is_task(se))
> +               return 0;
> +
> +       if (!test_and_clear_tg_cfs_propagate(se))
> +               return 0;
> +
> +       /* Get parent cfs_rq */
> +       cfs_rq = cfs_rq_of(se);
> +
> +       /* Propagate to parent */
> +       set_tg_cfs_propagate(cfs_rq);
> +
> +       /* Update utilization */
> +       update_tg_cfs_util(cfs_rq, se);
> +
> +       /* Update load */
> +       update_tg_cfs_load(cfs_rq, se);
> +
> +       return 1;
> +}
> +#else
> +static inline int propagate_entity_load_avg(struct sched_entity *se)
> +{
> +       return 0;
> +}
> +
> +static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
> +#endif
> +
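
The way propagate_avg ripples one level per update took me a moment to
see, so here is a toy userspace model of it; the array and the helper
name are mine, purely to illustrate the mechanism, not kernel code:

    #include <stdio.h>
    #include <stdbool.h>

    #define LEVELS 3   /* leaf group -> parent group -> root cfs_rq */

    static bool propagate_avg[LEVELS];

    /*
     * Rough model of propagate_entity_load_avg() for a group entity at
     * level i: test-and-clear the child cfs_rq's flag, then set the flag
     * on the parent cfs_rq one level up.
     */
    static void update_level(int i)
    {
        if (!propagate_avg[i])
            return;
        propagate_avg[i] = false;
        if (i + 1 < LEVELS) {
            propagate_avg[i + 1] = true;
            printf("level %d propagated to level %d\n", i, i + 1);
        } else {
            printf("level %d reached the root\n", i);
        }
    }

    int main(void)
    {
        propagate_avg[0] = true;    /* attach/detach on the leaf cfs_rq */

        /* the for_each_sched_entity() walk in attach/detach below */
        for (int i = 0; i < LEVELS; i++)
            update_level(i);
        return 0;
    }

If an intermediate cfs_rq is throttled, the walk further down breaks
there and the pending flag is simply left for a later update_load_avg()
once the group is unthrottled, which is the "stay pending" behaviour
described in the changelog.
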
>  /*
>   * Unsigned subtract and clamp on underflow.
>   *
> @@ -3093,6 +3219,7 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg,
>         u64 now = cfs_rq_clock_task(cfs_rq);
>         struct rq *rq = rq_of(cfs_rq);
>         int cpu = cpu_of(rq);
> +       int decayed;
>
>         /*
>          * Track task load average for carrying it to new CPU after migrated, and
> @@ -3103,7 +3230,11 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg,
>                           se->on_rq * scale_load_down(se->load.weight),
>                           cfs_rq->curr == se, NULL);
>
> -       if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg)
> +       decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
> +
> +       decayed |= propagate_entity_load_avg(se);
> +
> +       if (decayed && update_tg)
>                 update_tg_load_avg(cfs_rq, 0);
>  }
>
> @@ -3122,6 +3253,7 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
>         cfs_rq->avg.load_sum += se->avg.load_sum;
>         cfs_rq->avg.util_avg += se->avg.util_avg;
>         cfs_rq->avg.util_sum += se->avg.util_sum;
> +       set_tg_cfs_propagate(cfs_rq);
>
>         cfs_rq_util_change(cfs_rq);
>  }
> @@ -3141,6 +3273,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
>         sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
>         sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
>         sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
> +       set_tg_cfs_propagate(cfs_rq);
>
>         cfs_rq_util_change(cfs_rq);
>  }
> @@ -8499,6 +8632,22 @@ static void detach_task_cfs_rq(struct task_struct *p)
>         update_load_avg(se, 0, 0);
>         detach_entity_load_avg(cfs_rq, se);
>         update_tg_load_avg(cfs_rq, false);
> +
> +#ifdef CONFIG_FAIR_GROUP_SCHED
> +       /*
> +        * Propagate the detach across the tg tree to make it visible to the
> +        * root
> +        */
> +       se = se->parent;
> +       for_each_sched_entity(se) {
> +               cfs_rq = cfs_rq_of(se);
> +
> +               if (cfs_rq_throttled(cfs_rq))
> +                       break;
> +
> +               update_load_avg(se, 1, 0);
> +       }
> +#endif
>  }
>
>  static void attach_entity_cfs_rq(struct sched_entity *se)
> @@ -8517,6 +8666,22 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
>         update_load_avg(se, 0, !sched_feat(ATTACH_AGE_LOAD));
>         attach_entity_load_avg(cfs_rq, se);
>         update_tg_load_avg(cfs_rq, false);
> +
> +#ifdef CONFIG_FAIR_GROUP_SCHED
> +       /*
> +        * Propagate the attach across the tg tree to make it visible to the
> +        * root
> +        */
> +       se = se->parent;
> +       for_each_sched_entity(se) {
> +               cfs_rq = cfs_rq_of(se);
> +
> +               if (cfs_rq_throttled(cfs_rq))
> +                       break;
> +
> +               update_load_avg(se, 1, 0);
> +       }
> +#endif
>  }
>
>  static void attach_task_cfs_rq(struct task_struct *p)
> @@ -8578,6 +8743,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
>         cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
>  #endif
>  #ifdef CONFIG_SMP
> +#ifdef CONFIG_FAIR_GROUP_SCHED
> +       cfs_rq->propagate_avg = 0;
> +#endif
>         atomic_long_set(&cfs_rq->removed_load_avg, 0);
>         atomic_long_set(&cfs_rq->removed_util_avg, 0);
>  #endif
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 483616a..0517a9e 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -397,6 +397,7 @@ struct cfs_rq {
>         unsigned long runnable_load_avg;
>  #ifdef CONFIG_FAIR_GROUP_SCHED
>         unsigned long tg_load_avg_contrib;
> +       unsigned long propagate_avg;
>  #endif
>         atomic_long_t removed_load_avg, removed_util_avg;
>  #ifndef CONFIG_64BIT
> --
> 1.9.1
>


Thread overview: 41+ messages
2016-09-12  7:47 [PATCH 0/7 v3] sched: reflect sched_entity move into task_group's load Vincent Guittot
2016-09-12  7:47 ` [PATCH 1/7 v3] sched: factorize attach entity Vincent Guittot
2016-09-12  7:47 ` [PATCH 2/7 v3] sched: fix hierarchical order in rq->leaf_cfs_rq_list Vincent Guittot
2016-09-21 10:14   ` Dietmar Eggemann
2016-09-21 12:34     ` Vincent Guittot
2016-09-21 17:25       ` Dietmar Eggemann
2016-09-21 18:02         ` Vincent Guittot
2016-09-12  7:47 ` [PATCH 3/7 v3] sched: factorize PELT update Vincent Guittot
2016-09-15 13:09   ` Peter Zijlstra
2016-09-15 13:30     ` Vincent Guittot
2016-09-12  7:47 ` [PATCH 4/7 v3] sched: propagate load during synchronous attach/detach Vincent Guittot
2016-09-15 12:55   ` Peter Zijlstra
2016-09-15 13:01     ` Vincent Guittot
2016-09-15 12:59   ` Peter Zijlstra
2016-09-15 13:11     ` Vincent Guittot
2016-09-15 13:11   ` Dietmar Eggemann
2016-09-15 14:31     ` Vincent Guittot
2016-09-15 17:20       ` Dietmar Eggemann
2016-09-15 15:14     ` Peter Zijlstra
2016-09-15 17:36       ` Dietmar Eggemann
2016-09-15 17:54         ` Peter Zijlstra
2016-09-15 14:43   ` Peter Zijlstra
2016-09-15 14:51     ` Vincent Guittot
2016-09-19  3:19   ` Wanpeng Li [this message]
2016-09-12  7:47 ` [PATCH 5/7 v3] sched: propagate asynchrous detach Vincent Guittot
2016-09-12  7:47 ` [PATCH 6/7 v3] sched: fix task group initialization Vincent Guittot
2016-09-12  7:47 ` [PATCH 7/7 v3] sched: fix wrong utilization accounting when switching to fair class Vincent Guittot
2016-09-15 13:18   ` Peter Zijlstra
2016-09-15 15:36     ` Vincent Guittot
2016-09-16 12:16       ` Peter Zijlstra
2016-09-16 14:23         ` Vincent Guittot
2016-09-20 11:54           ` Peter Zijlstra
2016-09-20 13:06             ` Vincent Guittot
2016-09-22 12:25               ` Peter Zijlstra
2016-09-26 14:53                 ` Peter Zijlstra
2016-09-20 16:59             ` bsegall
2016-09-22  8:33               ` Peter Zijlstra
2016-09-22 17:10                 ` bsegall
2016-09-16 10:51   ` Peter Zijlstra
2016-09-16 12:45     ` Vincent Guittot
2016-09-30 12:01   ` [tip:sched/core] sched/core: Fix incorrect " tip-bot for Vincent Guittot
