* [PATCH v2] sched: fix group_entity's share update
@ 2016-12-21 15:50 Vincent Guittot
  2017-01-04 17:20 ` Dietmar Eggemann
  2017-01-14 12:48 ` [tip:sched/core] sched/core: Fix " tip-bot for Vincent Guittot
  0 siblings, 2 replies; 4+ messages in thread
From: Vincent Guittot @ 2016-12-21 15:50 UTC (permalink / raw)
  To: peterz, mingo, linux-kernel; +Cc: stable, pjt, Vincent Guittot

The update of the share of a cfs_rq is done when its load_avg is updated
but before the group_entity's load_avg has been updated for the past time
slot. This generates wrong load_avg accounting which can be significant
when small tasks are involved in the scheduling.

Let's take the example of a task "a" that is dequeued from its task group A:
   root
  (cfs_rq)
    \
    (se)
     A
    (cfs_rq)
      \
      (se)
       a

Task "a" was the only task in task group A, so task group A becomes idle
when "a" is dequeued.

We have the sequence:

- dequeue_entity a->se
    - update_load_avg(a->se)
    - dequeue_entity_load_avg(A->cfs_rq, a->se)
    - update_cfs_shares(A->cfs_rq)
	A->cfs_rq->load.weight == 0
        A->se->load.weight is updated with the new share (0 in this case)
- dequeue_entity A->se
    - update_load_avg(A->se) but its weight is now null so the last time
      slot (up to a tick) will be accounted with a weight of 0 instead of
      its real weight during the time slot. The last time slot will be
      accounted as an idle one whereas it was a running one.

If the running time of task a is short enough that no tick happens when it
runs, all running time of group entity A->se will be accounted as idle
time.
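
To illustrate, here is a minimal sketch of the PELT accrual that goes
wrong (paraphrased from the v4.9-era __update_load_avg(), not the
verbatim kernel code; "contrib" stands for the decayed contribution of
the elapsed delta):

    /*
     * The past time slot is accrued with the entity's *current* weight:
     *   weight = se->on_rq * scale_load_down(se->load.weight)
     */
    if (weight)
        sa->load_sum += weight * contrib;
    /*
     * If update_cfs_shares() has already dropped se->load.weight to its
     * new (null) share, weight == 0: the slot only decays load_sum and
     * is therefore accounted as idle time.
     */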

Instead, we should update the share of a cfs_rq (in fact the weight of its
group entity) only after having updated the load_avg of the group_entity.

update_cfs_shares() now takes the sched_entity as a parameter instead of the
cfs_rq, and the weight of the group_entity is updated only once its load_avg
has been synced with current time.
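
With this change, the sequence of the example above becomes:

- dequeue_entity a->se
    - update_load_avg(a->se)
    - dequeue_entity_load_avg(A->cfs_rq, a->se)
    - update_cfs_shares(a->se)
        a task has no group cfs_rq (group_cfs_rq() returns NULL) so this
        is a no-op
- dequeue_entity A->se
    - update_load_avg(A->se)
        the last time slot is accounted with A->se's old weight
    - update_cfs_shares(A->se)
        only now is A->se's weight updated to the new share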

Cc: <stable@vger.kernel.org> #v4.4+
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
---
 kernel/sched/fair.c | 53 +++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 39 insertions(+), 14 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6559d19..b998287 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2689,16 +2689,20 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 
 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
 
-static void update_cfs_shares(struct cfs_rq *cfs_rq)
+static void update_cfs_shares(struct sched_entity *se)
 {
-	struct task_group *tg;
-	struct sched_entity *se;
 	long shares;
+	struct task_group *tg;
+	struct cfs_rq *cfs_rq = group_cfs_rq(se);
 
-	tg = cfs_rq->tg;
-	se = tg->se[cpu_of(rq_of(cfs_rq))];
-	if (!se || throttled_hierarchy(cfs_rq))
+	if (!cfs_rq)
 		return;
+
+	if (throttled_hierarchy(cfs_rq))
+		return;
+
+	tg = cfs_rq->tg;
+
 #ifndef CONFIG_SMP
 	if (likely(se->load.weight == tg->shares))
 		return;
@@ -2707,8 +2711,10 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
 
 	reweight_entity(cfs_rq_of(se), se, shares);
 }
+
+
 #else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
+static inline void update_cfs_shares(struct sched_entity *se)
 {
 }
 #endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -3582,10 +3588,18 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	if (renorm && !curr)
 		se->vruntime += cfs_rq->min_vruntime;
 
+	/*
+	 * When enqueuing a sched_entity, we must:
+	 *   - Update loads to have both entity and cfs_rq synced with now.
+	 *   - Add its load to cfs_rq->runnable_avg
+	 *   - For group_entity, update its weight to reflect the new share of
+	 *     its group cfs_rq
+	 *   - Add its new weight to cfs_rq->load.weight
+	 */
 	update_load_avg(se, UPDATE_TG);
 	enqueue_entity_load_avg(cfs_rq, se);
+	update_cfs_shares(se);
 	account_entity_enqueue(cfs_rq, se);
-	update_cfs_shares(cfs_rq);
 
 	if (flags & ENQUEUE_WAKEUP)
 		place_entity(cfs_rq, se, 0);
@@ -3657,6 +3671,15 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
+
+	/*
+	 * When dequeuing a sched_entity, we must:
+	 *   - Update loads to have both entity and cfs_rq synced with now.
+	 *   - Subtract its load from the cfs_rq->runnable_avg.
+	 *   - Subtract its previous weight from cfs_rq->load.weight.
+	 *   - For group entity, update its weight to reflect the new share
+	 *     of its group cfs_rq.
+	 */
 	update_load_avg(se, UPDATE_TG);
 	dequeue_entity_load_avg(cfs_rq, se);
 
@@ -3681,7 +3704,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	/* return excess runtime on last dequeue */
 	return_cfs_rq_runtime(cfs_rq);
 
-	update_cfs_shares(cfs_rq);
+	update_cfs_shares(se);
 
 	/*
 	 * Now advance min_vruntime if @se was the entity holding it back,
@@ -3864,7 +3887,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	 * Ensure that runnable average is periodically updated.
 	 */
 	update_load_avg(curr, UPDATE_TG);
-	update_cfs_shares(cfs_rq);
+	update_cfs_shares(curr);
 
 #ifdef CONFIG_SCHED_HRTICK
 	/*
@@ -4761,7 +4784,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 			break;
 
 		update_load_avg(se, UPDATE_TG);
-		update_cfs_shares(cfs_rq);
+		update_cfs_shares(se);
 	}
 
 	if (!se)
@@ -4820,7 +4843,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 			break;
 
 		update_load_avg(se, UPDATE_TG);
-		update_cfs_shares(cfs_rq);
+		update_cfs_shares(se);
 	}
 
 	if (!se)
@@ -9356,8 +9379,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 
 		/* Possible calls to update_curr() need rq clock */
 		update_rq_clock(rq);
-		for_each_sched_entity(se)
-			update_cfs_shares(group_cfs_rq(se));
+		for_each_sched_entity(se) {
+			update_load_avg(se, UPDATE_TG);
+			update_cfs_shares(se);
+		}
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
 	}
 
-- 
2.7.4


* Re: [PATCH v2] sched: fix group_entity's share update
  2016-12-21 15:50 [PATCH v2] sched: fix group_entity's share update Vincent Guittot
@ 2017-01-04 17:20 ` Dietmar Eggemann
  2017-01-04 17:48   ` Vincent Guittot
  2017-01-14 12:48 ` [tip:sched/core] sched/core: Fix " tip-bot for Vincent Guittot
  1 sibling, 1 reply; 4+ messages in thread
From: Dietmar Eggemann @ 2017-01-04 17:20 UTC (permalink / raw)
  To: Vincent Guittot, peterz, mingo, linux-kernel; +Cc: stable, pjt

On 21/12/16 15:50, Vincent Guittot wrote:

IMHO, the overall idea makes sense to me. Just a couple of small
questions ...

> The update of the share of a cfs_rq is done when its load_avg is updated
> but before the group_entity's load_avg has been updated for the past time
> slot. This generates wrong load_avg accounting which can be significant
> when small tasks are involved in the scheduling.

Why for small tasks? Is it because we use load =
scale_load_down(cfs_rq->load.weight) and not load = cfs_rq->avg.load_avg
in calc_cfs_shares()?

> Let's take the example of a task "a" that is dequeued from its task group A:
>    root
>   (cfs_rq)
>     \
>     (se)
>      A
>     (cfs_rq)
>       \
>       (se)
>        a
> 
> Task "a" was the only task in task group A, so task group A becomes idle
> when "a" is dequeued.
> 
> We have the sequence:
> 
> - dequeue_entity a->se
>     - update_load_avg(a->se)
>     - dequeue_entity_load_avg(A->cfs_rq, a->se)
>     - update_cfs_shares(A->cfs_rq)
> 	A->cfs_rq->load.weight == 0

You mean A->cfs_rq->load.weight = 0 ?

>         A->se->load.weight is updated with the new share (0 in this case)

Shouldn't this be MIN_SHARES (2) instead of 0?

> - dequeue_entity A->se
>     - update_load_avg(A->se) but its weight is now null so the last time
>       slot (up to a tick) will be accounted with a weight of 0 instead of
>       its real weight during the time slot. The last time slot will be
>       accounted as an idle one whereas it was a running one.
> 
> If the running time of task a is short enough that no tick happens when it
> runs, all running time of group entity A->se will be accounted as idle
> time.
> 
> Instead, we should update the share of a cfs_rq (in fact the weight of its
> group entity) only after having updated the load_avg of the group_entity.

This is because we use 'se->on_rq * scale_load_down(se->load.weight)' in
__update_load_avg() as weight parameter for PELT load_avg update?

> update_cfs_shares() now takes the sched_entity as a parameter instead of the
> cfs_rq, and the weight of the group_entity is updated only once its load_avg
> has been synced with current time.

[...]


* Re: [PATCH v2] sched: fix group_entity's share update
  2017-01-04 17:20 ` Dietmar Eggemann
@ 2017-01-04 17:48   ` Vincent Guittot
  0 siblings, 0 replies; 4+ messages in thread
From: Vincent Guittot @ 2017-01-04 17:48 UTC (permalink / raw)
  To: Dietmar Eggemann
  Cc: Peter Zijlstra, Ingo Molnar, linux-kernel, stable, Paul Turner

On 4 January 2017 at 18:20, Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:
> On 21/12/16 15:50, Vincent Guittot wrote:
>
> IMHO, the overall idea makes sense to me. Just a couple of small
> questions ...
>
>> The update of the share of a cfs_rq is done when its load_avg is updated
>> but before the group_entity's load_avg has been updated for the past time
>> slot. This generates wrong load_avg accounting which can be significant
>> when small tasks are involved in the scheduling.
>
> Why for small tasks? Is it because we use load =
> scale_load_down(cfs_rq->load.weight) and not load = cfs_rq->avg.load_avg
> in calc_cfs_shares()?

Small tasks are significantly impacted because the tick has less chance
of firing while the task is running, and as a result there is less
chance of updating the group entity's load_avg for the past tick with
the correct weight, which would minimize the impact of this bug.
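
(For context, the calc_cfs_shares() mentioned above looks roughly like
this in v4.9 — a simplified paraphrase, not the verbatim kernel code:)

    static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
    {
        long tg_weight, load, shares;

        /* The instantaneous weight, not cfs_rq->avg.load_avg: */
        load = scale_load_down(cfs_rq->load.weight);
        tg_weight = atomic_long_read(&tg->load_avg);

        shares = tg->shares * load;
        if (tg_weight)
            shares /= tg_weight;

        if (shares < MIN_SHARES)
            shares = MIN_SHARES;
        if (shares > tg->shares)
            shares = tg->shares;

        return shares;
    }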

>
>> Let's take the example of a task "a" that is dequeued from its task group A:
>>    root
>>   (cfs_rq)
>>     \
>>     (se)
>>      A
>>     (cfs_rq)
>>       \
>>       (se)
>>        a
>>
>> Task "a" was the only task in task group A, so task group A becomes idle
>> when "a" is dequeued.
>>
>> We have the sequence:
>>
>> - dequeue_entity a->se
>>     - update_load_avg(a->se)
>>     - dequeue_entity_load_avg(A->cfs_rq, a->se)
>>     - update_cfs_shares(A->cfs_rq)
>>       A->cfs_rq->load.weight == 0
>
> You mean A->cfs_rq->load.weight = 0 ?

No, A->cfs_rq->load.weight == 0; it is just to point out that
A->cfs_rq->load.weight equals 0 at that time. A->cfs_rq->load.weight has
been set to 0 in account_entity_dequeue(), which is not described here
and happens between dequeue_entity_load_avg() and update_cfs_shares().

>
>>         A->se->load.weight is updated with the new share (0 in this case)
>
> Shouldn't this be MIN_SHARES (2) instead of 0?

Yes, you're right.
I had in mind scale_load_down(se->load.weight), which is used when
calling __update_load_avg() in update_load_avg() and which is 0 on the
64-bit platform that I used.
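
(Sketch of the arithmetic, using the v4.9-era defines and assuming
CONFIG_64BIT:)

    #define SCHED_FIXEDPOINT_SHIFT  10
    #define scale_load_down(w)      ((w) >> SCHED_FIXEDPOINT_SHIFT)

    /* calc_cfs_shares() clamps the share at MIN_SHARES == 2, but
     * scale_load_down(2) == 2 >> 10 == 0, so PELT sees a zero weight. */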

>
>> - dequeue_entity A->se
>>     - update_load_avg(A->se) but its weight is now null so the last time
>>       slot (up to a tick) will be accounted with a weight of 0 instead of
>>       its real weight during the time slot. The last time slot will be
>>       accounted as an idle one whereas it was a running one.
>>
>> If the running time of task a is short enough that no tick happens when it
>> runs, all running time of group entity A->se will be accounted as idle
>> time.
>>
>> Instead, we should update the share of a cfs_rq (in fact the weight of its
>> group entity) only after having updated the load_avg of the group_entity.
>
> This is because we use 'se->on_rq * scale_load_down(se->load.weight)' in
> __update_load_avg() as weight parameter for PELT load_avg update?

yes
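
(For reference, the call site in update_load_avg(), roughly as in v4.9;
paraphrased, not verbatim:)

    __update_load_avg(now, cpu_of(rq_of(cfs_rq)), &se->avg,
                      se->on_rq * scale_load_down(se->load.weight),
                      cfs_rq->curr == se, NULL);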

>
>> update_cfs_shares() now takes the sched_entity as a parameter instead of the
>> cfs_rq, and the weight of the group_entity is updated only once its load_avg
>> has been synced with current time.
>
> [...]
>


* [tip:sched/core] sched/core: Fix group_entity's share update
  2016-12-21 15:50 [PATCH v2] sched: fix group_entity's share update Vincent Guittot
  2017-01-04 17:20 ` Dietmar Eggemann
@ 2017-01-14 12:48 ` tip-bot for Vincent Guittot
  1 sibling, 0 replies; 4+ messages in thread
From: tip-bot for Vincent Guittot @ 2017-01-14 12:48 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: tglx, mingo, peterz, torvalds, efault, vincent.guittot,
	linux-kernel, hpa

Commit-ID:  89ee048f3cc796db6f26906c6bef4edf0bee70fd
Gitweb:     http://git.kernel.org/tip/89ee048f3cc796db6f26906c6bef4edf0bee70fd
Author:     Vincent Guittot <vincent.guittot@linaro.org>
AuthorDate: Wed, 21 Dec 2016 16:50:26 +0100
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Sat, 14 Jan 2017 11:30:02 +0100

sched/core: Fix group_entity's share update

The update of the share of a cfs_rq is done when its load_avg is updated
but before the group_entity's load_avg has been updated for the past time
slot. This generates wrong load_avg accounting which can be significant
when small tasks are involved in the scheduling.

Let's take the example of a task "a" that is dequeued from its task group A:
   root
  (cfs_rq)
    \
    (se)
     A
    (cfs_rq)
      \
      (se)
       a

Task "a" was the only task in task group A, so task group A becomes idle
when "a" is dequeued.

We have the sequence:

- dequeue_entity a->se
    - update_load_avg(a->se)
    - dequeue_entity_load_avg(A->cfs_rq, a->se)
    - update_cfs_shares(A->cfs_rq)
	A->cfs_rq->load.weight == 0
        A->se->load.weight is updated with the new share (0 in this case)
- dequeue_entity A->se
    - update_load_avg(A->se) but its weight is now null so the last time
      slot (up to a tick) will be accounted with a weight of 0 instead of
      its real weight during the time slot. The last time slot will be
      accounted as an idle one whereas it was a running one.

If the running time of task a is short enough that no tick happens when it
runs, all running time of group entity A->se will be accounted as idle
time.

Instead, we should update the share of a cfs_rq (in fact the weight of its
group entity) only after having updated the load_avg of the group_entity.

update_cfs_shares() now takes the sched_entity as a parameter instead of the
cfs_rq, and the weight of the group_entity is updated only once its load_avg
has been synced with current time.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: pjt@google.com
Link: http://lkml.kernel.org/r/1482335426-7664-1-git-send-email-vincent.guittot@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 50 +++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 37 insertions(+), 13 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b3bfe3f..2b866a2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2689,16 +2689,20 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 
 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
 
-static void update_cfs_shares(struct cfs_rq *cfs_rq)
+static void update_cfs_shares(struct sched_entity *se)
 {
+	struct cfs_rq *cfs_rq = group_cfs_rq(se);
 	struct task_group *tg;
-	struct sched_entity *se;
 	long shares;
 
-	tg = cfs_rq->tg;
-	se = tg->se[cpu_of(rq_of(cfs_rq))];
-	if (!se || throttled_hierarchy(cfs_rq))
+	if (!cfs_rq)
+		return;
+
+	if (throttled_hierarchy(cfs_rq))
 		return;
+
+	tg = cfs_rq->tg;
+
 #ifndef CONFIG_SMP
 	if (likely(se->load.weight == tg->shares))
 		return;
@@ -2707,8 +2711,9 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
 
 	reweight_entity(cfs_rq_of(se), se, shares);
 }
+
 #else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
+static inline void update_cfs_shares(struct sched_entity *se)
 {
 }
 #endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -3582,10 +3587,18 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	if (renorm && !curr)
 		se->vruntime += cfs_rq->min_vruntime;
 
+	/*
+	 * When enqueuing a sched_entity, we must:
+	 *   - Update loads to have both entity and cfs_rq synced with now.
+	 *   - Add its load to cfs_rq->runnable_avg
+	 *   - For group_entity, update its weight to reflect the new share of
+	 *     its group cfs_rq
+	 *   - Add its new weight to cfs_rq->load.weight
+	 */
 	update_load_avg(se, UPDATE_TG);
 	enqueue_entity_load_avg(cfs_rq, se);
+	update_cfs_shares(se);
 	account_entity_enqueue(cfs_rq, se);
-	update_cfs_shares(cfs_rq);
 
 	if (flags & ENQUEUE_WAKEUP)
 		place_entity(cfs_rq, se, 0);
@@ -3657,6 +3670,15 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
+
+	/*
+	 * When dequeuing a sched_entity, we must:
+	 *   - Update loads to have both entity and cfs_rq synced with now.
+	 *   - Subtract its load from the cfs_rq->runnable_avg.
+	 *   - Subtract its previous weight from cfs_rq->load.weight.
+	 *   - For group entity, update its weight to reflect the new share
+	 *     of its group cfs_rq.
+	 */
 	update_load_avg(se, UPDATE_TG);
 	dequeue_entity_load_avg(cfs_rq, se);
 
@@ -3681,7 +3703,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	/* return excess runtime on last dequeue */
 	return_cfs_rq_runtime(cfs_rq);
 
-	update_cfs_shares(cfs_rq);
+	update_cfs_shares(se);
 
 	/*
 	 * Now advance min_vruntime if @se was the entity holding it back,
@@ -3864,7 +3886,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	 * Ensure that runnable average is periodically updated.
 	 */
 	update_load_avg(curr, UPDATE_TG);
-	update_cfs_shares(cfs_rq);
+	update_cfs_shares(curr);
 
 #ifdef CONFIG_SCHED_HRTICK
 	/*
@@ -4761,7 +4783,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 			break;
 
 		update_load_avg(se, UPDATE_TG);
-		update_cfs_shares(cfs_rq);
+		update_cfs_shares(se);
 	}
 
 	if (!se)
@@ -4820,7 +4842,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 			break;
 
 		update_load_avg(se, UPDATE_TG);
-		update_cfs_shares(cfs_rq);
+		update_cfs_shares(se);
 	}
 
 	if (!se)
@@ -9362,8 +9384,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 
 		/* Possible calls to update_curr() need rq clock */
 		update_rq_clock(rq);
-		for_each_sched_entity(se)
-			update_cfs_shares(group_cfs_rq(se));
+		for_each_sched_entity(se) {
+			update_load_avg(se, UPDATE_TG);
+			update_cfs_shares(se);
+		}
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
 	}
 

