linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2] sched: fix group_entity's share update
@ 2016-12-21 15:50 Vincent Guittot
  2017-01-04 17:20 ` Dietmar Eggemann
  2017-01-14 12:48 ` [tip:sched/core] sched/core: Fix " tip-bot for Vincent Guittot
  0 siblings, 2 replies; 4+ messages in thread
From: Vincent Guittot @ 2016-12-21 15:50 UTC (permalink / raw)
  To: peterz, mingo, linux-kernel; +Cc: stable, pjt, Vincent Guittot

The update of the share of a cfs_rq is done when its load_avg is updated
but before the group_entity's load_avg has been updated for the past time
slot. This generates wrong load_avg accounting which can be significant
when small tasks are involved in the scheduling.

Let's take the example of a task "a" that is dequeued from its task group A:
   root
  (cfs_rq)
    \
    (se)
     A
    (cfs_rq)
      \
      (se)
       a

Task "a" was the only task in task group A which becomes idle when a is
dequeued.

We have the sequence:

- dequeue_entity a->se
    - update_load_avg(a->se)
    - dequeue_entity_load_avg(A->cfs_rq, a->se)
    - update_cfs_shares(A->cfs_rq)
	A->cfs_rq->load.weight == 0
        A->se->load.weight is updated with the new share (0 in this case)
- dequeue_entity A->se
    - update_load_avg(A->se) but its weight is now null so the last time
      slot (up to a tick) will be accounted with a weight of 0 instead of
      its real weight during the time slot. The last time slot will be
      accounted as an idle one whereas it was a running one.

If the running time of task a is short enough that no tick happens when it
runs, all running time of group entity A->se will be accounted as idle
time.

Instead, we should update the share of a cfs_rq (in fact the weight of its
group entity) only after having updated the load_avg of the group_entity.

update_cfs_shares() now takes the sched_entity as a parameter instead of the
cfs_rq, and the weight of the group_entity is updated only once its load_avg
has been synced with current time.

Cc: <stable@vger.kernel.org> #v4.4+
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
---
 kernel/sched/fair.c | 53 +++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 39 insertions(+), 14 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6559d19..b998287 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2689,16 +2689,20 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 
 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
 
-static void update_cfs_shares(struct cfs_rq *cfs_rq)
+static void update_cfs_shares(struct sched_entity *se)
 {
-	struct task_group *tg;
-	struct sched_entity *se;
 	long shares;
+	struct task_group *tg;
+	struct cfs_rq *cfs_rq = group_cfs_rq(se);
 
-	tg = cfs_rq->tg;
-	se = tg->se[cpu_of(rq_of(cfs_rq))];
-	if (!se || throttled_hierarchy(cfs_rq))
+	if (!cfs_rq)
 		return;
+
+	if (throttled_hierarchy(cfs_rq))
+		return;
+
+	tg = cfs_rq->tg;
+
 #ifndef CONFIG_SMP
 	if (likely(se->load.weight == tg->shares))
 		return;
@@ -2707,8 +2711,10 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
 
 	reweight_entity(cfs_rq_of(se), se, shares);
 }
+
+
 #else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
+static inline void update_cfs_shares(struct sched_entity *se)
 {
 }
 #endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -3582,10 +3588,18 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	if (renorm && !curr)
 		se->vruntime += cfs_rq->min_vruntime;
 
+	/*
+	 * When enqueuing a sched_entity, we must:
+	 *   - Update loads to have both entity and cfs_rq synced with now.
+	 *   - Add its load to cfs_rq->runnable_avg
+	 *   - For group_entity, update its weight to reflect the new share of
+	 *     its group cfs_rq
+	 *   - Add its new weight to cfs_rq->load.weight
+	 */
 	update_load_avg(se, UPDATE_TG);
 	enqueue_entity_load_avg(cfs_rq, se);
+	update_cfs_shares(se);
 	account_entity_enqueue(cfs_rq, se);
-	update_cfs_shares(cfs_rq);
 
 	if (flags & ENQUEUE_WAKEUP)
 		place_entity(cfs_rq, se, 0);
@@ -3657,6 +3671,15 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
+
+	/*
+	 * When dequeuing a sched_entity, we must:
+	 *   - Update loads to have both entity and cfs_rq synced with now.
+	 *   - Subtract its load from the cfs_rq->runnable_avg.
+	 *   - Subtract its previous weight from cfs_rq->load.weight.
+	 *   - For group entity, update its weight to reflect the new share
+	 *     of its group cfs_rq.
+	 */
 	update_load_avg(se, UPDATE_TG);
 	dequeue_entity_load_avg(cfs_rq, se);
 
@@ -3681,7 +3704,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	/* return excess runtime on last dequeue */
 	return_cfs_rq_runtime(cfs_rq);
 
-	update_cfs_shares(cfs_rq);
+	update_cfs_shares(se);
 
 	/*
 	 * Now advance min_vruntime if @se was the entity holding it back,
@@ -3864,7 +3887,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	 * Ensure that runnable average is periodically updated.
 	 */
 	update_load_avg(curr, UPDATE_TG);
-	update_cfs_shares(cfs_rq);
+	update_cfs_shares(curr);
 
 #ifdef CONFIG_SCHED_HRTICK
 	/*
@@ -4761,7 +4784,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 			break;
 
 		update_load_avg(se, UPDATE_TG);
-		update_cfs_shares(cfs_rq);
+		update_cfs_shares(se);
 	}
 
 	if (!se)
@@ -4820,7 +4843,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 			break;
 
 		update_load_avg(se, UPDATE_TG);
-		update_cfs_shares(cfs_rq);
+		update_cfs_shares(se);
 	}
 
 	if (!se)
@@ -9356,8 +9379,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 
 		/* Possible calls to update_curr() need rq clock */
 		update_rq_clock(rq);
-		for_each_sched_entity(se)
-			update_cfs_shares(group_cfs_rq(se));
+		for_each_sched_entity(se) {
+			update_load_avg(se, UPDATE_TG);
+			update_cfs_shares(se);
+		}
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
 	}
 
-- 
2.7.4

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH v2] sched: fix group_entity's share update
  2016-12-21 15:50 [PATCH v2] sched: fix group_entity's share update Vincent Guittot
@ 2017-01-04 17:20 ` Dietmar Eggemann
  2017-01-04 17:48   ` Vincent Guittot
  2017-01-14 12:48 ` [tip:sched/core] sched/core: Fix " tip-bot for Vincent Guittot
  1 sibling, 1 reply; 4+ messages in thread
From: Dietmar Eggemann @ 2017-01-04 17:20 UTC (permalink / raw)
  To: Vincent Guittot, peterz, mingo, linux-kernel; +Cc: stable, pjt

On 21/12/16 15:50, Vincent Guittot wrote:

IMHO, the overall idea makes sense to me. Just a couple of small
questions ...

> The update of the share of a cfs_rq is done when its load_avg is updated
> but before the group_entity's load_avg has been updated for the past time
> slot. This generates wrong load_avg accounting which can be significant
> when small tasks are involved in the scheduling.

Why for small tasks? Is it because we use load =
scale_load_down(cfs_rq->load.weight) and not load = cfs_rq->avg.load_avg
in calc_cfs_shares()?

> Let take the example of a task a that is dequeued of its task group A:
>    root
>   (cfs_rq)
>     \
>     (se)
>      A
>     (cfs_rq)
>       \
>       (se)
>        a
> 
> Task "a" was the only task in task group A which becomes idle when a is
> dequeued.
> 
> We have the sequence:
> 
> - dequeue_entity a->se
>     - update_load_avg(a->se)
>     - dequeue_entity_load_avg(A->cfs_rq, a->se)
>     - update_cfs_shares(A->cfs_rq)
> 	A->cfs_rq->load.weight == 0

You mean A->cfs_rq->load.weight = 0 ?

>         A->se->load.weight is updated with the new share (0 in this case)

Shouldn't this be MIN_SHARES (2) instead of 0?

> - dequeue_entity A->se
>     - update_load_avg(A->se) but its weight is now null so the last time
>       slot (up to a tick) will be accounted with a weight of 0 instead of
>       its real weight during the time slot. The last time slot will be
>       accounted as an idle one whereas it was a running one.
> 
> If the running time of task a is short enough that no tick happens when it
> runs, all running time of group entity A->se will be accounted as idle
> time.
> 
> Instead, we should update the share of a cfs_rq (in fact the weight of its
> group entity) only after having updated the load_avg of the group_entity.

This is because we use 'se->on_rq * scale_load_down(se->load.weight)' in
__update_load_avg() as weight parameter for PELT load_avg update?

> update_cfs_shares() now take the sched_entity as parameter instead of the
> cfs_rq and  the weight of the group_entity is updated only once its load_avg
> has been synced with current time.

[...]

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH v2] sched: fix group_entity's share update
  2017-01-04 17:20 ` Dietmar Eggemann
@ 2017-01-04 17:48   ` Vincent Guittot
  0 siblings, 0 replies; 4+ messages in thread
From: Vincent Guittot @ 2017-01-04 17:48 UTC (permalink / raw)
  To: Dietmar Eggemann
  Cc: Peter Zijlstra, Ingo Molnar, linux-kernel, stable, Paul Turner

On 4 January 2017 at 18:20, Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:
> On 21/12/16 15:50, Vincent Guittot wrote:
>
> IMHO, the overall idea makes sense to me. Just a couple of small
> questions ...
>
>> The update of the share of a cfs_rq is done when its load_avg is updated
>> but before the group_entity's load_avg has been updated for the past time
>> slot. This generates wrong load_avg accounting which can be significant
>> when small tasks are involved in the scheduling.
>
> Why for small tasks? Is it because we use load =
> scale_load_down(cfs_rq->load.weight) and not load = cfs_rq->avg.load_avg
> in calc_cfs_shares()?

Small tasks are significantly impacted because the tick has less
chance to fire while the task is running, and as a result there is less
chance to update the group entity's load_avg for the past tick with the
correct weight, which would otherwise minimize the impact of this bug.

>
>> Let take the example of a task a that is dequeued of its task group A:
>>    root
>>   (cfs_rq)
>>     \
>>     (se)
>>      A
>>     (cfs_rq)
>>       \
>>       (se)
>>        a
>>
>> Task "a" was the only task in task group A which becomes idle when a is
>> dequeued.
>>
>> We have the sequence:
>>
>> - dequeue_entity a->se
>>     - update_load_avg(a->se)
>>     - dequeue_entity_load_avg(A->cfs_rq, a->se)
>>     - update_cfs_shares(A->cfs_rq)
>>       A->cfs_rq->load.weight == 0
>
> You mean A->cfs_rq->load.weight = 0 ?

No, I do mean A->cfs_rq->load.weight == 0; it is just to point out that
A->cfs_rq->load.weight equals 0 at that time.
A->cfs_rq->load.weight has been set to 0 in account_entity_dequeue(),
which is not described here and happens between
dequeue_entity_load_avg() and update_cfs_shares().

>
>>         A->se->load.weight is updated with the new share (0 in this case)
>
> Shouldn't this be MIN_SHARES (2) instead of 0?

Yes, you're right.
I had in mind scale_load_down(se->load.weight), which is used when
calling __update_load_avg() in update_load_avg() and which is 0 on
the 64-bit platform that I used.

>
>> - dequeue_entity A->se
>>     - update_load_avg(A->se) but its weight is now null so the last time
>>       slot (up to a tick) will be accounted with a weight of 0 instead of
>>       its real weight during the time slot. The last time slot will be
>>       accounted as an idle one whereas it was a running one.
>>
>> If the running time of task a is short enough that no tick happens when it
>> runs, all running time of group entity A->se will be accounted as idle
>> time.
>>
>> Instead, we should update the share of a cfs_rq (in fact the weight of its
>> group entity) only after having updated the load_avg of the group_entity.
>
> This is because we use 'se->on_rq * scale_load_down(se->load.weight)' in
> __update_load_avg() as weight parameter for PELT load_avg update?

yes

>
>> update_cfs_shares() now take the sched_entity as parameter instead of the
>> cfs_rq and  the weight of the group_entity is updated only once its load_avg
>> has been synced with current time.
>
> [...]
>

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [tip:sched/core] sched/core: Fix group_entity's share update
  2016-12-21 15:50 [PATCH v2] sched: fix group_entity's share update Vincent Guittot
  2017-01-04 17:20 ` Dietmar Eggemann
@ 2017-01-14 12:48 ` tip-bot for Vincent Guittot
  1 sibling, 0 replies; 4+ messages in thread
From: tip-bot for Vincent Guittot @ 2017-01-14 12:48 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: tglx, mingo, peterz, torvalds, efault, vincent.guittot,
	linux-kernel, hpa

Commit-ID:  89ee048f3cc796db6f26906c6bef4edf0bee70fd
Gitweb:     http://git.kernel.org/tip/89ee048f3cc796db6f26906c6bef4edf0bee70fd
Author:     Vincent Guittot <vincent.guittot@linaro.org>
AuthorDate: Wed, 21 Dec 2016 16:50:26 +0100
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Sat, 14 Jan 2017 11:30:02 +0100

sched/core: Fix group_entity's share update

The update of the share of a cfs_rq is done when its load_avg is updated
but before the group_entity's load_avg has been updated for the past time
slot. This generates wrong load_avg accounting which can be significant
when small tasks are involved in the scheduling.

Let's take the example of a task "a" that is dequeued from its task group A:
   root
  (cfs_rq)
    \
    (se)
     A
    (cfs_rq)
      \
      (se)
       a

Task "a" was the only task in task group A which becomes idle when a is
dequeued.

We have the sequence:

- dequeue_entity a->se
    - update_load_avg(a->se)
    - dequeue_entity_load_avg(A->cfs_rq, a->se)
    - update_cfs_shares(A->cfs_rq)
	A->cfs_rq->load.weight == 0
        A->se->load.weight is updated with the new share (0 in this case)
- dequeue_entity A->se
    - update_load_avg(A->se) but its weight is now null so the last time
      slot (up to a tick) will be accounted with a weight of 0 instead of
      its real weight during the time slot. The last time slot will be
      accounted as an idle one whereas it was a running one.

If the running time of task a is short enough that no tick happens when it
runs, all running time of group entity A->se will be accounted as idle
time.

Instead, we should update the share of a cfs_rq (in fact the weight of its
group entity) only after having updated the load_avg of the group_entity.

update_cfs_shares() now takes the sched_entity as a parameter instead of the
cfs_rq, and the weight of the group_entity is updated only once its load_avg
has been synced with current time.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: pjt@google.com
Link: http://lkml.kernel.org/r/1482335426-7664-1-git-send-email-vincent.guittot@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 50 +++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 37 insertions(+), 13 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b3bfe3f..2b866a2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2689,16 +2689,20 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
 
 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
 
-static void update_cfs_shares(struct cfs_rq *cfs_rq)
+static void update_cfs_shares(struct sched_entity *se)
 {
+	struct cfs_rq *cfs_rq = group_cfs_rq(se);
 	struct task_group *tg;
-	struct sched_entity *se;
 	long shares;
 
-	tg = cfs_rq->tg;
-	se = tg->se[cpu_of(rq_of(cfs_rq))];
-	if (!se || throttled_hierarchy(cfs_rq))
+	if (!cfs_rq)
+		return;
+
+	if (throttled_hierarchy(cfs_rq))
 		return;
+
+	tg = cfs_rq->tg;
+
 #ifndef CONFIG_SMP
 	if (likely(se->load.weight == tg->shares))
 		return;
@@ -2707,8 +2711,9 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
 
 	reweight_entity(cfs_rq_of(se), se, shares);
 }
+
 #else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
+static inline void update_cfs_shares(struct sched_entity *se)
 {
 }
 #endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -3582,10 +3587,18 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	if (renorm && !curr)
 		se->vruntime += cfs_rq->min_vruntime;
 
+	/*
+	 * When enqueuing a sched_entity, we must:
+	 *   - Update loads to have both entity and cfs_rq synced with now.
+	 *   - Add its load to cfs_rq->runnable_avg
+	 *   - For group_entity, update its weight to reflect the new share of
+	 *     its group cfs_rq
+	 *   - Add its new weight to cfs_rq->load.weight
+	 */
 	update_load_avg(se, UPDATE_TG);
 	enqueue_entity_load_avg(cfs_rq, se);
+	update_cfs_shares(se);
 	account_entity_enqueue(cfs_rq, se);
-	update_cfs_shares(cfs_rq);
 
 	if (flags & ENQUEUE_WAKEUP)
 		place_entity(cfs_rq, se, 0);
@@ -3657,6 +3670,15 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
+
+	/*
+	 * When dequeuing a sched_entity, we must:
+	 *   - Update loads to have both entity and cfs_rq synced with now.
+	 *   - Subtract its load from the cfs_rq->runnable_avg.
+	 *   - Subtract its previous weight from cfs_rq->load.weight.
+	 *   - For group entity, update its weight to reflect the new share
+	 *     of its group cfs_rq.
+	 */
 	update_load_avg(se, UPDATE_TG);
 	dequeue_entity_load_avg(cfs_rq, se);
 
@@ -3681,7 +3703,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	/* return excess runtime on last dequeue */
 	return_cfs_rq_runtime(cfs_rq);
 
-	update_cfs_shares(cfs_rq);
+	update_cfs_shares(se);
 
 	/*
 	 * Now advance min_vruntime if @se was the entity holding it back,
@@ -3864,7 +3886,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	 * Ensure that runnable average is periodically updated.
 	 */
 	update_load_avg(curr, UPDATE_TG);
-	update_cfs_shares(cfs_rq);
+	update_cfs_shares(curr);
 
 #ifdef CONFIG_SCHED_HRTICK
 	/*
@@ -4761,7 +4783,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 			break;
 
 		update_load_avg(se, UPDATE_TG);
-		update_cfs_shares(cfs_rq);
+		update_cfs_shares(se);
 	}
 
 	if (!se)
@@ -4820,7 +4842,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 			break;
 
 		update_load_avg(se, UPDATE_TG);
-		update_cfs_shares(cfs_rq);
+		update_cfs_shares(se);
 	}
 
 	if (!se)
@@ -9362,8 +9384,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 
 		/* Possible calls to update_curr() need rq clock */
 		update_rq_clock(rq);
-		for_each_sched_entity(se)
-			update_cfs_shares(group_cfs_rq(se));
+		for_each_sched_entity(se) {
+			update_load_avg(se, UPDATE_TG);
+			update_cfs_shares(se);
+		}
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
 	}
 

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2017-01-14 12:48 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-12-21 15:50 [PATCH v2] sched: fix group_entity's share update Vincent Guittot
2017-01-04 17:20 ` Dietmar Eggemann
2017-01-04 17:48   ` Vincent Guittot
2017-01-14 12:48 ` [tip:sched/core] sched/core: Fix " tip-bot for Vincent Guittot

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).