* [PATCH v2] sched: fix group_entity's share update
@ 2016-12-21 15:50 Vincent Guittot
2017-01-04 17:20 ` Dietmar Eggemann
2017-01-14 12:48 ` [tip:sched/core] sched/core: Fix " tip-bot for Vincent Guittot
0 siblings, 2 replies; 4+ messages in thread
From: Vincent Guittot @ 2016-12-21 15:50 UTC (permalink / raw)
To: peterz, mingo, linux-kernel; +Cc: stable, pjt, Vincent Guittot
The update of the share of a cfs_rq is done when its load_avg is updated
but before the group_entity's load_avg has been updated for the past time
slot. This generates wrong load_avg accounting which can be significant
when small tasks are involved in the scheduling.
Let's take the example of a task "a" that is dequeued from its task group A:
root
(cfs_rq)
\
(se)
A
(cfs_rq)
\
(se)
a
Task "a" was the only task in task group A which becomes idle when a is
dequeued.
We have the sequence:
- dequeue_entity a->se
- update_load_avg(a->se)
- dequeue_entity_load_avg(A->cfs_rq, a->se)
- update_cfs_shares(A->cfs_rq)
A->cfs_rq->load.weight == 0
A->se->load.weight is updated with the new share (MIN_SHARES in this case, which scale_load_down() turns into 0 on 64-bit platforms)
- dequeue_entity A->se
- update_load_avg(A->se) but its weight is now null so the last time
slot (up to a tick) will be accounted with a weight of 0 instead of
its real weight during the time slot. The last time slot will be
accounted as an idle one whereas it was a running one.
If the running time of task a is short enough that no tick happens when it
runs, all running time of group entity A->se will be accounted as idle
time.
Instead, we should update the share of a cfs_rq (in fact the weight of its
group entity) only after having updated the load_avg of the group_entity.
update_cfs_shares() now takes the sched_entity as a parameter instead of the
cfs_rq, and the weight of the group_entity is updated only once its load_avg
has been synced with the current time.
Cc: <stable@vger.kernel.org> #v4.4+
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
---
kernel/sched/fair.c | 53 +++++++++++++++++++++++++++++++++++++++--------------
1 file changed, 39 insertions(+), 14 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6559d19..b998287 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2689,16 +2689,20 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
-static void update_cfs_shares(struct cfs_rq *cfs_rq)
+static void update_cfs_shares(struct sched_entity *se)
{
- struct task_group *tg;
- struct sched_entity *se;
long shares;
+ struct task_group *tg;
+ struct cfs_rq *cfs_rq = group_cfs_rq(se);
- tg = cfs_rq->tg;
- se = tg->se[cpu_of(rq_of(cfs_rq))];
- if (!se || throttled_hierarchy(cfs_rq))
+ if (!cfs_rq)
return;
+
+ if (throttled_hierarchy(cfs_rq))
+ return;
+
+ tg = cfs_rq->tg;
+
#ifndef CONFIG_SMP
if (likely(se->load.weight == tg->shares))
return;
@@ -2707,8 +2711,10 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
reweight_entity(cfs_rq_of(se), se, shares);
}
+
+
#else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
+static inline void update_cfs_shares(struct sched_entity *se)
{
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -3582,10 +3588,18 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (renorm && !curr)
se->vruntime += cfs_rq->min_vruntime;
+ /*
+ * When enqueuing a sched_entity, we must:
+ * - Update loads to have both entity and cfs_rq synced with now.
+ * - Add its load to cfs_rq->runnable_avg
+ * - For group_entity, update its weight to reflect the new share of
+ * its group cfs_rq
+ * - Add its new weight to cfs_rq->load.weight
+ */
update_load_avg(se, UPDATE_TG);
enqueue_entity_load_avg(cfs_rq, se);
+ update_cfs_shares(se);
account_entity_enqueue(cfs_rq, se);
- update_cfs_shares(cfs_rq);
if (flags & ENQUEUE_WAKEUP)
place_entity(cfs_rq, se, 0);
@@ -3657,6 +3671,15 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
+
+ /*
+ * When dequeuing a sched_entity, we must:
+ * - Update loads to have both entity and cfs_rq synced with now.
+ * - Substract its load from the cfs_rq->runnable_avg.
+ * - Substract its previous weight from cfs_rq->load.weight.
+ * - For group entity, update its weight to reflect the new share
+ * of its group cfs_rq.
+ */
update_load_avg(se, UPDATE_TG);
dequeue_entity_load_avg(cfs_rq, se);
@@ -3681,7 +3704,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
/* return excess runtime on last dequeue */
return_cfs_rq_runtime(cfs_rq);
- update_cfs_shares(cfs_rq);
+ update_cfs_shares(se);
/*
* Now advance min_vruntime if @se was the entity holding it back,
@@ -3864,7 +3887,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
* Ensure that runnable average is periodically updated.
*/
update_load_avg(curr, UPDATE_TG);
- update_cfs_shares(cfs_rq);
+ update_cfs_shares(curr);
#ifdef CONFIG_SCHED_HRTICK
/*
@@ -4761,7 +4784,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
break;
update_load_avg(se, UPDATE_TG);
- update_cfs_shares(cfs_rq);
+ update_cfs_shares(se);
}
if (!se)
@@ -4820,7 +4843,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
break;
update_load_avg(se, UPDATE_TG);
- update_cfs_shares(cfs_rq);
+ update_cfs_shares(se);
}
if (!se)
@@ -9356,8 +9379,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
/* Possible calls to update_curr() need rq clock */
update_rq_clock(rq);
- for_each_sched_entity(se)
- update_cfs_shares(group_cfs_rq(se));
+ for_each_sched_entity(se) {
+ update_load_avg(se, UPDATE_TG);
+ update_cfs_shares(se);
+ }
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
--
2.7.4
^ permalink raw reply related [flat|nested] 4+ messages in thread
* Re: [PATCH v2] sched: fix group_entity's share update
2016-12-21 15:50 [PATCH v2] sched: fix group_entity's share update Vincent Guittot
@ 2017-01-04 17:20 ` Dietmar Eggemann
2017-01-04 17:48 ` Vincent Guittot
2017-01-14 12:48 ` [tip:sched/core] sched/core: Fix " tip-bot for Vincent Guittot
1 sibling, 1 reply; 4+ messages in thread
From: Dietmar Eggemann @ 2017-01-04 17:20 UTC (permalink / raw)
To: Vincent Guittot, peterz, mingo, linux-kernel; +Cc: stable, pjt
On 21/12/16 15:50, Vincent Guittot wrote:
IMHO, the overall idea makes sense to me. Just a couple of small
questions ...
> The update of the share of a cfs_rq is done when its load_avg is updated
> but before the group_entity's load_avg has been updated for the past time
> slot. This generates wrong load_avg accounting which can be significant
> when small tasks are involved in the scheduling.
Why for small tasks? Is it because we use load =
scale_load_down(cfs_rq->load.weight) and not load = cfs_rq->avg.load_avg
in calc_cfs_shares()?
> Let take the example of a task a that is dequeued of its task group A:
> root
> (cfs_rq)
> \
> (se)
> A
> (cfs_rq)
> \
> (se)
> a
>
> Task "a" was the only task in task group A which becomes idle when a is
> dequeued.
>
> We have the sequence:
>
> - dequeue_entity a->se
> - update_load_avg(a->se)
> - dequeue_entity_load_avg(A->cfs_rq, a->se)
> - update_cfs_shares(A->cfs_rq)
> A->cfs_rq->load.weight == 0
You mean A->cfs_rq->load.weight = 0 ?
> A->se->load.weight is updated with the new share (0 in this case)
Shouldn't this be MIN_SHARES (2) instead of 0?
> - dequeue_entity A->se
> - update_load_avg(A->se) but its weight is now null so the last time
> slot (up to a tick) will be accounted with a weight of 0 instead of
> its real weight during the time slot. The last time slot will be
> accounted as an idle one whereas it was a running one.
>
> If the running time of task a is short enough that no tick happens when it
> runs, all running time of group entity A->se will be accounted as idle
> time.
>
> Instead, we should update the share of a cfs_rq (in fact the weight of its
> group entity) only after having updated the load_avg of the group_entity.
This is because we use 'se->on_rq * scale_load_down(se->load.weight)' in
__update_load_avg() as weight parameter for PELT load_avg update?
> update_cfs_shares() now take the sched_entity as parameter instead of the
> cfs_rq and the weight of the group_entity is updated only once its load_avg
> has been synced with current time.
[...]
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH v2] sched: fix group_entity's share update
2017-01-04 17:20 ` Dietmar Eggemann
@ 2017-01-04 17:48 ` Vincent Guittot
0 siblings, 0 replies; 4+ messages in thread
From: Vincent Guittot @ 2017-01-04 17:48 UTC (permalink / raw)
To: Dietmar Eggemann
Cc: Peter Zijlstra, Ingo Molnar, linux-kernel, stable, Paul Turner
On 4 January 2017 at 18:20, Dietmar Eggemann <dietmar.eggemann@arm.com> wrote:
> On 21/12/16 15:50, Vincent Guittot wrote:
>
> IMHO, the overall idea makes sense to me. Just a couple of small
> questions ...
>
>> The update of the share of a cfs_rq is done when its load_avg is updated
>> but before the group_entity's load_avg has been updated for the past time
>> slot. This generates wrong load_avg accounting which can be significant
>> when small tasks are involved in the scheduling.
>
> Why for small tasks? Is it because we use load =
> scale_load_down(cfs_rq->load.weight) and not load = cfs_rq->avg.load_avg
> in calc_cfs_shares()?
Small tasks are significantly impacted because the tick is less likely
to fire while the task is running. As a result, there is less chance of
updating the group entity's load_avg for the past tick with the correct
weight, which would otherwise minimize the impact of this bug.
>
>> Let take the example of a task a that is dequeued of its task group A:
>> root
>> (cfs_rq)
>> \
>> (se)
>> A
>> (cfs_rq)
>> \
>> (se)
>> a
>>
>> Task "a" was the only task in task group A which becomes idle when a is
>> dequeued.
>>
>> We have the sequence:
>>
>> - dequeue_entity a->se
>> - update_load_avg(a->se)
>> - dequeue_entity_load_avg(A->cfs_rq, a->se)
>> - update_cfs_shares(A->cfs_rq)
>> A->cfs_rq->load.weight == 0
>
> You mean A->cfs_rq->load.weight = 0 ?
No, I do mean A->cfs_rq->load.weight == 0. It is just to point out that
A->cfs_rq->load.weight equals 0 at that time.
A->cfs_rq->load.weight has been set to 0 in account_entity_dequeue(),
which is not described here and happens between
dequeue_entity_load_avg() and update_cfs_shares().
>
>> A->se->load.weight is updated with the new share (0 in this case)
>
> Shouldn't this be MIN_SHARES (2) instead of 0?
Yes, you're right.
I had in mind scale_load_down(se->load.weight), which is used when
calling __update_load_avg() in update_load_avg() and which is 0 on
the 64-bit platform that I used.
>
>> - dequeue_entity A->se
>> - update_load_avg(A->se) but its weight is now null so the last time
>> slot (up to a tick) will be accounted with a weight of 0 instead of
>> its real weight during the time slot. The last time slot will be
>> accounted as an idle one whereas it was a running one.
>>
>> If the running time of task a is short enough that no tick happens when it
>> runs, all running time of group entity A->se will be accounted as idle
>> time.
>>
>> Instead, we should update the share of a cfs_rq (in fact the weight of its
>> group entity) only after having updated the load_avg of the group_entity.
>
> This is because we use 'se->on_rq * scale_load_down(se->load.weight)' in
> __update_load_avg() as weight parameter for PELT load_avg update?
yes
>
>> update_cfs_shares() now take the sched_entity as parameter instead of the
>> cfs_rq and the weight of the group_entity is updated only once its load_avg
>> has been synced with current time.
>
> [...]
>
^ permalink raw reply [flat|nested] 4+ messages in thread
* [tip:sched/core] sched/core: Fix group_entity's share update
2016-12-21 15:50 [PATCH v2] sched: fix group_entity's share update Vincent Guittot
2017-01-04 17:20 ` Dietmar Eggemann
@ 2017-01-14 12:48 ` tip-bot for Vincent Guittot
1 sibling, 0 replies; 4+ messages in thread
From: tip-bot for Vincent Guittot @ 2017-01-14 12:48 UTC (permalink / raw)
To: linux-tip-commits
Cc: tglx, mingo, peterz, torvalds, efault, vincent.guittot,
linux-kernel, hpa
Commit-ID: 89ee048f3cc796db6f26906c6bef4edf0bee70fd
Gitweb: http://git.kernel.org/tip/89ee048f3cc796db6f26906c6bef4edf0bee70fd
Author: Vincent Guittot <vincent.guittot@linaro.org>
AuthorDate: Wed, 21 Dec 2016 16:50:26 +0100
Committer: Ingo Molnar <mingo@kernel.org>
CommitDate: Sat, 14 Jan 2017 11:30:02 +0100
sched/core: Fix group_entity's share update
The update of the share of a cfs_rq is done when its load_avg is updated
but before the group_entity's load_avg has been updated for the past time
slot. This generates wrong load_avg accounting which can be significant
when small tasks are involved in the scheduling.
Let's take the example of a task "a" that is dequeued from its task group A:
root
(cfs_rq)
\
(se)
A
(cfs_rq)
\
(se)
a
Task "a" was the only task in task group A which becomes idle when a is
dequeued.
We have the sequence:
- dequeue_entity a->se
- update_load_avg(a->se)
- dequeue_entity_load_avg(A->cfs_rq, a->se)
- update_cfs_shares(A->cfs_rq)
A->cfs_rq->load.weight == 0
A->se->load.weight is updated with the new share (MIN_SHARES in this case, which scale_load_down() turns into 0 on 64-bit platforms)
- dequeue_entity A->se
- update_load_avg(A->se) but its weight is now null so the last time
slot (up to a tick) will be accounted with a weight of 0 instead of
its real weight during the time slot. The last time slot will be
accounted as an idle one whereas it was a running one.
If the running time of task a is short enough that no tick happens when it
runs, all running time of group entity A->se will be accounted as idle
time.
Instead, we should update the share of a cfs_rq (in fact the weight of its
group entity) only after having updated the load_avg of the group_entity.
update_cfs_shares() now takes the sched_entity as a parameter instead of the
cfs_rq, and the weight of the group_entity is updated only once its load_avg
has been synced with current time.
Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: pjt@google.com
Link: http://lkml.kernel.org/r/1482335426-7664-1-git-send-email-vincent.guittot@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
kernel/sched/fair.c | 50 +++++++++++++++++++++++++++++++++++++-------------
1 file changed, 37 insertions(+), 13 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b3bfe3f..2b866a2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2689,16 +2689,20 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
-static void update_cfs_shares(struct cfs_rq *cfs_rq)
+static void update_cfs_shares(struct sched_entity *se)
{
+ struct cfs_rq *cfs_rq = group_cfs_rq(se);
struct task_group *tg;
- struct sched_entity *se;
long shares;
- tg = cfs_rq->tg;
- se = tg->se[cpu_of(rq_of(cfs_rq))];
- if (!se || throttled_hierarchy(cfs_rq))
+ if (!cfs_rq)
+ return;
+
+ if (throttled_hierarchy(cfs_rq))
return;
+
+ tg = cfs_rq->tg;
+
#ifndef CONFIG_SMP
if (likely(se->load.weight == tg->shares))
return;
@@ -2707,8 +2711,9 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
reweight_entity(cfs_rq_of(se), se, shares);
}
+
#else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
+static inline void update_cfs_shares(struct sched_entity *se)
{
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -3582,10 +3587,18 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (renorm && !curr)
se->vruntime += cfs_rq->min_vruntime;
+ /*
+ * When enqueuing a sched_entity, we must:
+ * - Update loads to have both entity and cfs_rq synced with now.
+ * - Add its load to cfs_rq->runnable_avg
+ * - For group_entity, update its weight to reflect the new share of
+ * its group cfs_rq
+ * - Add its new weight to cfs_rq->load.weight
+ */
update_load_avg(se, UPDATE_TG);
enqueue_entity_load_avg(cfs_rq, se);
+ update_cfs_shares(se);
account_entity_enqueue(cfs_rq, se);
- update_cfs_shares(cfs_rq);
if (flags & ENQUEUE_WAKEUP)
place_entity(cfs_rq, se, 0);
@@ -3657,6 +3670,15 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
+
+ /*
+ * When dequeuing a sched_entity, we must:
+ * - Update loads to have both entity and cfs_rq synced with now.
+ * - Substract its load from the cfs_rq->runnable_avg.
+ * - Substract its previous weight from cfs_rq->load.weight.
+ * - For group entity, update its weight to reflect the new share
+ * of its group cfs_rq.
+ */
update_load_avg(se, UPDATE_TG);
dequeue_entity_load_avg(cfs_rq, se);
@@ -3681,7 +3703,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
/* return excess runtime on last dequeue */
return_cfs_rq_runtime(cfs_rq);
- update_cfs_shares(cfs_rq);
+ update_cfs_shares(se);
/*
* Now advance min_vruntime if @se was the entity holding it back,
@@ -3864,7 +3886,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
* Ensure that runnable average is periodically updated.
*/
update_load_avg(curr, UPDATE_TG);
- update_cfs_shares(cfs_rq);
+ update_cfs_shares(curr);
#ifdef CONFIG_SCHED_HRTICK
/*
@@ -4761,7 +4783,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
break;
update_load_avg(se, UPDATE_TG);
- update_cfs_shares(cfs_rq);
+ update_cfs_shares(se);
}
if (!se)
@@ -4820,7 +4842,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
break;
update_load_avg(se, UPDATE_TG);
- update_cfs_shares(cfs_rq);
+ update_cfs_shares(se);
}
if (!se)
@@ -9362,8 +9384,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
/* Possible calls to update_curr() need rq clock */
update_rq_clock(rq);
- for_each_sched_entity(se)
- update_cfs_shares(group_cfs_rq(se));
+ for_each_sched_entity(se) {
+ update_load_avg(se, UPDATE_TG);
+ update_cfs_shares(se);
+ }
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
^ permalink raw reply related [flat|nested] 4+ messages in thread
end of thread, other threads:[~2017-01-14 12:48 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-12-21 15:50 [PATCH v2] sched: fix group_entity's share update Vincent Guittot
2017-01-04 17:20 ` Dietmar Eggemann
2017-01-04 17:48 ` Vincent Guittot
2017-01-14 12:48 ` [tip:sched/core] sched/core: Fix " tip-bot for Vincent Guittot
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).