> +static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) > { > + int decayed; > > + if (atomic_long_read(&cfs_rq->removed_load_avg)) { > + long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); > + cfs_rq->avg.load_avg = subtract_until_zero(cfs_rq->avg.load_avg, r); > + r *= LOAD_AVG_MAX; > + cfs_rq->avg.load_sum = subtract_until_zero(cfs_rq->avg.load_sum, r); > } > > + decayed = __update_load_avg(now, &cfs_rq->avg, cfs_rq->load.weight); > > +#ifndef CONFIG_64BIT > + if (cfs_rq->avg.last_update_time != cfs_rq->load_last_update_time_copy) { > + smp_wmb(); > + cfs_rq->load_last_update_time_copy = cfs_rq->avg.last_update_time; > + } > +#endif > > + return decayed; > +} So on every cfs_rq update we first process the 'pending' removals, then decay and then store the current timestamp. > +static inline void enqueue_entity_load_avg(struct sched_entity *se) > { > + struct sched_avg *sa = &se->avg; > + struct cfs_rq *cfs_rq = cfs_rq_of(se); > + u64 now = cfs_rq_clock_task(cfs_rq); > + int migrated = 0, decayed; > > + if (sa->last_update_time == 0) { > + sa->last_update_time = now; > > + if (entity_is_task(se)) > + migrated = 1; > } > + else > + __update_load_avg(now, sa, se->on_rq * se->load.weight); > > + decayed = update_cfs_rq_load_avg(now, cfs_rq); > > + if (migrated) { > + cfs_rq->avg.load_avg += sa->load_avg; > + cfs_rq->avg.load_sum += sa->load_sum; > } > > + if (decayed || migrated) > + update_tg_load_avg(cfs_rq); > } On enqueue we add ourselves to the cfs_rq.. and assume the entity is 'current' wrt updates since we did that when we just pulled it from the old rq. > @@ -4551,18 +4382,34 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu) > { > struct sched_entity *se = &p->se; > struct cfs_rq *cfs_rq = cfs_rq_of(se); > + u64 last_update_time; > > /* > + * Task on old CPU catches up with its old cfs_rq, and subtract itself from > + * the cfs_rq (task must be off the queue now). 
> */
> +#ifndef CONFIG_64BIT
> +	u64 last_update_time_copy;
>
> +	do {
> +		last_update_time_copy = cfs_rq->load_last_update_time_copy;
> +		smp_rmb();
> +		last_update_time = cfs_rq->avg.last_update_time;
> +	} while (last_update_time != last_update_time_copy);
> +#else
> +	last_update_time = cfs_rq->avg.last_update_time;
> +#endif
> +	__update_load_avg(last_update_time, &se->avg, 0);
> +	atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
> +
> +	/*
> +	 * We are supposed to update the task to "current" time, then its up to date
> +	 * and ready to go to new CPU/cfs_rq. But we have difficulty in getting
> +	 * what current time is, so simply throw away the out-of-date time. This
> +	 * will result in the wakee task is less decayed, but giving the wakee more
> +	 * load sounds not bad.
> +	 */
> +	se->avg.last_update_time = 0;
>
> 	/* We have migrated, no longer consider this task hot */
> 	se->exec_start = 0;

And here we try to make good on that assumption.

The thing I worry about is what happens if the machine is entirely idle...

What guarantees a semi up-to-date cfs_rq->avg.last_update_time?