> +static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) > { > + int decayed; > > + if (atomic_long_read(&cfs_rq->removed_load_avg)) { > + long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); > + cfs_rq->avg.load_avg = subtract_until_zero(cfs_rq->avg.load_avg, r); > + r *= LOAD_AVG_MAX; > + cfs_rq->avg.load_sum = subtract_until_zero(cfs_rq->avg.load_sum, r); > } > > + decayed = __update_load_avg(now, &cfs_rq->avg, cfs_rq->load.weight); > > +#ifndef CONFIG_64BIT > + if (cfs_rq->avg.last_update_time != cfs_rq->load_last_update_time_copy) { > + smp_wmb(); > + cfs_rq->load_last_update_time_copy = cfs_rq->avg.last_update_time; > + } > +#endif > > + return decayed; > +} So on every cfs_rq update we first process the 'pending' removals, then decay and then store the current timestamp. > +static inline void enqueue_entity_load_avg(struct sched_entity *se) > { > + struct sched_avg *sa = &se->avg; > + struct cfs_rq *cfs_rq = cfs_rq_of(se); > + u64 now = cfs_rq_clock_task(cfs_rq); > + int migrated = 0, decayed; > > + if (sa->last_update_time == 0) { > + sa->last_update_time = now; > > + if (entity_is_task(se)) > + migrated = 1; > } > + else > + __update_load_avg(now, sa, se->on_rq * se->load.weight); > > + decayed = update_cfs_rq_load_avg(now, cfs_rq); > > + if (migrated) { > + cfs_rq->avg.load_avg += sa->load_avg; > + cfs_rq->avg.load_sum += sa->load_sum; > } > > + if (decayed || migrated) > + update_tg_load_avg(cfs_rq); > } On enqueue we add ourselves to the cfs_rq.. and assume the entity is 'current' wrt updates since we did that when we just pulled it from the old rq. > @@ -4551,18 +4382,34 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu) > { > struct sched_entity *se = &p->se; > struct cfs_rq *cfs_rq = cfs_rq_of(se); > + u64 last_update_time; > > /* > + * Task on old CPU catches up with its old cfs_rq, and subtract itself from > + * the cfs_rq (task must be off the queue now). 
> */
> +#ifndef CONFIG_64BIT
> +	u64 last_update_time_copy;
>
> +	do {
> +		last_update_time_copy = cfs_rq->load_last_update_time_copy;
> +		smp_rmb();
> +		last_update_time = cfs_rq->avg.last_update_time;
> +	} while (last_update_time != last_update_time_copy);
> +#else
> +	last_update_time = cfs_rq->avg.last_update_time;
> +#endif
> +	__update_load_avg(last_update_time, &se->avg, 0);
> +	atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
> +
> +	/*
> +	 * We are supposed to update the task to "current" time, then its up to date
> +	 * and ready to go to new CPU/cfs_rq. But we have difficulty in getting
> +	 * what current time is, so simply throw away the out-of-date time. This
> +	 * will result in the wakee task is less decayed, but giving the wakee more
> +	 * load sounds not bad.
> +	 */
> +	se->avg.last_update_time = 0;
>
> 	/* We have migrated, no longer consider this task hot */
> 	se->exec_start = 0;

And here we try to make good on that assumption.

The thing I worry about is what happens if the machine is entirely idle...

What guarantees a semi up-to-date cfs_rq->avg.last_update_time?