From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1752528AbeBFI4N (ORCPT ); Tue, 6 Feb 2018 03:56:13 -0500 Received: from mail-io0-f194.google.com ([209.85.223.194]:33934 "EHLO mail-io0-f194.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751835AbeBFI4F (ORCPT ); Tue, 6 Feb 2018 03:56:05 -0500 X-Google-Smtp-Source: AH8x225pXE9qDEP2wwGa5SJbHbxM+AIfDEnGGnpxlgvGgNAWnvgcVrox4RraCzVlI/YkzGdJwuJnTEUgpZQynlF6bcc= MIME-Version: 1.0 In-Reply-To: <1517905943-24528-1-git-send-email-vincent.guittot@linaro.org> References: <20180201181041.GF2269@hirez.programming.kicks-ass.net> <1517905943-24528-1-git-send-email-vincent.guittot@linaro.org> From: Vincent Guittot Date: Tue, 6 Feb 2018 09:55:43 +0100 Message-ID: Subject: Re: [PATCH 1/3] sched: Stop nohz stats when decayed To: Peter Zijlstra , Ingo Molnar , linux-kernel , Valentin Schneider Cc: Morten Rasmussen , Brendan Jackman , Dietmar Eggemann , Vincent Guittot Content-Type: text/plain; charset="UTF-8" Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Hi Peter, On 6 February 2018 at 09:32, Vincent Guittot wrote: > Stopped the periodic update of blocked load when all idle CPUs have fully > decayed. We introduce a new nohz.has_blocked that reflect if some idle > CPUs has blocked load that have to be periodiccally updated. nohz.has_blocked > is set everytime that a Idle CPU can have blocked load and it is then clear > when no more blocked load has been detected during an update. We don't need > atomic operation but only to make cure of the right ordering when updating > nohz.idle_cpus_mask and nohz.has_blocked. > > Suggested-by: Peter Zijlstra (Intel) > Signed-off-by: Vincent Guittot > --- This patchset applies on your testing branch after removing the 2 last commits: 56eb4679 ("sched: Clean up nohz enter/exit") > kernel/sched/fair.c | 94 +++++++++++++++++++++++++++++++++++++++++----------- > kernel/sched/sched.h | 1 + > 2 files changed, 75 insertions(+), 20 deletions(-) > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c > index 7af1fa9..279f4b2 100644 > --- a/kernel/sched/fair.c > +++ b/kernel/sched/fair.c > @@ -5383,8 +5383,9 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) > static struct { > cpumask_var_t idle_cpus_mask; > atomic_t nr_cpus; > + int has_blocked; /* Idle CPUS has blocked load */ > unsigned long next_balance; /* in jiffy units */ > - unsigned long next_stats; > + unsigned long next_blocked; /* Next update of blocked load in jiffies */ > } nohz ____cacheline_aligned; > > #endif /* CONFIG_NO_HZ_COMMON */ > @@ -6951,6 +6952,7 @@ enum fbq_type { regular, remote, all }; > #define LBF_DST_PINNED 0x04 > #define LBF_SOME_PINNED 0x08 > #define LBF_NOHZ_STATS 0x10 > +#define LBF_NOHZ_AGAIN 0x20 > > struct lb_env { > struct sched_domain *sd; > @@ -7335,8 +7337,6 @@ static void attach_tasks(struct lb_env *env) > rq_unlock(env->dst_rq, &rf); > } > > -#ifdef CONFIG_FAIR_GROUP_SCHED > - > static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) > { > if (cfs_rq->load.weight) > @@ -7354,11 +7354,14 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) > return true; > } > > +#ifdef CONFIG_FAIR_GROUP_SCHED > + > static void update_blocked_averages(int cpu) > { > struct rq *rq = cpu_rq(cpu); > struct cfs_rq *cfs_rq, *pos; > struct rq_flags rf; > + bool done = true; > > rq_lock_irqsave(rq, &rf); > update_rq_clock(rq); > @@ -7388,10 +7391,14 @@ static void update_blocked_averages(int cpu) > */ > if (cfs_rq_is_decayed(cfs_rq)) > list_del_leaf_cfs_rq(cfs_rq); > + else > + done = false; > } > > #ifdef CONFIG_NO_HZ_COMMON > rq->last_blocked_load_update_tick = jiffies; > + if (done) > + rq->has_blocked_load = 0; > #endif > rq_unlock_irqrestore(rq, &rf); > } > @@ -7454,6 +7461,8 @@ static inline void update_blocked_averages(int cpu) > update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); > #ifdef CONFIG_NO_HZ_COMMON > rq->last_blocked_load_update_tick = jiffies; > + if (cfs_rq_is_decayed(cfs_rq)) > + rq->has_blocked_load = 0; > #endif > rq_unlock_irqrestore(rq, &rf); > } > @@ -7789,18 +7798,25 @@ group_type group_classify(struct sched_group *group, > return group_other; > } > > -static void update_nohz_stats(struct rq *rq) > +static bool update_nohz_stats(struct rq *rq) > { > #ifdef CONFIG_NO_HZ_COMMON > unsigned int cpu = rq->cpu; > > + if (!rq->has_blocked_load) > + return false; > + > if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) > - return; > + return false; > > if (!time_after(jiffies, rq->last_blocked_load_update_tick)) > - return; > + return true; > > update_blocked_averages(cpu); > + > + return rq->has_blocked_load; > +#else > + return false; > #endif > } > > @@ -7826,8 +7842,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, > for_each_cpu_and(i, sched_group_span(group), env->cpus) { > struct rq *rq = cpu_rq(i); > > - if (env->flags & LBF_NOHZ_STATS) > - update_nohz_stats(rq); > + if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq)) > + env->flags |= LBF_NOHZ_AGAIN; > > /* Bias balancing toward cpus of our domain */ > if (local_group) > @@ -7979,18 +7995,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd > struct sg_lb_stats *local = &sds->local_stat; > struct sg_lb_stats tmp_sgs; > int load_idx, prefer_sibling = 0; > + int has_blocked = READ_ONCE(nohz.has_blocked); > bool overload = false; > > if (child && child->flags & SD_PREFER_SIBLING) > prefer_sibling = 1; > > #ifdef CONFIG_NO_HZ_COMMON > - if (env->idle == CPU_NEWLY_IDLE) { > + if (env->idle == CPU_NEWLY_IDLE && has_blocked) > env->flags |= LBF_NOHZ_STATS; > - > - if (cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) > - nohz.next_stats = jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD); > - } > #endif > > load_idx = get_sd_load_idx(env->sd, env->idle); > @@ -8046,6 +8059,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd > sg = sg->next; > } while (sg != env->sd->groups); > > +#ifdef CONFIG_NO_HZ_COMMON > + if ((env->flags & LBF_NOHZ_AGAIN) && > + cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) { > + > + WRITE_ONCE(nohz.next_blocked, > + jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD)); > + } > +#endif > + > if (env->sd->flags & SD_NUMA) > env->fbq_type = fbq_classify_group(&sds->busiest_stat); > > @@ -9069,6 +9091,8 @@ static void nohz_balancer_kick(struct rq *rq) > struct sched_domain *sd; > int nr_busy, i, cpu = rq->cpu; > unsigned int flags = 0; > + unsigned long has_blocked = READ_ONCE(nohz.has_blocked); > + unsigned long next = READ_ONCE(nohz.next_blocked); > > if (unlikely(rq->idle_balance)) > return; > @@ -9086,7 +9110,7 @@ static void nohz_balancer_kick(struct rq *rq) > if (likely(!atomic_read(&nohz.nr_cpus))) > return; > > - if (time_after(now, nohz.next_stats)) > + if (time_after(now, next) && has_blocked) > flags = NOHZ_STATS_KICK; > > if (time_before(now, nohz.next_balance)) > @@ -9207,13 +9231,15 @@ void nohz_balance_enter_idle(int cpu) > if (!housekeeping_cpu(cpu, HK_FLAG_SCHED)) > return; > > + rq->has_blocked_load = 1; > + > if (rq->nohz_tick_stopped) > - return; > + goto out; > > /* > * If we're a completely isolated CPU, we don't play. > */ > - if (on_null_domain(cpu_rq(cpu))) > + if (on_null_domain(rq)) > return; > > rq->nohz_tick_stopped = 1; > @@ -9222,6 +9248,13 @@ void nohz_balance_enter_idle(int cpu) > atomic_inc(&nohz.nr_cpus); > > set_cpu_sd_state_idle(cpu); > + > +out: > + /* > + * Each time a cpu enter idle, we assume that it has blocked load and > + * enable the periodic update of the load of idle cpus > + */ > + WRITE_ONCE(nohz.has_blocked, 1); > } > #else > static inline void nohz_balancer_kick(struct rq *rq) { } > @@ -9374,6 +9407,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) > > SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK); > > + /* > + * We assume there will be no idle load after this update and clear > + * the stats state. If a cpu enters idle in the mean time, it will > + * set the stats state and trig another update of idle load. > + * Because a cpu that becomes idle, is added to idle_cpus_mask before > + * setting the stats state, we are sure to not clear the state and not > + * check the load of an idle cpu. > + */ > + WRITE_ONCE(nohz.has_blocked, 0); > + > for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { > if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) > continue; > @@ -9383,11 +9426,16 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) > * work being done for other cpus. Next load > * balancing owner will pick it up. > */ > - if (need_resched()) > - break; > + if (need_resched()) { > + has_blocked_load = true; > + goto abort; > + } > > rq = cpu_rq(balance_cpu); > > + update_blocked_averages(rq->cpu); > + has_blocked_load |= rq->has_blocked_load; > + > /* > * If time for next balance is due, > * do the balance. > @@ -9400,7 +9448,6 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) > cpu_load_update_idle(rq); > rq_unlock_irq(rq, &rf); > > - update_blocked_averages(rq->cpu); > if (flags & NOHZ_BALANCE_KICK) > rebalance_domains(rq, CPU_IDLE); > } > @@ -9415,7 +9462,13 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) > if (flags & NOHZ_BALANCE_KICK) > rebalance_domains(this_rq, CPU_IDLE); > > - nohz.next_stats = next_stats; > + WRITE_ONCE(nohz.next_blocked, > + now + msecs_to_jiffies(LOAD_AVG_PERIOD)); > + > +abort: > + /* There is still blocked load, enable periodic update */ > + if (has_blocked_load) > + WRITE_ONCE(nohz.has_blocked, 1); > > /* > * next_balance will be updated only when there is a need. > @@ -10046,6 +10099,7 @@ __init void init_sched_fair_class(void) > > #ifdef CONFIG_NO_HZ_COMMON > nohz.next_balance = jiffies; > + nohz.next_blocked = jiffies; > zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); > #endif > #endif /* SMP */ > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h > index e200045..ad9b929 100644 > --- a/kernel/sched/sched.h > +++ b/kernel/sched/sched.h > @@ -723,6 +723,7 @@ struct rq { > #ifdef CONFIG_SMP > unsigned long last_load_update_tick; > unsigned long last_blocked_load_update_tick; > + unsigned int has_blocked_load; > #endif /* CONFIG_SMP */ > unsigned int nohz_tick_stopped; > atomic_t nohz_flags; > -- > 2.7.4 >