Date: Fri, 28 Apr 2017 16:38:45 -0400
From: Tejun Heo
To: Vincent Guittot
Cc: Ingo Molnar, Peter Zijlstra, linux-kernel, Linus Torvalds,
    Mike Galbraith, Paul Turner, Chris Mason, kernel-team@fb.com
Subject: Re: [PATCH 2/2] sched/fair: Always propagate runnable_load_avg
Message-ID: <20170428203845.GA22354@htj.duckdns.org>
References: <20170424201344.GA14169@wtj.duckdns.org>
 <20170424201444.GC14169@wtj.duckdns.org>
 <20170426225202.GC11348@wtj.duckdns.org>
 <20170428203347.GC19364@htj.duckdns.org>
In-Reply-To: <20170428203347.GC19364@htj.duckdns.org>

Here's the debug patch.  The debug condition triggers when the load
balancer picks a group w/o more than one schbench thread on any CPU
over a group w/ such a CPU.

/sys/module/fair/parameters/dbg_odd_cnt: resettable counter of how many
					 times the condition triggered
/sys/module/fair/parameters/dbg_odd_nth: dump group states on every Nth
					 occurrence via trace_printk()

The load / weight values are printed scaled so that NICE_0_LOAD reads
as 1.000.

Thanks.

---
 kernel/sched/fair.c |  160 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 159 insertions(+), 1 deletion(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -32,11 +32,18 @@
 
 #include
 #include
 #include
+#include
 #include
 
 #include "sched.h"
 
+static unsigned long dbg_odd_nth;
+static unsigned long dbg_odd_cnt;
+
+module_param(dbg_odd_nth, ulong, 0644);
+module_param(dbg_odd_cnt, ulong, 0644);
+
 /*
  * Targeted preemption latency for CPU-bound tasks:
  *
@@ -7413,6 +7420,149 @@ static inline void update_sg_lb_stats(st
 	sgs->group_type = group_classify(group, sgs);
 }
 
+static int count_schb(struct rq *rq)
+{
+	unsigned long flags;
+	struct task_struct *p;
+	int cnt = 0;
+
+	raw_spin_lock_irqsave(&rq->lock, flags);
+
+	list_for_each_entry(p, &rq->cfs_tasks, se.group_node)
+		if (!strncmp(p->comm, "schbench", 8))
+			cnt++;
+
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+	return cnt;
+}
+
+static bool sg_has_two_schb(struct sched_group *sg)
+{
+	int cpu;
+
+	for_each_cpu(cpu, sched_group_cpus(sg))
+		if (count_schb(cpu_rq(cpu)) >= 2)
+			return true;
+	return false;
+}
+
+static DEFINE_PER_CPU(char [PAGE_SIZE], odd_buf);
+
+#define lbw(x) (int)((x) / NICE_0_LOAD), (int)(((x) % NICE_0_LOAD) * 1000 / NICE_0_LOAD)
+#define lba(x) (int)((scale_load(x)) / NICE_0_LOAD), (int)(((scale_load(x)) % NICE_0_LOAD) * 1000 / NICE_0_LOAD)
+
+static int odd_append_se(struct sched_entity *se, const char *postfix,
+			 int cnt, char *buf, size_t size)
+{
+#define odd_append(fmt, args...) do {					\
+	cnt += scnprintf(buf + cnt, size - cnt, fmt, ##args);		\
+	cnt = min_t(int, cnt, size);					\
+} while (0)
+
+	if (entity_is_task(se)) {
+		struct task_struct *task = task_of(se);
+		odd_append(" %s(%d%s)", task->comm, task->pid, postfix);
+	} else {
+		char nbuf[64];
+
+		cgroup_name(se->my_q->tg->css.cgroup, nbuf, sizeof(nbuf));
+		odd_append(" %s(%s)", nbuf, postfix);
+	}
+	odd_append(":w=%d.%03d,l=%d.%03d,u=%d.%03d",
+		   lbw(se->load.weight),
+		   lba(se->avg.load_avg),
+		   lba(se->avg.util_avg));
+
+	return cnt;
+}
+
+static void dbg_odd_dump(const char *pref,
+			 struct sched_group *sg, struct sg_lb_stats *sgs)
+{
+	int cpu;
+
+	trace_printk("%sgrp=%*pbl w=%u avg=%d.%03d grp=%d.%03d sum=%d.%03d pertask=%d.%03d\n", pref,
+		     cpumask_pr_args(sched_group_cpus(sg)), sg->group_weight,
+		     lba(sgs->avg_load), lba(sgs->group_load),
+		     lba(sgs->sum_weighted_load), lba(sgs->load_per_task));
+	trace_printk("%sgcap=%d.%03d gutil=%d.%03d run=%u idle=%u gwt=%u type=%d nocap=%d\n",
+		     pref,
+		     lba(sgs->group_capacity), lba(sgs->group_util),
+		     sgs->sum_nr_running, sgs->idle_cpus, sgs->group_weight,
+		     sgs->group_type, sgs->group_no_capacity);
+
+	for_each_cpu(cpu, sched_group_cpus(sg)) {
+		struct task_group *tg;
+		unsigned long flags;
+
+		trace_printk("%sCPU%03d: run=%u schb=%d\n", pref, cpu,
+			     cpu_rq(cpu)->nr_running, count_schb(cpu_rq(cpu)));
+
+		raw_spin_lock_irqsave(&cpu_rq(cpu)->lock, flags);
+
+		list_for_each_entry_rcu(tg, &task_groups, list) {
+			struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
+			char qname[32] = "root";
+			int depth = 0;
+			long tg_weight = 0, tg_shares = 0;
+			struct sched_entity *se;
+			char *buf = per_cpu_ptr(odd_buf, cpu);
+			int cnt;
+
+			if (!cfs_rq->nr_running)
+				continue;
+
+			if (cfs_rq->tg) {
+				cgroup_name(cfs_rq->tg->css.cgroup, qname, sizeof(qname));
+				if (cfs_rq->tg->se[cpu])
+					depth = cfs_rq->tg->se[cpu]->depth;
+				tg_weight = atomic_long_read(&cfs_rq->tg->load_avg);
+				tg_shares = cfs_rq->tg->shares;
+			}
+
+			trace_printk("%sQ%03d-%s@%d: w=%d.%03d,l=%d.%03d,u=%d.%03d,r=%d.%03d run=%u hrun=%u tgs=%d.%03d tgw=%d.%03d\n",
+				     pref, cpu, qname, depth,
+				     lbw(cfs_rq->load.weight),
+				     lba(cfs_rq->avg.load_avg),
+				     lba(cfs_rq->avg.util_avg),
+				     lba(cfs_rq->runnable_load_avg),
+				     cfs_rq->nr_running, cfs_rq->h_nr_running,
+				     lbw(tg_shares),
+				     lba(tg_weight));
+
+			buf[0] = '\0';
+			cnt = 0;
+
+			if (cfs_rq->curr)
+				cnt = odd_append_se(cfs_rq->curr, "C", cnt, buf, PAGE_SIZE);
+
+			for (se = __pick_first_entity(cfs_rq); se;
+			     se = __pick_next_entity(se))
+				cnt = odd_append_se(se, "", cnt, buf, PAGE_SIZE);
+
+			trace_printk("%sQ%03d-%s@%d: %s\n",
+				     pref, cpu, qname, depth, buf);
+		}
+
+		raw_spin_unlock_irqrestore(&cpu_rq(cpu)->lock, flags);
+	}
+}
+
+/* a has >= 2 dts, b doesn't */
+static void dbg_odd(struct lb_env *env,
+		    struct sched_group *sga, struct sg_lb_stats *sgsa,
+		    struct sched_group *sgb, struct sg_lb_stats *sgsb)
+{
+	if (dbg_odd_nth && (dbg_odd_cnt++ % dbg_odd_nth))
+		return;
+
+	trace_printk("odd: dst=%d idle=%d brk=%u lbtgt=%*pbl type=%d\n",
+		     env->dst_cpu, env->idle, env->loop_break,
+		     cpumask_pr_args(env->cpus), env->fbq_type);
+	dbg_odd_dump("A: ", sga, sgsa);
+	dbg_odd_dump("B: ", sgb, sgsb);
+}
+
 /**
  * update_sd_pick_busiest - return 1 on busiest group
  * @env: The load balancing environment.
@@ -7432,6 +7582,8 @@ static bool update_sd_pick_busiest(struc
 			   struct sg_lb_stats *sgs)
 {
 	struct sg_lb_stats *busiest = &sds->busiest_stat;
+	bool busiest_has_two = sds->busiest && sg_has_two_schb(sds->busiest);
+	bool sg_has_two = sg_has_two_schb(sg);
 
 	if (sgs->group_type > busiest->group_type)
 		return true;
@@ -7439,8 +7591,14 @@ static bool update_sd_pick_busiest(struc
 	if (sgs->group_type < busiest->group_type)
 		return false;
 
-	if (sgs->avg_load <= busiest->avg_load)
+	if (sgs->avg_load <= busiest->avg_load) {
+		if (sg_has_two && !busiest_has_two)
+			dbg_odd(env, sg, sgs, sds->busiest, busiest);
 		return false;
+	}
+
+	if (!sg_has_two && busiest_has_two)
+		dbg_odd(env, sds->busiest, busiest, sg, sgs);
 
 	if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
 		goto asym_packing;
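
For reference, here is a minimal standalone sketch (not part of the patch) of
the fixed-point formatting that the lbw()/lba() helpers above rely on, i.e.
splitting a load value into an integer and a milli part so that NICE_0_LOAD
prints as 1.000.  The NICE_0_LOAD value below is an assumption picked for
illustration; the in-kernel value depends on the load-resolution config.

/* userspace illustration only; NICE_0_LOAD here is an assumed constant */
#include <stdio.h>

#define NICE_0_LOAD 1024UL	/* assumption; kernel value varies with config */

/* print a load as "integer.milli" so that NICE_0_LOAD comes out as 1.000 */
static void print_load(const char *name, unsigned long load)
{
	printf("%s=%lu.%03lu\n", name,
	       load / NICE_0_LOAD,
	       (load % NICE_0_LOAD) * 1000 / NICE_0_LOAD);
}

int main(void)
{
	print_load("nice0", NICE_0_LOAD);	/* prints nice0=1.000 */
	print_load("half", NICE_0_LOAD / 2);	/* prints half=0.500 */
	return 0;
}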