From: Tejun Heo <tj@kernel.org>
To: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Ingo Molnar <mingo@redhat.com>,
Peter Zijlstra <peterz@infradead.org>,
linux-kernel <linux-kernel@vger.kernel.org>,
Linus Torvalds <torvalds@linux-foundation.org>,
Mike Galbraith <efault@gmx.de>, Paul Turner <pjt@google.com>,
Chris Mason <clm@fb.com>,
kernel-team@fb.com
Subject: Re: [PATCH 2/2] sched/fair: Always propagate runnable_load_avg
Date: Fri, 28 Apr 2017 16:38:45 -0400 [thread overview]
Message-ID: <20170428203845.GA22354@htj.duckdns.org> (raw)
In-Reply-To: <20170428203347.GC19364@htj.duckdns.org>
Here's the debug patch.
The debug condition triggers when the load balancer picks a group that has
no CPU running more than one schbench thread over a group that does.
/sys/module/fair/parameters/dbg_odd_cnt: resettable counter
/sys/module/fair/parameters/dbg_odd_nth: dump group states on Nth
occurrence via trace_printk()
The load / weights are printed out so that NICE_0_LOAD is 1.000.
Thanks.
---
kernel/sched/fair.c | 160 +++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 159 insertions(+), 1 deletion(-)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -32,11 +32,18 @@
#include <linux/mempolicy.h>
#include <linux/migrate.h>
#include <linux/task_work.h>
+#include <linux/moduleparam.h>
#include <trace/events/sched.h>
#include "sched.h"
+/*
+ * Debug knobs exposed under /sys/module/fair/parameters/:
+ * dbg_odd_nth - dump group state on every Nth odd pick (0 disables dumping
+ *               entirely, since the trigger check requires it non-zero);
+ * dbg_odd_cnt - running count of odd picks, writable so it can be reset.
+ */
+static unsigned long dbg_odd_nth;
+static unsigned long dbg_odd_cnt;
+
+module_param(dbg_odd_nth, ulong, 0644);
+module_param(dbg_odd_cnt, ulong, 0644);
+
/*
* Targeted preemption latency for CPU-bound tasks:
*
@@ -7413,6 +7420,149 @@ static inline void update_sg_lb_stats(st
sgs->group_type = group_classify(group, sgs);
}
+/*
+ * Count the tasks named "schbench" queued on @rq's cfs_tasks list.
+ * Takes rq->lock so the list walk is stable; debug-only, so the extra
+ * locking cost on every group scan is accepted.
+ */
+static int count_schb(struct rq *rq)
+{
+ unsigned long flags;
+ struct task_struct *p;
+ int cnt = 0;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+
+ /* comm comparison is prefix-only (first 8 chars) — matches "schbench" */
+ list_for_each_entry(p, &rq->cfs_tasks, se.group_node)
+ if (!strncmp(p->comm, "schbench", 8))
+ cnt++;
+
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+ return cnt;
+}
+
+/*
+ * True iff any single CPU in @sg is running two or more schbench threads.
+ * This is the "stacked threads" condition the debug trigger looks for.
+ */
+static bool sg_has_two_schb(struct sched_group *sg)
+{
+ int cpu;
+
+ for_each_cpu(cpu, sched_group_cpus(sg))
+ if (count_schb(cpu_rq(cpu)) >= 2)
+ return true;
+ return false;
+}
+
+/* Per-cpu scratch buffer for assembling one queue's entity list. */
+static DEFINE_PER_CPU(char [PAGE_SIZE], odd_buf);
+
+/*
+ * Print fixed-point loads as "int.frac" scaled so NICE_0_LOAD reads 1.000.
+ * lbw() takes an already-scaled weight; lba() scales a load-avg first.
+ * Each expands to two printf arguments (integer part, milli part).
+ */
+#define lbw(x) (int)((x) / NICE_0_LOAD), (int)(((x) % NICE_0_LOAD) * 1000 / NICE_0_LOAD)
+#define lba(x) (int)((scale_load(x)) / NICE_0_LOAD), (int)(((scale_load(x)) % NICE_0_LOAD) * 1000 / NICE_0_LOAD)
+
+/*
+ * Append a one-entity summary ("name(id):w=..,l=..,u=..") to @buf at
+ * offset @cnt, bounded by @size; returns the new offset.  @postfix tags
+ * the entity (e.g. "C" for the queue's current entity).
+ */
+static int odd_append_se(struct sched_entity *se, const char *postfix,
+ int cnt, char *buf, size_t size)
+{
+/* scnprintf never writes past size; the min_t clamp keeps cnt <= size */
+#define odd_append(fmt, args...) do { \
+ cnt += scnprintf(buf + cnt, size - cnt, fmt, ##args); \
+ cnt = min_t(int, cnt, size); \
+} while (0)
+
+ if (entity_is_task(se)) {
+ struct task_struct *task = task_of(se);
+ odd_append(" %s(%d%s)", task->comm, task->pid, postfix);
+ } else {
+ /* group entity: identify it by its cgroup's name instead */
+ char nbuf[64];
+ cgroup_name(se->my_q->tg->css.cgroup, nbuf, sizeof(nbuf));
+ odd_append(" %s(%s)", nbuf, postfix);
+ }
+ odd_append(":w=%d.%03d,l=%d.%03d,u=%d.%03d",
+ lbw(se->load.weight),
+ lba(se->avg.load_avg),
+ lba(se->avg.util_avg));
+
+ return cnt;
+}
+
+/*
+ * Dump one sched_group's load-balance statistics and, for every CPU in
+ * the group, every non-empty cfs_rq with its queued entities, via
+ * trace_printk().  @pref prefixes each line ("A: "/"B: ") so the two
+ * groups of an odd pick can be told apart in the trace.
+ */
+static void dbg_odd_dump(const char *pref,
+ struct sched_group *sg, struct sg_lb_stats *sgs)
+{
+ int cpu;
+
+ /* group-level stats, loads formatted so NICE_0_LOAD prints as 1.000 */
+ trace_printk("%sgrp=%*pbl w=%u avg=%d.%03d grp=%d.%03d sum=%d.%03d pertask=%d.%03d\n", pref,
+ cpumask_pr_args(sched_group_cpus(sg)), sg->group_weight,
+ lba(sgs->avg_load), lba(sgs->group_load),
+ lba(sgs->sum_weighted_load), lba(sgs->load_per_task));
+ trace_printk("%sgcap=%d.%03d gutil=%d.%03d run=%u idle=%u gwt=%u type=%d nocap=%d\n",
+ pref,
+ lba(sgs->group_capacity), lba(sgs->group_util),
+ sgs->sum_nr_running, sgs->idle_cpus, sgs->group_weight,
+ sgs->group_type, sgs->group_no_capacity);
+
+ for_each_cpu(cpu, sched_group_cpus(sg)) {
+ struct task_group *tg;
+ unsigned long flags;
+
+ trace_printk("%sCPU%03d: run=%u schb=%d\n", pref, cpu,
+ cpu_rq(cpu)->nr_running, count_schb(cpu_rq(cpu)));
+
+ /* rq lock keeps this CPU's queues stable while we walk them */
+ raw_spin_lock_irqsave(&cpu_rq(cpu)->lock, flags);
+
+ /*
+ * NOTE(review): task_groups is walked with _rcu but no explicit
+ * rcu_read_lock() here — presumably acceptable for a throwaway
+ * debug patch; confirm the rq lock context suffices.
+ */
+ list_for_each_entry_rcu(tg, &task_groups, list) {
+ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
+ char qname[32] = "root";
+ int depth = 0;
+ long tg_weight = 0, tg_shares = 0;
+ struct sched_entity *se;
+ /*
+ * NOTE(review): per-cpu array accessed via
+ * per_cpu_ptr(odd_buf, cpu) — confirm this, rather than
+ * per_cpu()/&odd_buf, yields the intended buffer.
+ */
+ char *buf = per_cpu_ptr(odd_buf, cpu);
+ int cnt;
+
+ /* only queues with something on them are interesting */
+ if (!cfs_rq->nr_running)
+ continue;
+
+ if (cfs_rq->tg) {
+ cgroup_name(cfs_rq->tg->css.cgroup, qname, sizeof(qname));
+ if (cfs_rq->tg->se[cpu])
+ depth = cfs_rq->tg->se[cpu]->depth;
+ tg_weight = atomic_long_read(&cfs_rq->tg->load_avg);
+ tg_shares = cfs_rq->tg->shares;
+ }
+
+ /* one line of per-queue weights/averages ... */
+ trace_printk("%sQ%03d-%s@%d: w=%d.%03d,l=%d.%03d,u=%d.%03d,r=%d.%03d run=%u hrun=%u tgs=%d.%03d tgw=%d.%03d\n",
+ pref, cpu, qname, depth,
+ lbw(cfs_rq->load.weight),
+ lba(cfs_rq->avg.load_avg),
+ lba(cfs_rq->avg.util_avg),
+ lba(cfs_rq->runnable_load_avg),
+ cfs_rq->nr_running, cfs_rq->h_nr_running,
+ lbw(tg_shares),
+ lba(tg_weight));
+
+ /* ... then one line listing its entities: curr first ("C"), */
+ /* then the rbtree in pick order */
+ buf[0] = '\0';
+ cnt = 0;
+
+ if (cfs_rq->curr)
+ cnt = odd_append_se(cfs_rq->curr, "C", cnt, buf, PAGE_SIZE);
+
+ for (se = __pick_first_entity(cfs_rq); se;
+ se = __pick_next_entity(se))
+ cnt = odd_append_se(se, "", cnt, buf, PAGE_SIZE);
+
+ trace_printk("%sQ%03d-%s@%d: %s\n",
+ pref, cpu, qname, depth, buf);
+ }
+
+ raw_spin_unlock_irqrestore(&cpu_rq(cpu)->lock, flags);
+ }
+}
+
+/*
+ * Record one "odd" busiest-group pick: @sga has a CPU stacking >= 2
+ * schbench threads while @sgb does not, yet the balancer preferred the
+ * wrong one.  Counts every occurrence; dumps full state on every
+ * dbg_odd_nth-th one (dumping is off while dbg_odd_nth is 0).
+ */
+static void dbg_odd(struct lb_env *env,
+ struct sched_group *sga, struct sg_lb_stats *sgsa,
+ struct sched_group *sgb, struct sg_lb_stats *sgsb)
+{
+ if (dbg_odd_nth && (dbg_odd_cnt++ % dbg_odd_nth))
+ return;
+
+ trace_printk("odd: dst=%d idle=%d brk=%u lbtgt=%*pbl type=%d\n",
+ env->dst_cpu, env->idle, env->loop_break,
+ cpumask_pr_args(env->cpus), env->fbq_type);
+ dbg_odd_dump("A: ", sga, sgsa);
+ dbg_odd_dump("B: ", sgb, sgsb);
+}
+
/**
* update_sd_pick_busiest - return 1 on busiest group
* @env: The load balancing environment.
@@ -7432,6 +7582,8 @@ static bool update_sd_pick_busiest(struc
struct sg_lb_stats *sgs)
{
struct sg_lb_stats *busiest = &sds->busiest_stat;
+ bool busiest_has_two = sds->busiest && sg_has_two_schb(sds->busiest);
+ bool sg_has_two = sg_has_two_schb(sg);
if (sgs->group_type > busiest->group_type)
return true;
@@ -7439,8 +7591,14 @@ static bool update_sd_pick_busiest(struc
if (sgs->group_type < busiest->group_type)
return false;
- if (sgs->avg_load <= busiest->avg_load)
+ if (sgs->avg_load <= busiest->avg_load) {
+ if (sg_has_two && !busiest_has_two)
+ dbg_odd(env, sg, sgs, sds->busiest, busiest);
return false;
+ }
+
+ if (!sg_has_two && busiest_has_two)
+ dbg_odd(env, sds->busiest, busiest, sg, sgs);
if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
goto asym_packing;
next prev parent reply other threads:[~2017-04-28 20:38 UTC|newest]
Thread overview: 69+ messages / expand[flat|nested] mbox.gz Atom feed top
2017-04-24 20:13 [RFC PATCHSET] sched/fair: fix load balancer behavior when cgroup is in use Tejun Heo
2017-04-24 20:14 ` [PATCH 1/2] sched/fair: Fix how load gets propagated from cfs_rq to its sched_entity Tejun Heo
2017-04-24 21:33 ` [PATCH v2 " Tejun Heo
2017-05-03 18:00 ` Peter Zijlstra
2017-05-03 21:45 ` Tejun Heo
2017-05-04 5:51 ` Peter Zijlstra
2017-05-04 6:21 ` Peter Zijlstra
2017-05-04 9:49 ` Dietmar Eggemann
2017-05-04 10:57 ` Peter Zijlstra
2017-05-04 17:39 ` Tejun Heo
2017-05-05 10:36 ` Dietmar Eggemann
2017-05-04 10:26 ` Vincent Guittot
2017-04-25 8:35 ` [PATCH " Vincent Guittot
2017-04-25 18:12 ` Tejun Heo
2017-04-26 16:51 ` Vincent Guittot
2017-04-26 22:40 ` Tejun Heo
2017-04-27 7:00 ` Vincent Guittot
2017-05-01 14:17 ` Peter Zijlstra
2017-05-01 14:52 ` Peter Zijlstra
2017-05-01 21:56 ` Tejun Heo
2017-05-02 8:19 ` Peter Zijlstra
2017-05-02 8:30 ` Peter Zijlstra
2017-05-02 20:00 ` Tejun Heo
2017-05-03 9:10 ` Peter Zijlstra
2017-04-26 16:14 ` Vincent Guittot
2017-04-26 22:27 ` Tejun Heo
2017-04-27 8:59 ` Vincent Guittot
2017-04-28 17:46 ` Tejun Heo
2017-05-02 7:20 ` Vincent Guittot
2017-04-24 20:14 ` [PATCH 2/2] sched/fair: Always propagate runnable_load_avg Tejun Heo
2017-04-25 8:46 ` Vincent Guittot
2017-04-25 9:05 ` Vincent Guittot
2017-04-25 12:59 ` Vincent Guittot
2017-04-25 18:49 ` Tejun Heo
2017-04-25 20:49 ` Tejun Heo
2017-04-25 21:15 ` Chris Mason
2017-04-25 21:08 ` Tejun Heo
2017-04-26 10:21 ` Vincent Guittot
2017-04-27 0:30 ` Tejun Heo
2017-04-27 8:28 ` Vincent Guittot
2017-04-28 16:14 ` Tejun Heo
2017-05-02 6:56 ` Vincent Guittot
2017-05-02 20:56 ` Tejun Heo
2017-05-03 7:25 ` Vincent Guittot
2017-05-03 7:54 ` Vincent Guittot
2017-04-26 18:12 ` Vincent Guittot
2017-04-26 22:52 ` Tejun Heo
2017-04-27 8:29 ` Vincent Guittot
2017-04-28 20:33 ` Tejun Heo
2017-04-28 20:38 ` Tejun Heo [this message]
2017-05-01 15:56 ` Peter Zijlstra
2017-05-02 22:01 ` Tejun Heo
2017-05-02 7:18 ` Vincent Guittot
2017-05-02 13:26 ` Vincent Guittot
2017-05-02 22:37 ` Tejun Heo
2017-05-02 21:50 ` Tejun Heo
2017-05-03 7:34 ` Vincent Guittot
2017-05-03 9:37 ` Peter Zijlstra
2017-05-03 10:37 ` Vincent Guittot
2017-05-03 13:09 ` Peter Zijlstra
2017-05-03 21:49 ` Tejun Heo
2017-05-04 8:19 ` Vincent Guittot
2017-05-04 17:43 ` Tejun Heo
2017-05-04 19:02 ` Vincent Guittot
2017-05-04 19:04 ` Tejun Heo
2017-04-24 21:35 ` [PATCH 3/2] sched/fair: Skip __update_load_avg() on cfs_rq sched_entities Tejun Heo
2017-04-24 21:48 ` Peter Zijlstra
2017-04-24 22:54 ` Tejun Heo
2017-04-25 21:09 ` Tejun Heo
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20170428203845.GA22354@htj.duckdns.org \
--to=tj@kernel.org \
--cc=clm@fb.com \
--cc=efault@gmx.de \
--cc=kernel-team@fb.com \
--cc=linux-kernel@vger.kernel.org \
--cc=mingo@redhat.com \
--cc=peterz@infradead.org \
--cc=pjt@google.com \
--cc=torvalds@linux-foundation.org \
--cc=vincent.guittot@linaro.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).