linux-kernel.vger.kernel.org archive mirror
From: Tejun Heo <tj@kernel.org>
To: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Ingo Molnar <mingo@redhat.com>,
	Peter Zijlstra <peterz@infradead.org>,
	linux-kernel <linux-kernel@vger.kernel.org>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Mike Galbraith <efault@gmx.de>, Paul Turner <pjt@google.com>,
	Chris Mason <clm@fb.com>,
	kernel-team@fb.com
Subject: Re: [PATCH 2/2] sched/fair: Always propagate runnable_load_avg
Date: Fri, 28 Apr 2017 16:38:45 -0400
Message-ID: <20170428203845.GA22354@htj.duckdns.org>
In-Reply-To: <20170428203347.GC19364@htj.duckdns.org>

Here's the debug patch.

The debug condition triggers when the load balancer picks a group that
doesn't have more than one schbench thread on any CPU over a group that
does.

 /sys/module/fair/parameters/dbg_odd_cnt: resettable counter
 /sys/module/fair/parameters/dbg_odd_nth: dump group states on Nth
					  occurrence via trace_printk()
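
For reference, here's a minimal userspace sketch (not part of the patch)
of how the two knobs might be poked from a test harness; it assumes the
patched kernel is booted and that the trace_printk() output is read from
the usual tracing buffer (e.g. /sys/kernel/debug/tracing/trace):

#include <stdio.h>
#include <stdlib.h>

static void write_param(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		exit(1);
	}
	fputs(val, f);
	fclose(f);
}

int main(void)
{
	char buf[64];
	FILE *f;

	/* reset the counter and dump group states on every 100th odd pick */
	write_param("/sys/module/fair/parameters/dbg_odd_cnt", "0");
	write_param("/sys/module/fair/parameters/dbg_odd_nth", "100");

	/* ... run the schbench workload, then see how often the condition hit ... */
	f = fopen("/sys/module/fair/parameters/dbg_odd_cnt", "r");
	if (f && fgets(buf, sizeof(buf), f))
		printf("odd picks counted: %s", buf);
	if (f)
		fclose(f);
	return 0;
}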

Loads and weights are printed out scaled so that NICE_0_LOAD reads as 1.000.
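
For example, with the lbw() macro from the patch below, and NICE_0_LOAD
assumed to be 1024 purely for illustration (the in-kernel value depends on
the fixed-point config), the formatting works out like this:

#include <stdio.h>

#define NICE_0_LOAD	1024UL	/* assumed for illustration only */

/* split a fixed-point load into integer and millesimal parts, as lbw() does */
#define lbw(x)	(int)((x) / NICE_0_LOAD), \
		(int)(((x) % NICE_0_LOAD) * 1000 / NICE_0_LOAD)

int main(void)
{
	unsigned long loads[] = { 1024, 512, 1536, 3072 };
	unsigned int i;

	for (i = 0; i < sizeof(loads) / sizeof(loads[0]); i++)
		printf("%4lu -> %d.%03d\n", loads[i], lbw(loads[i]));
	/* prints 1.000, 0.500, 1.500 and 3.000 respectively */
	return 0;
}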

Thanks.
---
 kernel/sched/fair.c |  160 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 159 insertions(+), 1 deletion(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -32,11 +32,18 @@
 #include <linux/mempolicy.h>
 #include <linux/migrate.h>
 #include <linux/task_work.h>
+#include <linux/moduleparam.h>
 
 #include <trace/events/sched.h>
 
 #include "sched.h"
 
+static unsigned long dbg_odd_nth;
+static unsigned long dbg_odd_cnt;
+
+module_param(dbg_odd_nth, ulong, 0644);
+module_param(dbg_odd_cnt, ulong, 0644);
+
 /*
  * Targeted preemption latency for CPU-bound tasks:
  *
@@ -7413,6 +7420,149 @@ static inline void update_sg_lb_stats(st
 	sgs->group_type = group_classify(group, sgs);
 }
 
+static int count_schb(struct rq *rq)
+{
+	unsigned long flags;
+	struct task_struct *p;
+	int cnt = 0;
+
+	raw_spin_lock_irqsave(&rq->lock, flags);
+
+	list_for_each_entry(p, &rq->cfs_tasks, se.group_node)
+		if (!strncmp(p->comm, "schbench", 8))
+			cnt++;
+
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+	return cnt;
+}
+
+static bool sg_has_two_schb(struct sched_group *sg)
+{
+	int cpu;
+
+	for_each_cpu(cpu, sched_group_cpus(sg))
+		if (count_schb(cpu_rq(cpu)) >= 2)
+			return true;
+	return false;
+}
+
+static DEFINE_PER_CPU(char [PAGE_SIZE], odd_buf);
+
+#define lbw(x)	(int)((x) / NICE_0_LOAD), (int)(((x) % NICE_0_LOAD) * 1000 / NICE_0_LOAD)
+#define lba(x)	(int)((scale_load(x)) / NICE_0_LOAD), (int)(((scale_load(x)) % NICE_0_LOAD) * 1000 / NICE_0_LOAD)
+
+static int odd_append_se(struct sched_entity *se, const char *postfix,
+			 int cnt, char *buf, size_t size)
+{
+#define odd_append(fmt, args...)	do {				\
+	cnt += scnprintf(buf + cnt, size - cnt, fmt, ##args);		\
+	cnt = min_t(int, cnt, size);					\
+} while (0)
+
+	if (entity_is_task(se)) {
+		struct task_struct *task = task_of(se);
+		odd_append(" %s(%d%s)", task->comm, task->pid, postfix);
+	} else {
+		char nbuf[64];
+		cgroup_name(se->my_q->tg->css.cgroup, nbuf, sizeof(nbuf));
+		odd_append(" %s(%s)", nbuf, postfix);
+	}
+	odd_append(":w=%d.%03d,l=%d.%03d,u=%d.%03d",
+		   lbw(se->load.weight),
+		   lba(se->avg.load_avg),
+		   lba(se->avg.util_avg));
+
+	return cnt;
+}
+
+static void dbg_odd_dump(const char *pref,
+			 struct sched_group *sg, struct sg_lb_stats *sgs)
+{
+	int cpu;
+
+	trace_printk("%sgrp=%*pbl w=%u avg=%d.%03d grp=%d.%03d sum=%d.%03d pertask=%d.%03d\n", pref,
+		     cpumask_pr_args(sched_group_cpus(sg)), sg->group_weight,
+		     lba(sgs->avg_load), lba(sgs->group_load),
+		     lba(sgs->sum_weighted_load), lba(sgs->load_per_task));
+	trace_printk("%sgcap=%d.%03d gutil=%d.%03d run=%u idle=%u gwt=%u type=%d nocap=%d\n",
+		     pref,
+		     lba(sgs->group_capacity), lba(sgs->group_util),
+		     sgs->sum_nr_running, sgs->idle_cpus, sgs->group_weight,
+		     sgs->group_type, sgs->group_no_capacity);
+
+	for_each_cpu(cpu, sched_group_cpus(sg)) {
+		struct task_group *tg;
+		unsigned long flags;
+
+		trace_printk("%sCPU%03d: run=%u schb=%d\n", pref, cpu,
+			     cpu_rq(cpu)->nr_running, count_schb(cpu_rq(cpu)));
+
+		raw_spin_lock_irqsave(&cpu_rq(cpu)->lock, flags);
+
+		list_for_each_entry_rcu(tg, &task_groups, list) {
+			struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
+			char qname[32] = "root";
+			int depth = 0;
+			long tg_weight = 0, tg_shares = 0;
+			struct sched_entity *se;
+			char *buf = per_cpu_ptr(odd_buf, cpu);
+			int cnt;
+
+			if (!cfs_rq->nr_running)
+				continue;
+
+			if (cfs_rq->tg) {
+				cgroup_name(cfs_rq->tg->css.cgroup, qname, sizeof(qname));
+				if (cfs_rq->tg->se[cpu])
+					depth = cfs_rq->tg->se[cpu]->depth;
+				tg_weight = atomic_long_read(&cfs_rq->tg->load_avg);
+				tg_shares = cfs_rq->tg->shares;
+			}
+
+			trace_printk("%sQ%03d-%s@%d: w=%d.%03d,l=%d.%03d,u=%d.%03d,r=%d.%03d run=%u hrun=%u tgs=%d.%03d tgw=%d.%03d\n",
+				     pref, cpu, qname, depth,
+				     lbw(cfs_rq->load.weight),
+				     lba(cfs_rq->avg.load_avg),
+				     lba(cfs_rq->avg.util_avg),
+				     lba(cfs_rq->runnable_load_avg),
+				     cfs_rq->nr_running, cfs_rq->h_nr_running,
+				     lbw(tg_shares),
+				     lba(tg_weight));
+
+			buf[0] = '\0';
+			cnt = 0;
+
+			if (cfs_rq->curr)
+				cnt = odd_append_se(cfs_rq->curr, "C", cnt, buf, PAGE_SIZE);
+
+			for (se = __pick_first_entity(cfs_rq); se;
+			     se = __pick_next_entity(se))
+				cnt = odd_append_se(se, "", cnt, buf, PAGE_SIZE);
+
+			trace_printk("%sQ%03d-%s@%d: %s\n",
+				     pref, cpu, qname, depth, buf);
+		}
+
+		raw_spin_unlock_irqrestore(&cpu_rq(cpu)->lock, flags);
+	}
+}
+
+/* sga has a CPU running >= 2 schbench threads, sgb doesn't */
+static void dbg_odd(struct lb_env *env,
+		    struct sched_group *sga, struct sg_lb_stats *sgsa,
+		    struct sched_group *sgb, struct sg_lb_stats *sgsb)
+{
+	if (dbg_odd_nth && (dbg_odd_cnt++ % dbg_odd_nth))
+		return;
+
+	trace_printk("odd: dst=%d idle=%d brk=%u lbtgt=%*pbl type=%d\n",
+		     env->dst_cpu, env->idle, env->loop_break,
+		     cpumask_pr_args(env->cpus), env->fbq_type);
+	dbg_odd_dump("A: ", sga, sgsa);
+	dbg_odd_dump("B: ", sgb, sgsb);
+}
+
 /**
  * update_sd_pick_busiest - return 1 on busiest group
  * @env: The load balancing environment.
@@ -7432,6 +7582,8 @@ static bool update_sd_pick_busiest(struc
 				   struct sg_lb_stats *sgs)
 {
 	struct sg_lb_stats *busiest = &sds->busiest_stat;
+	bool busiest_has_two = sds->busiest && sg_has_two_schb(sds->busiest);
+	bool sg_has_two = sg_has_two_schb(sg);
 
 	if (sgs->group_type > busiest->group_type)
 		return true;
@@ -7439,8 +7591,14 @@ static bool update_sd_pick_busiest(struc
 	if (sgs->group_type < busiest->group_type)
 		return false;
 
-	if (sgs->avg_load <= busiest->avg_load)
+	if (sgs->avg_load <= busiest->avg_load) {
+		if (sg_has_two && !busiest_has_two)
+			dbg_odd(env, sg, sgs, sds->busiest, busiest);
 		return false;
+	}
+
+	if (!sg_has_two && busiest_has_two)
+		dbg_odd(env, sds->busiest, busiest, sg, sgs);
 
 	if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
 		goto asym_packing;

Thread overview: 69+ messages
2017-04-24 20:13 [RFC PATCHSET] sched/fair: fix load balancer behavior when cgroup is in use Tejun Heo
2017-04-24 20:14 ` [PATCH 1/2] sched/fair: Fix how load gets propagated from cfs_rq to its sched_entity Tejun Heo
2017-04-24 21:33   ` [PATCH v2 " Tejun Heo
2017-05-03 18:00     ` Peter Zijlstra
2017-05-03 21:45       ` Tejun Heo
2017-05-04  5:51         ` Peter Zijlstra
2017-05-04  6:21           ` Peter Zijlstra
2017-05-04  9:49             ` Dietmar Eggemann
2017-05-04 10:57               ` Peter Zijlstra
2017-05-04 17:39               ` Tejun Heo
2017-05-05 10:36                 ` Dietmar Eggemann
2017-05-04 10:26       ` Vincent Guittot
2017-04-25  8:35   ` [PATCH " Vincent Guittot
2017-04-25 18:12     ` Tejun Heo
2017-04-26 16:51       ` Vincent Guittot
2017-04-26 22:40         ` Tejun Heo
2017-04-27  7:00           ` Vincent Guittot
2017-05-01 14:17         ` Peter Zijlstra
2017-05-01 14:52           ` Peter Zijlstra
2017-05-01 21:56           ` Tejun Heo
2017-05-02  8:19             ` Peter Zijlstra
2017-05-02  8:30               ` Peter Zijlstra
2017-05-02 20:00                 ` Tejun Heo
2017-05-03  9:10                   ` Peter Zijlstra
2017-04-26 16:14   ` Vincent Guittot
2017-04-26 22:27     ` Tejun Heo
2017-04-27  8:59       ` Vincent Guittot
2017-04-28 17:46         ` Tejun Heo
2017-05-02  7:20           ` Vincent Guittot
2017-04-24 20:14 ` [PATCH 2/2] sched/fair: Always propagate runnable_load_avg Tejun Heo
2017-04-25  8:46   ` Vincent Guittot
2017-04-25  9:05     ` Vincent Guittot
2017-04-25 12:59       ` Vincent Guittot
2017-04-25 18:49         ` Tejun Heo
2017-04-25 20:49           ` Tejun Heo
2017-04-25 21:15             ` Chris Mason
2017-04-25 21:08           ` Tejun Heo
2017-04-26 10:21             ` Vincent Guittot
2017-04-27  0:30               ` Tejun Heo
2017-04-27  8:28                 ` Vincent Guittot
2017-04-28 16:14                   ` Tejun Heo
2017-05-02  6:56                     ` Vincent Guittot
2017-05-02 20:56                       ` Tejun Heo
2017-05-03  7:25                         ` Vincent Guittot
2017-05-03  7:54                           ` Vincent Guittot
2017-04-26 18:12   ` Vincent Guittot
2017-04-26 22:52     ` Tejun Heo
2017-04-27  8:29       ` Vincent Guittot
2017-04-28 20:33         ` Tejun Heo
2017-04-28 20:38           ` Tejun Heo [this message]
2017-05-01 15:56           ` Peter Zijlstra
2017-05-02 22:01             ` Tejun Heo
2017-05-02  7:18           ` Vincent Guittot
2017-05-02 13:26             ` Vincent Guittot
2017-05-02 22:37               ` Tejun Heo
2017-05-02 21:50             ` Tejun Heo
2017-05-03  7:34               ` Vincent Guittot
2017-05-03  9:37                 ` Peter Zijlstra
2017-05-03 10:37                   ` Vincent Guittot
2017-05-03 13:09                     ` Peter Zijlstra
2017-05-03 21:49                       ` Tejun Heo
2017-05-04  8:19                         ` Vincent Guittot
2017-05-04 17:43                           ` Tejun Heo
2017-05-04 19:02                             ` Vincent Guittot
2017-05-04 19:04                               ` Tejun Heo
2017-04-24 21:35 ` [PATCH 3/2] sched/fair: Skip __update_load_avg() on cfs_rq sched_entities Tejun Heo
2017-04-24 21:48   ` Peter Zijlstra
2017-04-24 22:54     ` Tejun Heo
2017-04-25 21:09   ` Tejun Heo
