From: Tejun Heo <tj@kernel.org>
To: Vincent Guittot <vincent.guittot@linaro.org>
Cc: Ingo Molnar <mingo@redhat.com>,
	Peter Zijlstra <peterz@infradead.org>,
	linux-kernel <linux-kernel@vger.kernel.org>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Mike Galbraith <efault@gmx.de>, Paul Turner <pjt@google.com>,
	Chris Mason <clm@fb.com>,
	kernel-team@fb.com
Subject: Re: [PATCH 2/2] sched/fair: Always propagate runnable_load_avg
Date: Fri, 28 Apr 2017 16:38:45 -0400
Message-ID: <20170428203845.GA22354@htj.duckdns.org>
In-Reply-To: <20170428203347.GC19364@htj.duckdns.org>

Here's the debug patch.

The debug condition triggers when the load balancer picks a group that
has no CPU running two or more schbench threads over a group that does.

 /sys/module/fair/parameters/dbg_odd_cnt: resettable counter of odd picks
 /sys/module/fair/parameters/dbg_odd_nth: dump the states of both groups
					  on every Nth occurrence via
					  trace_printk()
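
As a usage illustration only (not part of the patch; a plain echo into the
sysfs files works just as well), arming a dump on every 50th odd occurrence
and clearing the counter could look like the following; the value 50 is an
arbitrary sampling choice:

#include <stdio.h>

/* Hypothetical userspace helper: sample every 50th odd occurrence and
 * reset the counter before a new run. */
static void write_param(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (f) {
		fputs(val, f);
		fclose(f);
	}
}

int main(void)
{
	write_param("/sys/module/fair/parameters/dbg_odd_nth", "50\n");
	write_param("/sys/module/fair/parameters/dbg_odd_cnt", "0\n");
	return 0;
}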

Loads and weights are printed scaled so that NICE_0_LOAD reads as 1.000.
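
To make that scale concrete, here is a stand-alone sketch of the fixed-point
formatting the lbw() macro below uses (lba() additionally applies
scale_load() to its argument first). It assumes the common 64-bit value
NICE_0_LOAD = 1 << 20; the exact value depends on the kernel config:

#include <stdio.h>

/* Assumed for illustration: 64-bit kernel with the extra 10 bits of
 * load resolution. */
#define NICE_0_LOAD	(1UL << 20)

/* Print a load/weight as integer.milli so that NICE_0_LOAD is 1.000. */
static void print_load(unsigned long x)
{
	printf("%d.%03d\n",
	       (int)(x / NICE_0_LOAD),
	       (int)((x % NICE_0_LOAD) * 1000 / NICE_0_LOAD));
}

int main(void)
{
	print_load(NICE_0_LOAD);	/* 1.000 */
	print_load(NICE_0_LOAD / 2);	/* 0.500 */
	print_load(3 * NICE_0_LOAD);	/* 3.000 */
	return 0;
}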

Thanks.
---
 kernel/sched/fair.c |  160 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 159 insertions(+), 1 deletion(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -32,11 +32,18 @@
 #include <linux/mempolicy.h>
 #include <linux/migrate.h>
 #include <linux/task_work.h>
+#include <linux/moduleparam.h>
 
 #include <trace/events/sched.h>
 
 #include "sched.h"
 
+static unsigned long dbg_odd_nth;
+static unsigned long dbg_odd_cnt;
+
+module_param(dbg_odd_nth, ulong, 0644);
+module_param(dbg_odd_cnt, ulong, 0644);
+
 /*
  * Targeted preemption latency for CPU-bound tasks:
  *
@@ -7413,6 +7420,149 @@ static inline void update_sg_lb_stats(st
 	sgs->group_type = group_classify(group, sgs);
 }
 
+static int count_schb(struct rq *rq)
+{
+	unsigned long flags;
+	struct task_struct *p;
+	int cnt = 0;
+
+	raw_spin_lock_irqsave(&rq->lock, flags);
+
+	list_for_each_entry(p, &rq->cfs_tasks, se.group_node)
+		if (!strncmp(p->comm, "schbench", 8))
+			cnt++;
+
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+	return cnt;
+}
+
+static bool sg_has_two_schb(struct sched_group *sg)
+{
+	int cpu;
+
+	for_each_cpu(cpu, sched_group_cpus(sg))
+		if (count_schb(cpu_rq(cpu)) >= 2)
+			return true;
+	return false;
+}
+
+static DEFINE_PER_CPU(char [PAGE_SIZE], odd_buf);
+
+#define lbw(x)	(int)((x) / NICE_0_LOAD), (int)(((x) % NICE_0_LOAD) * 1000 / NICE_0_LOAD)
+#define lba(x)	(int)((scale_load(x)) / NICE_0_LOAD), (int)(((scale_load(x)) % NICE_0_LOAD) * 1000 / NICE_0_LOAD)
+
+static int odd_append_se(struct sched_entity *se, const char *postfix,
+			 int cnt, char *buf, size_t size)
+{
+#define odd_append(fmt, args...)	do {				\
+	cnt += scnprintf(buf + cnt, size - cnt, fmt, ##args);		\
+	cnt = min_t(int, cnt, size);					\
+} while (0)
+
+	if (entity_is_task(se)) {
+		struct task_struct *task = task_of(se);
+		odd_append(" %s(%d%s)", task->comm, task->pid, postfix);
+	} else {
+		char nbuf[64];
+		cgroup_name(se->my_q->tg->css.cgroup, nbuf, sizeof(nbuf));
+		odd_append(" %s(%s)", nbuf, postfix);
+	}
+	odd_append(":w=%d.%03d,l=%d.%03d,u=%d.%03d",
+		   lbw(se->load.weight),
+		   lba(se->avg.load_avg),
+		   lba(se->avg.util_avg));
+
+	return cnt;
+}
+
+static void dbg_odd_dump(const char *pref,
+			 struct sched_group *sg, struct sg_lb_stats *sgs)
+{
+	int cpu;
+
+	trace_printk("%sgrp=%*pbl w=%u avg=%d.%03d grp=%d.%03d sum=%d.%03d pertask=%d.%03d\n", pref,
+		     cpumask_pr_args(sched_group_cpus(sg)), sg->group_weight,
+		     lba(sgs->avg_load), lba(sgs->group_load),
+		     lba(sgs->sum_weighted_load), lba(sgs->load_per_task));
+	trace_printk("%sgcap=%d.%03d gutil=%d.%03d run=%u idle=%u gwt=%u type=%d nocap=%d\n",
+		     pref,
+		     lba(sgs->group_capacity), lba(sgs->group_util),
+		     sgs->sum_nr_running, sgs->idle_cpus, sgs->group_weight,
+		     sgs->group_type, sgs->group_no_capacity);
+
+	for_each_cpu(cpu, sched_group_cpus(sg)) {
+		struct task_group *tg;
+		unsigned long flags;
+
+		trace_printk("%sCPU%03d: run=%u schb=%d\n", pref, cpu,
+			     cpu_rq(cpu)->nr_running, count_schb(cpu_rq(cpu)));
+
+		raw_spin_lock_irqsave(&cpu_rq(cpu)->lock, flags);
+
+		list_for_each_entry_rcu(tg, &task_groups, list) {
+			struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
+			char qname[32] = "root";
+			int depth = 0;
+			long tg_weight = 0, tg_shares = 0;
+			struct sched_entity *se;
+			char *buf = per_cpu_ptr(odd_buf, cpu);
+			int cnt;
+
+			if (!cfs_rq->nr_running)
+				continue;
+
+			if (cfs_rq->tg) {
+				cgroup_name(cfs_rq->tg->css.cgroup, qname, sizeof(qname));
+				if (cfs_rq->tg->se[cpu])
+					depth = cfs_rq->tg->se[cpu]->depth;
+				tg_weight = atomic_long_read(&cfs_rq->tg->load_avg);
+				tg_shares = cfs_rq->tg->shares;
+			}
+
+			trace_printk("%sQ%03d-%s@%d: w=%d.%03d,l=%d.%03d,u=%d.%03d,r=%d.%03d run=%u hrun=%u tgs=%d.%03d tgw=%d.%03d\n",
+				     pref, cpu, qname, depth,
+				     lbw(cfs_rq->load.weight),
+				     lba(cfs_rq->avg.load_avg),
+				     lba(cfs_rq->avg.util_avg),
+				     lba(cfs_rq->runnable_load_avg),
+				     cfs_rq->nr_running, cfs_rq->h_nr_running,
+				     lbw(tg_shares),
+				     lba(tg_weight));
+
+			buf[0] = '\0';
+			cnt = 0;
+
+			if (cfs_rq->curr)
+				cnt = odd_append_se(cfs_rq->curr, "C", cnt, buf, PAGE_SIZE);
+
+			for (se = __pick_first_entity(cfs_rq); se;
+			     se = __pick_next_entity(se))
+				cnt = odd_append_se(se, "", cnt, buf, PAGE_SIZE);
+
+			trace_printk("%sQ%03d-%s@%d: %s\n",
+				     pref, cpu, qname, depth, buf);
+		}
+
+		raw_spin_unlock_irqrestore(&cpu_rq(cpu)->lock, flags);
+	}
+}
+
+/* a has a CPU with >= 2 schbench threads, b doesn't */
+static void dbg_odd(struct lb_env *env,
+		    struct sched_group *sga, struct sg_lb_stats *sgsa,
+		    struct sched_group *sgb, struct sg_lb_stats *sgsb)
+{
+	if (dbg_odd_nth && (dbg_odd_cnt++ % dbg_odd_nth))
+		return;
+
+	trace_printk("odd: dst=%d idle=%d brk=%u lbtgt=%*pbl type=%d\n",
+		     env->dst_cpu, env->idle, env->loop_break,
+		     cpumask_pr_args(env->cpus), env->fbq_type);
+	dbg_odd_dump("A: ", sga, sgsa);
+	dbg_odd_dump("B: ", sgb, sgsb);
+}
+
 /**
  * update_sd_pick_busiest - return 1 on busiest group
  * @env: The load balancing environment.
@@ -7432,6 +7582,8 @@ static bool update_sd_pick_busiest(struc
 				   struct sg_lb_stats *sgs)
 {
 	struct sg_lb_stats *busiest = &sds->busiest_stat;
+	bool busiest_has_two = sds->busiest && sg_has_two_schb(sds->busiest);
+	bool sg_has_two = sg_has_two_schb(sg);
 
 	if (sgs->group_type > busiest->group_type)
 		return true;
@@ -7439,8 +7591,14 @@ static bool update_sd_pick_busiest(struc
 	if (sgs->group_type < busiest->group_type)
 		return false;
 
-	if (sgs->avg_load <= busiest->avg_load)
+	if (sgs->avg_load <= busiest->avg_load) {
+		if (sg_has_two && !busiest_has_two)
+			dbg_odd(env, sg, sgs, sds->busiest, busiest);
 		return false;
+	}
+
+	if (!sg_has_two && busiest_has_two)
+		dbg_odd(env, sds->busiest, busiest, sg, sgs);
 
 	if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
 		goto asym_packing;
