All of lore.kernel.org
 help / color / mirror / Atom feed
From: Steve Sistare <steven.sistare@oracle.com>
To: mingo@redhat.com, peterz@infradead.org
Cc: subhra.mazumdar@oracle.com, dhaval.giani@oracle.com,
	daniel.m.jordan@oracle.com, pavel.tatashin@microsoft.com,
	matt@codeblueprint.co.uk, umgwanakikbuti@gmail.com,
	riel@redhat.com, jbacik@fb.com, juri.lelli@redhat.com,
	valentin.schneider@arm.com, vincent.guittot@linaro.org,
	quentin.perret@arm.com, steven.sistare@oracle.com,
	linux-kernel@vger.kernel.org
Subject: [PATCH v2 10/10] sched/fair: Provide idle search schedstats
Date: Mon,  5 Nov 2018 12:08:09 -0800	[thread overview]
Message-ID: <1541448489-19692-11-git-send-email-steven.sistare@oracle.com> (raw)
In-Reply-To: <1541448489-19692-1-git-send-email-steven.sistare@oracle.com>

Add schedstats to measure the effectiveness of searching for idle CPUs
and stealing tasks.  This is a temporary patch intended for use during
development only.  SCHEDSTAT_VERSION is bumped to 16, and the following
fields are added to the per-CPU statistics of /proc/schedstat:

field 10: # of times select_idle_sibling "easily" found an idle CPU --
          target, prev, or the recently used CPU is idle.
field 11: # of times select_idle_sibling searched and found an idle CPU.
field 12: # of times select_idle_sibling searched and found an idle core.
field 13: # of times select_idle_sibling failed to find anything idle.
field 14: time in nanoseconds spent in functions that search for idle
          CPUs, search for tasks to steal, and update the overloaded
          CPUs bitmap.
field 15: # of times an idle CPU steals a task from another CPU.
field 16: # of times try_steal finds overloaded CPUs but no task is
          migratable.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
---
 kernel/sched/core.c  | 30 +++++++++++++++++++++++++++--
 kernel/sched/fair.c  | 54 ++++++++++++++++++++++++++++++++++++++++++++++------
 kernel/sched/sched.h |  9 +++++++++
 kernel/sched/stats.c | 11 ++++++++++-
 kernel/sched/stats.h | 13 +++++++++++++
 5 files changed, 108 insertions(+), 9 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ad97f3b..b61d15d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2214,17 +2214,43 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
 DEFINE_STATIC_KEY_FALSE(sched_schedstats);
 static bool __initdata __sched_schedstats = false;
 
+unsigned long schedstat_skid;
+
+static void compute_skid(void)
+{
+	int i, n = 0;
+	s64 t, skid = 0;
+
+	for (i = 0; i < 100; i++) {
+		t = local_clock();
+		t = local_clock() - t;
+		if (t > 0 && t < 1000) {	/* only use sane samples */
+			skid += t;
+			n++;
+		}
+	}
+
+	if (n > 0)
+		schedstat_skid = skid / n;
+	else
+		schedstat_skid = 0;
+	pr_info("schedstat_skid = %lu\n", schedstat_skid);
+}
+
 static void set_schedstats(bool enabled)
 {
-	if (enabled)
+	if (enabled) {
+		compute_skid();
 		static_branch_enable(&sched_schedstats);
-	else
+	} else {
 		static_branch_disable(&sched_schedstats);
+	}
 }
 
 void force_schedstat_enabled(void)
 {
 	if (!schedstat_enabled()) {
+		compute_skid();
 		pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
 		static_branch_enable(&sched_schedstats);
 	}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 56dce30..21ffe34 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3739,29 +3739,35 @@ static inline bool steal_enabled(void)
 static void overload_clear(struct rq *rq)
 {
 	struct sparsemask *overload_cpus;
+	unsigned long time;
 
 	if (!steal_enabled())
 		return;
 
+	time = schedstat_start_time();
 	rcu_read_lock();
 	overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
 	if (overload_cpus)
 		sparsemask_clear_elem(rq->cpu, overload_cpus);
 	rcu_read_unlock();
+	schedstat_end_time(rq->find_time, time);
 }
 
 static void overload_set(struct rq *rq)
 {
 	struct sparsemask *overload_cpus;
+	unsigned long time;
 
 	if (!steal_enabled())
 		return;
 
+	time = schedstat_start_time();
 	rcu_read_lock();
 	overload_cpus = rcu_dereference(rq->cfs_overload_cpus);
 	if (overload_cpus)
 		sparsemask_set_elem(rq->cpu, overload_cpus);
 	rcu_read_unlock();
+	schedstat_end_time(rq->find_time, time);
 }
 
 static int try_steal(struct rq *this_rq, struct rq_flags *rf);
@@ -6165,6 +6171,16 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
 	return cpu;
 }
 
+#define SET_STAT(STAT)							\
+	do {								\
+		if (schedstat_enabled()) {				\
+			struct rq *rq = this_rq();			\
+									\
+			if (rq)						\
+				__schedstat_inc(rq->STAT);		\
+		}							\
+	} while (0)
+
 /*
  * Try and locate an idle core/thread in the LLC cache domain.
  */
@@ -6173,14 +6189,18 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	struct sched_domain *sd;
 	int i, recent_used_cpu;
 
-	if (available_idle_cpu(target))
+	if (available_idle_cpu(target)) {
+		SET_STAT(found_idle_cpu_easy);
 		return target;
+	}
 
 	/*
 	 * If the previous CPU is cache affine and idle, don't be stupid:
 	 */
-	if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev))
+	if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev)) {
+		SET_STAT(found_idle_cpu_easy);
 		return prev;
+	}
 
 	/* Check a recently used CPU as a potential idle candidate: */
 	recent_used_cpu = p->recent_used_cpu;
@@ -6193,26 +6213,36 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 		 * Replace recent_used_cpu with prev as it is a potential
 		 * candidate for the next wake:
 		 */
+		SET_STAT(found_idle_cpu_easy);
 		p->recent_used_cpu = prev;
 		return recent_used_cpu;
 	}
 
 	sd = rcu_dereference(per_cpu(sd_llc, target));
-	if (!sd)
+	if (!sd) {
+		SET_STAT(nofound_idle_cpu);
 		return target;
+	}
 
 	i = select_idle_core(p, sd, target);
-	if ((unsigned)i < nr_cpumask_bits)
+	if ((unsigned)i < nr_cpumask_bits) {
+		SET_STAT(found_idle_core);
 		return i;
+	}
 
 	i = select_idle_cpu(p, sd, target);
-	if ((unsigned)i < nr_cpumask_bits)
+	if ((unsigned)i < nr_cpumask_bits) {
+		SET_STAT(found_idle_cpu);
 		return i;
+	}
 
 	i = select_idle_smt(p, sd, target);
-	if ((unsigned)i < nr_cpumask_bits)
+	if ((unsigned)i < nr_cpumask_bits) {
+		SET_STAT(found_idle_cpu);
 		return i;
+	}
 
+	SET_STAT(nofound_idle_cpu);
 	return target;
 }
 
@@ -6363,6 +6393,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
 static int
 select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
 {
+	unsigned long time = schedstat_start_time();
 	struct sched_domain *tmp, *sd = NULL;
 	int cpu = smp_processor_id();
 	int new_cpu = prev_cpu;
@@ -6411,6 +6442,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
 			current->recent_used_cpu = cpu;
 	}
 	rcu_read_unlock();
+	schedstat_end_time(cpu_rq(cpu)->find_time, time);
 
 	return new_cpu;
 }
@@ -6657,6 +6689,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	struct sched_entity *se;
 	struct task_struct *p;
 	int new_tasks;
+	unsigned long time;
 
 again:
 	if (!cfs_rq->nr_running)
@@ -6767,6 +6800,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	return p;
 
 idle:
+	time = schedstat_start_time();
+
 	/*
 	 * We must set idle_stamp _before_ calling try_steal() or
 	 * idle_balance(), such that we measure the duration as idle time.
@@ -6780,6 +6815,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	if (new_tasks)
 		IF_SMP(rq->idle_stamp = 0;)
 
+	schedstat_end_time(rq->find_time, time);
+
 	/*
 	 * Because try_steal() and idle_balance() release (and re-acquire)
 	 * rq->lock, it is possible for any higher priority task to appear.
@@ -9770,6 +9807,7 @@ static int steal_from(struct rq *dst_rq, struct rq_flags *dst_rf, bool *locked,
 		update_rq_clock(dst_rq);
 		attach_task(dst_rq, p);
 		stolen = 1;
+		schedstat_inc(dst_rq->steal);
 	}
 	local_irq_restore(rf.flags);
 
@@ -9794,6 +9832,7 @@ static int try_steal(struct rq *dst_rq, struct rq_flags *dst_rf)
 	int dst_cpu = dst_rq->cpu;
 	bool locked = true;
 	int stolen = 0;
+	bool any_overload = false;
 	struct sparsemask *overload_cpus;
 
 	if (!steal_enabled())
@@ -9836,6 +9875,7 @@ static int try_steal(struct rq *dst_rq, struct rq_flags *dst_rf)
 			stolen = 1;
 			goto out;
 		}
+		any_overload = true;
 	}
 
 out:
@@ -9847,6 +9887,8 @@ static int try_steal(struct rq *dst_rq, struct rq_flags *dst_rf)
 	stolen |= (dst_rq->cfs.h_nr_running > 0);
 	if (dst_rq->nr_running != dst_rq->cfs.h_nr_running)
 		stolen = -1;
+	if (!stolen && any_overload)
+		schedstat_inc(dst_rq->steal_fail);
 	return stolen;
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 5f181e9..9f58e17 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -907,6 +907,15 @@ struct rq {
 	/* try_to_wake_up() stats */
 	unsigned int		ttwu_count;
 	unsigned int		ttwu_local;
+
+	/* Idle search stats */
+	unsigned int		found_idle_core;
+	unsigned int		found_idle_cpu;
+	unsigned int		found_idle_cpu_easy;
+	unsigned int		nofound_idle_cpu;
+	unsigned long		find_time;
+	unsigned int		steal;
+	unsigned int		steal_fail;
 #endif
 
 #ifdef CONFIG_SMP
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 750fb3c..00b3de5 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -10,7 +10,7 @@
  * Bump this up when changing the output format or the meaning of an existing
  * format, so that tools can adapt (or abort)
  */
-#define SCHEDSTAT_VERSION 15
+#define SCHEDSTAT_VERSION 16
 
 static int show_schedstat(struct seq_file *seq, void *v)
 {
@@ -37,6 +37,15 @@ static int show_schedstat(struct seq_file *seq, void *v)
 		    rq->rq_cpu_time,
 		    rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
 
+		seq_printf(seq, " %u %u %u %u %lu %u %u",
+			   rq->found_idle_cpu_easy,
+			   rq->found_idle_cpu,
+			   rq->found_idle_core,
+			   rq->nofound_idle_cpu,
+			   rq->find_time,
+			   rq->steal,
+			   rq->steal_fail);
+
 		seq_printf(seq, "\n");
 
 #ifdef CONFIG_SMP
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 8aea199..50c3cf8 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -39,6 +39,17 @@
 #define   schedstat_set(var, val)	do { if (schedstat_enabled()) { var = (val); } } while (0)
 #define   schedstat_val(var)		(var)
 #define   schedstat_val_or_zero(var)	((schedstat_enabled()) ? (var) : 0)
+#define   schedstat_start_time()	schedstat_val_or_zero(local_clock())
+#define   schedstat_end_time(stat, time)			\
+	do {							\
+		unsigned long endtime;				\
+								\
+		if (schedstat_enabled() && (time)) {		\
+			endtime = local_clock() - (time) - schedstat_skid; \
+			schedstat_add((stat), endtime);		\
+		}						\
+	} while (0)
+extern unsigned long schedstat_skid;
 
 #else /* !CONFIG_SCHEDSTATS: */
 static inline void rq_sched_info_arrive  (struct rq *rq, unsigned long long delta) { }
@@ -53,6 +64,8 @@ static inline void rq_sched_info_depart  (struct rq *rq, unsigned long long delt
 # define   schedstat_set(var, val)	do { } while (0)
 # define   schedstat_val(var)		0
 # define   schedstat_val_or_zero(var)	0
+# define   schedstat_start_time()	0
+# define   schedstat_end_time(stat, t)	do { } while (0)
 #endif /* CONFIG_SCHEDSTATS */
 
 #ifdef CONFIG_SCHED_INFO
-- 
1.8.3.1


  parent reply	other threads:[~2018-11-05 20:18 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-11-05 20:07 [PATCH v2 00/10] steal tasks to improve CPU utilization Steve Sistare
2018-11-05 20:08 ` [PATCH v2 01/10] sched: Provide sparsemask, a reduced contention bitmap Steve Sistare
2018-11-05 20:08 ` [PATCH v2 02/10] sched/topology: Provide hooks to allocate data shared per LLC Steve Sistare
2018-11-05 20:08 ` [PATCH v2 03/10] sched/topology: Provide cfs_overload_cpus bitmap Steve Sistare
2018-11-05 20:08 ` [PATCH v2 04/10] sched/fair: Dynamically update cfs_overload_cpus Steve Sistare
2018-11-05 20:08 ` [PATCH v2 05/10] sched/fair: Hoist idle_stamp up from idle_balance Steve Sistare
2018-11-05 20:08 ` [PATCH v2 06/10] sched/fair: Generalize the detach_task interface Steve Sistare
2018-11-05 20:08 ` [PATCH v2 07/10] sched/fair: Provide can_migrate_task_llc Steve Sistare
2018-11-05 20:08 ` [PATCH v2 08/10] sched/fair: Steal work from an overloaded CPU when CPU goes idle Steve Sistare
2018-11-05 20:08 ` [PATCH v2 09/10] sched/fair: disable stealing if too many NUMA nodes Steve Sistare
2018-11-05 20:08 ` Steve Sistare [this message]
2018-11-06 17:52 ` [PATCH v2 00/10] steal tasks to improve CPU utilization Valentin Schneider
2018-11-06 18:14   ` Steven Sistare
2018-11-09 12:58     ` Steven Sistare

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1541448489-19692-11-git-send-email-steven.sistare@oracle.com \
    --to=steven.sistare@oracle.com \
    --cc=daniel.m.jordan@oracle.com \
    --cc=dhaval.giani@oracle.com \
    --cc=jbacik@fb.com \
    --cc=juri.lelli@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=matt@codeblueprint.co.uk \
    --cc=mingo@redhat.com \
    --cc=pavel.tatashin@microsoft.com \
    --cc=peterz@infradead.org \
    --cc=quentin.perret@arm.com \
    --cc=riel@redhat.com \
    --cc=subhra.mazumdar@oracle.com \
    --cc=umgwanakikbuti@gmail.com \
    --cc=valentin.schneider@arm.com \
    --cc=vincent.guittot@linaro.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.