From: Peter Zijlstra <peterz@infradead.org>
To: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Ingo Molnar <mingo@redhat.com>,
linux-kernel@vger.kernel.org,
Mel Gorman <mgorman@techsingularity.net>,
Matt Fleming <matt@codeblueprint.co.uk>,
Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Subject: Re: [PATCH 0/5] sched/debug: decouple sched_stat tracepoints from CONFIG_SCHEDSTATS
Date: Tue, 28 Jun 2016 14:43:36 +0200
Message-ID: <20160628124336.GG30909@twins.programming.kicks-ass.net>
In-Reply-To: <cover.1466184592.git.jpoimboe@redhat.com>
On Fri, Jun 17, 2016 at 12:43:22PM -0500, Josh Poimboeuf wrote:
> NOTE: I didn't include any performance numbers because I wasn't able to
> get consistent results. I tried the following on a Xeon E5-2420 v2 CPU:
>
> $ for i in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor; do echo -n performance > $i; done
> $ echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo
> $ echo 100 > /sys/devices/system/cpu/intel_pstate/min_perf_pct
> $ echo 0 > /proc/sys/kernel/nmi_watchdog
> $ taskset 0x10 perf stat -n -r10 perf bench sched pipe -l 1000000
>
> I was going to post the numbers from that, both with and without
> SCHEDSTATS, but then when I tried to repeat the test on a different day,
> the results were surprisingly different, with different conclusions.
>
> So any advice on measuring scheduler performance would be appreciated...
Yeah, it's a bit of a pain in general...
A) perf stat --null --repeat 50 -- perf bench sched messaging -g 50 -l 5000 | grep "seconds time elapsed"
B) perf stat --null --repeat 50 -- taskset 1 perf bench sched pipe | grep "seconds time elapsed"
1) tip/master + 1-4
2) tip/master + 1-5
3) tip/master + 1-5 + below
          1              2              3
A)  4.627767855    4.650429917    4.646208062
    4.633921933    4.641424424    4.612021058
    4.649536375    4.663144144    4.636815948
    4.630165619    4.649053552    4.613022902

B)  1.770732957    1.789534273    1.773334291
    1.761740716    1.795618428    1.773338681
    1.763761666    1.822316496    1.774385589
From this it looks like patch 5 does hurt a wee bit, but we can get most
of that back by reordering the structure a bit. The results seem
'stable' across rebuilds and reboots (I've popped all patches, rebuilt,
rebooted and re-benchmarked configuration 1 at the end and obtained
similar results).
Although it's possible that if we reorder first and then do 5, we'll
just see a bigger regression. I've not bothered.
---
include/linux/sched.h | 33 +++++++++++++++------------------
kernel/sched/core.c | 4 ++--
kernel/sched/debug.c | 6 +++---
3 files changed, 20 insertions(+), 23 deletions(-)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1220,7 +1220,7 @@ struct uts_namespace;
struct load_weight {
unsigned long weight;
u32 inv_weight;
-};
+} __packed;
/*
* The load_avg/util_avg accumulates an infinite geometric series
@@ -1315,44 +1315,40 @@ struct sched_statistics {
struct sched_entity {
struct load_weight load; /* for load-balancing */
+ unsigned int on_rq;
struct rb_node run_node;
struct list_head group_node;
- unsigned int on_rq;
- u64 exec_start;
+ u64 exec_start ____cacheline_aligned_in_smp;
u64 sum_exec_runtime;
u64 vruntime;
u64 prev_sum_exec_runtime;
-
- u64 nr_migrations;
-
u64 wait_start;
u64 sleep_start;
u64 block_start;
+#ifdef CONFIG_SMP
+ /*
+ * Per entity load average tracking.
+ */
+ struct sched_avg avg ____cacheline_aligned_in_smp;
+#endif
#ifdef CONFIG_SCHEDSTATS
struct sched_statistics statistics;
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
- int depth;
+ /*
+ * mostly constant values, separate from modifications above
+ */
+ int depth ____cacheline_aligned_in_smp;
struct sched_entity *parent;
/* rq on which this entity is (to be) queued: */
struct cfs_rq *cfs_rq;
/* rq "owned" by this entity/group: */
struct cfs_rq *my_q;
#endif
-
-#ifdef CONFIG_SMP
- /*
- * Per entity load average tracking.
- *
- * Put into separate cache line so it does not
- * collide with read-mostly values above.
- */
- struct sched_avg avg ____cacheline_aligned_in_smp;
-#endif
-};
+} ____cacheline_aligned_in_smp;
struct sched_rt_entity {
struct list_head run_list;
@@ -1475,6 +1471,7 @@ struct task_struct {
int prio, static_prio, normal_prio;
unsigned int rt_priority;
const struct sched_class *sched_class;
+ u64 nr_migrations;
struct sched_entity se;
struct sched_rt_entity rt;
#ifdef CONFIG_CGROUP_SCHED
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1239,7 +1239,7 @@ void set_task_cpu(struct task_struct *p,
if (task_cpu(p) != new_cpu) {
if (p->sched_class->migrate_task_rq)
p->sched_class->migrate_task_rq(p);
- p->se.nr_migrations++;
+ p->nr_migrations++;
perf_event_task_migrate(p);
}
@@ -2167,7 +2167,7 @@ static void __sched_fork(unsigned long c
p->se.exec_start = 0;
p->se.sum_exec_runtime = 0;
p->se.prev_sum_exec_runtime = 0;
- p->se.nr_migrations = 0;
+ p->nr_migrations = 0;
p->se.vruntime = 0;
INIT_LIST_HEAD(&p->se.group_node);
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -885,7 +885,7 @@ void proc_sched_show_task(struct task_st
nr_switches = p->nvcsw + p->nivcsw;
- P(se.nr_migrations);
+ P(nr_migrations);
PN(se.wait_start);
PN(se.sleep_start);
@@ -926,9 +926,9 @@ void proc_sched_show_task(struct task_st
avg_atom = -1LL;
avg_per_cpu = p->se.sum_exec_runtime;
- if (p->se.nr_migrations) {
+ if (p->nr_migrations) {
avg_per_cpu = div64_u64(avg_per_cpu,
- p->se.nr_migrations);
+ p->nr_migrations);
} else {
avg_per_cpu = -1LL;
}
Thread overview: 17+ messages
2016-06-17 17:43 [PATCH 0/5] sched/debug: decouple sched_stat tracepoints from CONFIG_SCHEDSTATS Josh Poimboeuf
2016-06-17 17:43 ` [PATCH 1/5] sched/debug: rename and move enqueue_sleeper() Josh Poimboeuf
2016-09-05 11:56 ` [tip:sched/core] sched/debug: Rename " tip-bot for Josh Poimboeuf
2016-06-17 17:43 ` [PATCH 2/5] sched/debug: schedstat macro cleanup Josh Poimboeuf
2016-09-05 11:57 ` [tip:sched/core] sched/debug: Clean up schedstat macros tip-bot for Josh Poimboeuf
2016-06-17 17:43 ` [PATCH 3/5] sched/debug: 'schedstat_val()' -> 'schedstat_val_or_zero()' Josh Poimboeuf
2016-09-05 11:57 ` [tip:sched/core] sched/debug: Rename " tip-bot for Josh Poimboeuf
2016-06-17 17:43 ` [PATCH 4/5] sched/debug: remove several CONFIG_SCHEDSTATS guards Josh Poimboeuf
2016-06-27 16:21 ` Peter Zijlstra
2016-06-27 16:32 ` Josh Poimboeuf
2016-09-05 11:57 ` [tip:sched/core] sched/debug: Remove " tip-bot for Josh Poimboeuf
2016-06-17 17:43 ` [PATCH 5/5] sched/debug: decouple 'sched_stat_*' tracepoints' from CONFIG_SCHEDSTATS Josh Poimboeuf
2016-06-21 8:27 ` [PATCH 0/5] sched/debug: decouple sched_stat tracepoints " Srikar Dronamraju
2016-06-28 12:43 ` Peter Zijlstra [this message]
2016-06-29 2:32 ` Josh Poimboeuf
2016-06-29 10:29 ` Peter Zijlstra
2016-07-08 14:57 ` Josh Poimboeuf