* [patch v3 0/8] sched: use runnable avg in load balance
@ 2013-04-02  3:23 Alex Shi
  2013-04-02  3:23 ` [patch v3 1/8] Revert "sched: Introduce temporary FAIR_GROUP_SCHED dependency for load-tracking" Alex Shi
                   ` (9 more replies)
  0 siblings, 10 replies; 44+ messages in thread
From: Alex Shi @ 2013-04-02  3:23 UTC (permalink / raw)
  To: mingo, peterz, tglx, akpm, arjan, bp, pjt, namhyung, efault,
	morten.rasmussen
  Cc: vincent.guittot, gregkh, preeti, viresh.kumar, linux-kernel,
	alex.shi, len.brown, rafael.j.wysocki, jkosina, clark.williams,
	tony.luck, keescook, mgorman, riel

This version resolves the aim7-like benchmark issue via the 8th patch.
Thanks to MikeG's avg_idle, which is a good bursty-wakeup indicator.

The first 3 patches were also included in my power-aware scheduling patchset.

Morten, you can rebase your patch on this new version, which is based on the
latest Linus tree. :)

a git tree for this patchset:
	https://github.com/alexshi/power-scheduling.git runnablelb

Thanks
Alex

[patch v3 1/8] Revert "sched: Introduce temporary FAIR_GROUP_SCHED
[patch v3 2/8] sched: set initial value of runnable avg for new
[patch v3 3/8] sched: only count runnable avg on cfs_rq's nr_running
[patch v3 4/8] sched: update cpu load after task_tick.
[patch v3 5/8] sched: compute runnable load avg in cpu_load and
[patch v3 6/8] sched: consider runnable load average in move_tasks
[patch v3 7/8] sched: consider runnable load average in
[patch v3 8/8] sched: use instant load for burst wake up

^ permalink raw reply	[flat|nested] 44+ messages in thread

* [patch v3 1/8] Revert "sched: Introduce temporary FAIR_GROUP_SCHED dependency for load-tracking"
  2013-04-02  3:23 [patch v3 0/8] sched: use runnable avg in load balance Alex Shi
@ 2013-04-02  3:23 ` Alex Shi
  2013-04-02  3:23 ` [patch v3 2/8] sched: set initial value of runnable avg for new forked task Alex Shi
                   ` (8 subsequent siblings)
  9 siblings, 0 replies; 44+ messages in thread
From: Alex Shi @ 2013-04-02  3:23 UTC (permalink / raw)
  To: mingo, peterz, tglx, akpm, arjan, bp, pjt, namhyung, efault,
	morten.rasmussen
  Cc: vincent.guittot, gregkh, preeti, viresh.kumar, linux-kernel,
	alex.shi, len.brown, rafael.j.wysocki, jkosina, clark.williams,
	tony.luck, keescook, mgorman, riel

Remove the CONFIG_FAIR_GROUP_SCHED guard that covers the runnable load
tracking info, so that we can use the runnable load variables.

Signed-off-by: Alex Shi <alex.shi@intel.com>
---
 include/linux/sched.h |  7 +------
 kernel/sched/core.c   |  7 +------
 kernel/sched/fair.c   | 13 ++-----------
 kernel/sched/sched.h  |  9 +--------
 4 files changed, 5 insertions(+), 31 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d35d2b6..5a4cf37 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1160,12 +1160,7 @@ struct sched_entity {
 	struct cfs_rq		*my_q;
 #endif
 
-/*
- * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
- * removed when useful for applications beyond shares distribution (e.g.
- * load-balance).
- */
-#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
+#ifdef CONFIG_SMP
 	/* Per-entity load-tracking */
 	struct sched_avg	avg;
 #endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7f12624..54eaaa2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1561,12 +1561,7 @@ static void __sched_fork(struct task_struct *p)
 	p->se.vruntime			= 0;
 	INIT_LIST_HEAD(&p->se.group_node);
 
-/*
- * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
- * removed when useful for applications beyond shares distribution (e.g.
- * load-balance).
- */
-#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
+#ifdef CONFIG_SMP
 	p->se.avg.runnable_avg_period = 0;
 	p->se.avg.runnable_avg_sum = 0;
 #endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7a33e59..9c2f726 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1109,8 +1109,7 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
 }
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
-/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
-#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
+#ifdef CONFIG_SMP
 /*
  * We choose a half-life close to 1 scheduling period.
  * Note: The tables below are dependent on this value.
@@ -3394,12 +3393,6 @@ unlock:
 }
 
 /*
- * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
- * removed when useful for applications beyond shares distribution (e.g.
- * load-balance).
- */
-#ifdef CONFIG_FAIR_GROUP_SCHED
-/*
  * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
  * cfs_rq_of(p) references at time of call are still valid and identify the
  * previous cpu.  However, the caller only guarantees p->pi_lock is held; no
@@ -3422,7 +3415,6 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu)
 		atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
 	}
 }
-#endif
 #endif /* CONFIG_SMP */
 
 static unsigned long
@@ -6114,9 +6106,8 @@ const struct sched_class fair_sched_class = {
 
 #ifdef CONFIG_SMP
 	.select_task_rq		= select_task_rq_fair,
-#ifdef CONFIG_FAIR_GROUP_SCHED
 	.migrate_task_rq	= migrate_task_rq_fair,
-#endif
+
 	.rq_online		= rq_online_fair,
 	.rq_offline		= rq_offline_fair,
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index cc03cfd..7f36024f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -227,12 +227,6 @@ struct cfs_rq {
 #endif
 
 #ifdef CONFIG_SMP
-/*
- * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
- * removed when useful for applications beyond shares distribution (e.g.
- * load-balance).
- */
-#ifdef CONFIG_FAIR_GROUP_SCHED
 	/*
 	 * CFS Load tracking
 	 * Under CFS, load is tracked on a per-entity basis and aggregated up.
@@ -242,8 +236,7 @@ struct cfs_rq {
 	u64 runnable_load_avg, blocked_load_avg;
 	atomic64_t decay_counter, removed_load;
 	u64 last_decay;
-#endif /* CONFIG_FAIR_GROUP_SCHED */
-/* These always depend on CONFIG_FAIR_GROUP_SCHED */
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	u32 tg_runnable_contrib;
 	u64 tg_load_contrib;
-- 
1.7.12


^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [patch v3 2/8] sched: set initial value of runnable avg for new forked task
  2013-04-02  3:23 [patch v3 0/8] sched: use runnable avg in load balance Alex Shi
  2013-04-02  3:23 ` [patch v3 1/8] Revert "sched: Introduce temporary FAIR_GROUP_SCHED dependency for load-tracking" Alex Shi
@ 2013-04-02  3:23 ` Alex Shi
  2013-04-02  3:23 ` [patch v3 3/8] sched: only count runnable avg on cfs_rq's nr_running Alex Shi
                   ` (7 subsequent siblings)
  9 siblings, 0 replies; 44+ messages in thread
From: Alex Shi @ 2013-04-02  3:23 UTC (permalink / raw)
  To: mingo, peterz, tglx, akpm, arjan, bp, pjt, namhyung, efault,
	morten.rasmussen
  Cc: vincent.guittot, gregkh, preeti, viresh.kumar, linux-kernel,
	alex.shi, len.brown, rafael.j.wysocki, jkosina, clark.williams,
	tony.luck, keescook, mgorman, riel

We need to initialize se.avg.{decay_count, load_avg_contrib} for a
newly forked task.
Otherwise the random values of these variables cause a mess when the new
task is enqueued:
    enqueue_task_fair
        enqueue_entity
            enqueue_entity_load_avg

and make fork balancing imbalanced because of the incorrect
load_avg_contrib.

Set avg.decay_count = 0 and avg.load_avg_contrib = se->load.weight to
resolve these issues.
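
For illustration, a minimal stand-alone sketch (not kernel code; it only
assumes the nice-0 weight of 1024 used by this scheduler) of what these
fork-time values amount to:

	/* fork_init_sketch.c - what the initialization above means for a
	 * newly forked nice-0 task; pure illustration, builds with any
	 * C compiler. */
	#include <stdio.h>

	struct sched_avg_sketch {
		long decay_count;
		unsigned long load_avg_contrib;
	};

	int main(void)
	{
		unsigned long nice0_weight = 1024;	/* se->load.weight, nice 0 */
		struct sched_avg_sketch avg;

		avg.decay_count = 0;			/* no pending decay at fork */
		avg.load_avg_contrib = nice0_weight;	/* assume fully runnable */

		printf("contribution added at first enqueue: %lu\n",
		       avg.load_avg_contrib);
		return 0;
	}

With decay_count == 0, enqueue_entity_load_avg() takes its
decay_count <= 0 path and simply adds this contribution to the cfs_rq,
so the new task is visible to load balancing immediately instead of
contributing a random value.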

Signed-off-by: Alex Shi <alex.shi@intel.com>
---
 kernel/sched/core.c | 6 ++++++
 kernel/sched/fair.c | 4 ++++
 2 files changed, 10 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 54eaaa2..8843cd3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1564,6 +1564,7 @@ static void __sched_fork(struct task_struct *p)
 #ifdef CONFIG_SMP
 	p->se.avg.runnable_avg_period = 0;
 	p->se.avg.runnable_avg_sum = 0;
+	p->se.avg.decay_count = 0;
 #endif
 #ifdef CONFIG_SCHEDSTATS
 	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
@@ -1651,6 +1652,11 @@ void sched_fork(struct task_struct *p)
 		p->sched_reset_on_fork = 0;
 	}
 
+	/* A newly forked task is assumed to run at full utilization */
+#if defined(CONFIG_SMP)
+	p->se.avg.load_avg_contrib = p->se.load.weight;
+#endif
+
 	if (!rt_prio(p->prio))
 		p->sched_class = &fair_sched_class;
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9c2f726..2881d42 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1508,6 +1508,10 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
 	 * We track migrations using entity decay_count <= 0, on a wake-up
 	 * migration we use a negative decay count to track the remote decays
 	 * accumulated while sleeping.
+	 *
+	 * When enqueueing a newly forked task, se->avg.decay_count == 0, so
+	 * we bypass update_entity_load_avg() and use the initial value of
+	 * avg.load_avg_contrib: se->load.weight.
 	 */
 	if (unlikely(se->avg.decay_count <= 0)) {
 		se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
-- 
1.7.12


^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [patch v3 3/8] sched: only count runnable avg on cfs_rq's nr_running
  2013-04-02  3:23 [patch v3 0/8] sched: use runnable avg in load balance Alex Shi
  2013-04-02  3:23 ` [patch v3 1/8] Revert "sched: Introduce temporary FAIR_GROUP_SCHED dependency for load-tracking" Alex Shi
  2013-04-02  3:23 ` [patch v3 2/8] sched: set initial value of runnable avg for new forked task Alex Shi
@ 2013-04-02  3:23 ` Alex Shi
  2013-04-03  3:19   ` Alex Shi
  2013-04-02  3:23 ` [patch v3 4/8] sched: update cpu load after task_tick Alex Shi
                   ` (6 subsequent siblings)
  9 siblings, 1 reply; 44+ messages in thread
From: Alex Shi @ 2013-04-02  3:23 UTC (permalink / raw)
  To: mingo, peterz, tglx, akpm, arjan, bp, pjt, namhyung, efault,
	morten.rasmussen
  Cc: vincent.guittot, gregkh, preeti, viresh.kumar, linux-kernel,
	alex.shi, len.brown, rafael.j.wysocki, jkosina, clark.williams,
	tony.luck, keescook, mgorman, riel

The old code counts the runnable avg against the rq's nr_running even
when there are only RT tasks on the rq. That is incorrect, so switch it
to the cfs_rq's nr_running.

Signed-off-by: Alex Shi <alex.shi@intel.com>
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2881d42..026e959 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2829,7 +2829,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	}
 
 	if (!se) {
-		update_rq_runnable_avg(rq, rq->nr_running);
+		update_rq_runnable_avg(rq, rq->cfs.nr_running);
 		inc_nr_running(rq);
 	}
 	hrtick_update(rq);
-- 
1.7.12


^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [patch v3 4/8] sched: update cpu load after task_tick.
  2013-04-02  3:23 [patch v3 0/8] sched: use runnable avg in load balance Alex Shi
                   ` (2 preceding siblings ...)
  2013-04-02  3:23 ` [patch v3 3/8] sched: only count runnable avg on cfs_rq's nr_running Alex Shi
@ 2013-04-02  3:23 ` Alex Shi
  2013-04-02  3:23 ` [patch v3 5/8] sched: compute runnable load avg in cpu_load and cpu_avg_load_per_task Alex Shi
                   ` (5 subsequent siblings)
  9 siblings, 0 replies; 44+ messages in thread
From: Alex Shi @ 2013-04-02  3:23 UTC (permalink / raw)
  To: mingo, peterz, tglx, akpm, arjan, bp, pjt, namhyung, efault,
	morten.rasmussen
  Cc: vincent.guittot, gregkh, preeti, viresh.kumar, linux-kernel,
	alex.shi, len.brown, rafael.j.wysocki, jkosina, clark.williams,
	tony.luck, keescook, mgorman, riel

To get the latest runnable info, we need to do this cpu load update
after task_tick.

Signed-off-by: Alex Shi <alex.shi@intel.com>
---
 kernel/sched/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8843cd3..e3233d3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2690,8 +2690,8 @@ void scheduler_tick(void)
 
 	raw_spin_lock(&rq->lock);
 	update_rq_clock(rq);
-	update_cpu_load_active(rq);
 	curr->sched_class->task_tick(rq, curr, 0);
+	update_cpu_load_active(rq);
 	raw_spin_unlock(&rq->lock);
 
 	perf_event_task_tick();
-- 
1.7.12


^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [patch v3 5/8] sched: compute runnable load avg in cpu_load and cpu_avg_load_per_task
  2013-04-02  3:23 [patch v3 0/8] sched: use runnable avg in load balance Alex Shi
                   ` (3 preceding siblings ...)
  2013-04-02  3:23 ` [patch v3 4/8] sched: update cpu load after task_tick Alex Shi
@ 2013-04-02  3:23 ` Alex Shi
  2013-04-02  3:23 ` [patch v3 6/8] sched: consider runnable load average in move_tasks Alex Shi
                   ` (4 subsequent siblings)
  9 siblings, 0 replies; 44+ messages in thread
From: Alex Shi @ 2013-04-02  3:23 UTC (permalink / raw)
  To: mingo, peterz, tglx, akpm, arjan, bp, pjt, namhyung, efault,
	morten.rasmussen
  Cc: vincent.guittot, gregkh, preeti, viresh.kumar, linux-kernel,
	alex.shi, len.brown, rafael.j.wysocki, jkosina, clark.williams,
	tony.luck, keescook, mgorman, riel

These are the base values used in load balancing; update them with the
rq's runnable load average, and load balancing will then consider the
runnable load avg naturally.
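
As a rough stand-alone illustration (the nice-0 weight of 1024 is assumed,
and the percentages stand in for each entity's runnable_avg_sum/period
ratio) of how the two quantities can differ on one cpu:

	/* cpu_load_sketch.c - instantaneous rq->load.weight vs. the decayed
	 * cfs.runnable_load_avg that cpu_load is switched to here. */
	#include <stdio.h>

	int main(void)
	{
		unsigned long nice0 = 1024;

		/* two nice-0 tasks queued: one ~100% runnable, one ~25% */
		unsigned long load_weight = nice0 + nice0;
		unsigned long runnable_load_avg = nice0 + nice0 * 25 / 100;

		printf("rq->load.weight       = %lu\n", load_weight);
		printf("cfs.runnable_load_avg ~ %lu\n", runnable_load_avg);
		return 0;
	}

After this change, weighted_cpuload() and the cpu_load values report the
~1280 figure instead of 2048, so a cpu whose tasks sleep a lot looks
correspondingly lighter to the balancer.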

Signed-off-by: Alex Shi <alex.shi@intel.com>
---
 kernel/sched/core.c | 4 ++--
 kernel/sched/fair.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e3233d3..a772de0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2534,7 +2534,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
 void update_idle_cpu_load(struct rq *this_rq)
 {
 	unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
-	unsigned long load = this_rq->load.weight;
+	unsigned long load = (unsigned long)this_rq->cfs.runnable_load_avg;
 	unsigned long pending_updates;
 
 	/*
@@ -2584,7 +2584,7 @@ static void update_cpu_load_active(struct rq *this_rq)
 	 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
 	 */
 	this_rq->last_load_update_tick = jiffies;
-	__update_cpu_load(this_rq, this_rq->load.weight, 1);
+	__update_cpu_load(this_rq, this_rq->cfs.runnable_load_avg, 1);
 
 	calc_load_account_active(this_rq);
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 026e959..1f9026e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2900,7 +2900,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 /* Used instead of source_load when we know the type == 0 */
 static unsigned long weighted_cpuload(const int cpu)
 {
-	return cpu_rq(cpu)->load.weight;
+	return (unsigned long)cpu_rq(cpu)->cfs.runnable_load_avg;
 }
 
 /*
@@ -2947,7 +2947,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
 	unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
 
 	if (nr_running)
-		return rq->load.weight / nr_running;
+		return (unsigned long)rq->cfs.runnable_load_avg / nr_running;
 
 	return 0;
 }
-- 
1.7.12


^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [patch v3 6/8] sched: consider runnable load average in move_tasks
  2013-04-02  3:23 [patch v3 0/8] sched: use runnable avg in load balance Alex Shi
                   ` (4 preceding siblings ...)
  2013-04-02  3:23 ` [patch v3 5/8] sched: compute runnable load avg in cpu_load and cpu_avg_load_per_task Alex Shi
@ 2013-04-02  3:23 ` Alex Shi
  2013-04-09  7:08   ` Vincent Guittot
  2013-04-02  3:23 ` [patch v3 7/8] sched: consider runnable load average in effective_load Alex Shi
                   ` (3 subsequent siblings)
  9 siblings, 1 reply; 44+ messages in thread
From: Alex Shi @ 2013-04-02  3:23 UTC (permalink / raw)
  To: mingo, peterz, tglx, akpm, arjan, bp, pjt, namhyung, efault,
	morten.rasmussen
  Cc: vincent.guittot, gregkh, preeti, viresh.kumar, linux-kernel,
	alex.shi, len.brown, rafael.j.wysocki, jkosina, clark.williams,
	tony.luck, keescook, mgorman, riel

Besides using the runnable load average in the background, move_tasks
is also a key function in load balancing. We need to consider the
runnable load average in it as well, in order to get an apples-to-apples
load comparison.
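
A small stand-alone sketch of the arithmetic added here (the numbers are
invented; 47742 is the saturated value of the runnable period sum in this
load-tracking code, used only to make the ratio concrete):

	/* task_h_load_avg_sketch.c - scale the hierarchical load of a task
	 * by its runnable ratio, as move_tasks() will now do. */
	#include <stdio.h>

	int main(void)
	{
		unsigned long task_h_load = 1024;	 /* task_h_load(p) */
		unsigned int runnable_avg_sum = 12000;	 /* ~25% runnable */
		unsigned int runnable_avg_period = 47742;

		unsigned long load = runnable_avg_period ?
			task_h_load * runnable_avg_sum / runnable_avg_period : 0;

		printf("task_h_load(p)     = %lu\n", task_h_load);
		printf("task_h_load_avg(p) = %lu (~25%% of it)\n", load);
		return 0;
	}

So a task that was runnable only a quarter of the time is moved as roughly
a quarter of its weight, which matches the runnable-load view of the
source and destination runqueues.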

Signed-off-by: Alex Shi <alex.shi@intel.com>
---
 kernel/sched/fair.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1f9026e..bf4e0d4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3966,6 +3966,15 @@ static unsigned long task_h_load(struct task_struct *p);
 
 static const unsigned int sched_nr_migrate_break = 32;
 
+static unsigned long task_h_load_avg(struct task_struct *p)
+{
+	u32 period = p->se.avg.runnable_avg_period;
+	if (!period)
+		return 0;
+
+	return task_h_load(p) * p->se.avg.runnable_avg_sum / period;
+}
+
 /*
  * move_tasks tries to move up to imbalance weighted load from busiest to
  * this_rq, as part of a balancing operation within domain "sd".
@@ -4001,7 +4010,7 @@ static int move_tasks(struct lb_env *env)
 		if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
 			goto next;
 
-		load = task_h_load(p);
+		load = task_h_load_avg(p);
 
 		if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
 			goto next;
-- 
1.7.12


^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [patch v3 7/8] sched: consider runnable load average in effective_load
  2013-04-02  3:23 [patch v3 0/8] sched: use runnable avg in load balance Alex Shi
                   ` (5 preceding siblings ...)
  2013-04-02  3:23 ` [patch v3 6/8] sched: consider runnable load average in move_tasks Alex Shi
@ 2013-04-02  3:23 ` Alex Shi
  2013-04-02  3:23 ` [patch v3 8/8] sched: use instant load for burst wake up Alex Shi
                   ` (2 subsequent siblings)
  9 siblings, 0 replies; 44+ messages in thread
From: Alex Shi @ 2013-04-02  3:23 UTC (permalink / raw)
  To: mingo, peterz, tglx, akpm, arjan, bp, pjt, namhyung, efault,
	morten.rasmussen
  Cc: vincent.guittot, gregkh, preeti, viresh.kumar, linux-kernel,
	alex.shi, len.brown, rafael.j.wysocki, jkosina, clark.williams,
	tony.luck, keescook, mgorman, riel

effective_load() calculates the load change as seen from the
root_task_group. It needs to take the runnable average of the changed
task into account.

Thanks to Morten Rasmussen for the reminder about this.
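
A stand-alone sketch of the fixed-point scaling this patch applies in
wake_affine() (assuming NICE_0_LOAD == 1024 and NICE_0_SHIFT == 10 as in
this kernel; the other numbers are invented):

	/* effective_load_scale_sketch.c - scale an effective_load() result
	 * by the task's runnable ratio, expressed as a value in [0, 1024]. */
	#include <stdio.h>

	int main(void)
	{
		const long NICE_0_LOAD = 1024;
		const int NICE_0_SHIFT = 10;

		unsigned int runnable_avg_sum = 23871;	  /* ~50% runnable */
		unsigned int runnable_avg_period = 47742;
		long eff_load = 2048;		/* an effective_load() result */

		long runnable_avg = runnable_avg_sum * NICE_0_LOAD
					/ (runnable_avg_period + 1);
		long scaled = eff_load * runnable_avg >> NICE_0_SHIFT;

		printf("runnable_avg = %ld/1024\n", runnable_avg);
		printf("effective_load: %ld -> %ld\n", eff_load, scaled);
		return 0;
	}

The '+ 1' in the divisor keeps the division safe while the period is still
zero for a freshly forked task.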

Signed-off-by: Alex Shi <alex.shi@intel.com>
---
 kernel/sched/fair.c | 27 ++++++++++++++++++++-------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bf4e0d4..fdb88de 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2976,7 +2976,8 @@ static void task_waking_fair(struct task_struct *p)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 /*
- * effective_load() calculates the load change as seen from the root_task_group
+ * effective_load() calculates the runnable load average change as seen from
+ * the root_task_group
  *
  * Adding load to a group doesn't make a group heavier, but can cause movement
  * of group shares between cpus. Assuming the shares were perfectly aligned one
@@ -3024,6 +3025,9 @@ static void task_waking_fair(struct task_struct *p)
  * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
  * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
  * 4/7) times the weight of the group.
+ *
+ * After getting the effective_load of the load being moved, scale it by
+ * the sched entity's runnable avg.
  */
 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 {
@@ -3098,6 +3102,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 	struct task_group *tg;
 	unsigned long weight;
 	int balanced;
+	int runnable_avg;
 
 	idx	  = sd->wake_idx;
 	this_cpu  = smp_processor_id();
@@ -3113,13 +3118,19 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 	if (sync) {
 		tg = task_group(current);
 		weight = current->se.load.weight;
+		runnable_avg = current->se.avg.runnable_avg_sum * NICE_0_LOAD
+				/ (current->se.avg.runnable_avg_period + 1);
 
-		this_load += effective_load(tg, this_cpu, -weight, -weight);
-		load += effective_load(tg, prev_cpu, 0, -weight);
+		this_load += effective_load(tg, this_cpu, -weight, -weight)
+				* runnable_avg >> NICE_0_SHIFT;
+		load += effective_load(tg, prev_cpu, 0, -weight)
+				* runnable_avg >> NICE_0_SHIFT;
 	}
 
 	tg = task_group(p);
 	weight = p->se.load.weight;
+	runnable_avg = p->se.avg.runnable_avg_sum * NICE_0_LOAD
+				/ (p->se.avg.runnable_avg_period + 1);
 
 	/*
 	 * In low-load situations, where prev_cpu is idle and this_cpu is idle
@@ -3131,16 +3142,18 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 	 * task to be woken on this_cpu.
 	 */
 	if (this_load > 0) {
-		s64 this_eff_load, prev_eff_load;
+		s64 this_eff_load, prev_eff_load, tmp_eff_load;
 
 		this_eff_load = 100;
 		this_eff_load *= power_of(prev_cpu);
-		this_eff_load *= this_load +
-			effective_load(tg, this_cpu, weight, weight);
+		tmp_eff_load = effective_load(tg, this_cpu, weight, weight)
+				* runnable_avg >> NICE_0_SHIFT;
+		this_eff_load *= this_load + tmp_eff_load;
 
 		prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
 		prev_eff_load *= power_of(this_cpu);
-		prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
+		prev_eff_load *= load + (effective_load(tg, prev_cpu, 0, weight)
+						* runnable_avg >> NICE_0_SHIFT);
 
 		balanced = this_eff_load <= prev_eff_load;
 	} else
-- 
1.7.12


^ permalink raw reply related	[flat|nested] 44+ messages in thread

* [patch v3 8/8] sched: use instant load for burst wake up
  2013-04-02  3:23 [patch v3 0/8] sched: use runnable avg in load balance Alex Shi
                   ` (6 preceding siblings ...)
  2013-04-02  3:23 ` [patch v3 7/8] sched: consider runnable load average in effective_load Alex Shi
@ 2013-04-02  3:23 ` Alex Shi
  2013-04-02  7:23 ` [patch v3 0/8] sched: use runnable avg in load balance Michael Wang
  2013-04-09  5:08 ` Alex Shi
  9 siblings, 0 replies; 44+ messages in thread
From: Alex Shi @ 2013-04-02  3:23 UTC (permalink / raw)
  To: mingo, peterz, tglx, akpm, arjan, bp, pjt, namhyung, efault,
	morten.rasmussen
  Cc: vincent.guittot, gregkh, preeti, viresh.kumar, linux-kernel,
	alex.shi, len.brown, rafael.j.wysocki, jkosina, clark.williams,
	tony.luck, keescook, mgorman, riel

If many tasks sleep for a long time, their runnable load becomes zero.
And if they are woken up in a burst, the overly light runnable load
causes a big imbalance among CPUs, so benchmarks such as aim9 drop 5~7%.

With this patch the loss is recovered, and the result is even slightly
better.
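
For clarity, a stand-alone sketch of the burst test
(sysctl_sched_migration_cost defaults to 500000ns in this kernel; the
avg_idle values are invented):

	/* burst_detect_sketch.c - the decision added to wake_affine(): a cpu
	 * whose avg_idle has collapsed below the migration cost is treated
	 * as bursty and judged by its instant rq->load.weight instead of
	 * the decayed runnable load. */
	#include <stdio.h>

	int main(void)
	{
		unsigned long migration_cost = 500000;	/* 0.5ms, the default */
		unsigned long this_avg_idle = 120000;	/* woken very frequently */
		unsigned long prev_avg_idle = 3000000;	/* mostly idle lately */

		int burst_this = this_avg_idle < migration_cost;
		int burst_prev = prev_avg_idle < migration_cost;

		printf("this_cpu uses %s\n",
		       burst_this ? "rq->load.weight" : "target_load()");
		printf("prev_cpu uses %s\n",
		       burst_prev ? "rq->load.weight" : "source_load()");
		return 0;
	}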

Signed-off-by: Alex Shi <alex.shi@intel.com>
---
 kernel/sched/fair.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fdb88de..c2f6e4e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3103,12 +3103,26 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 	unsigned long weight;
 	int balanced;
 	int runnable_avg;
+	int burst_prev = 0, burst_this = 0;
 
 	idx	  = sd->wake_idx;
 	this_cpu  = smp_processor_id();
 	prev_cpu  = task_cpu(p);
-	load	  = source_load(prev_cpu, idx);
-	this_load = target_load(this_cpu, idx);
+
+	if (cpu_rq(this_cpu)->avg_idle < sysctl_sched_migration_cost)
+		burst_this = 1;
+	if (cpu_rq(prev_cpu)->avg_idle < sysctl_sched_migration_cost)
+		burst_prev = 1;
+
+	/* use instant load on bursty waking up cpu */
+	if (!burst_prev)
+		load = source_load(prev_cpu, idx);
+	else
+		load = cpu_rq(prev_cpu)->load.weight;
+	if (!burst_this)
+		this_load = target_load(this_cpu, idx);
+	else
+		this_load = cpu_rq(this_cpu)->load.weight;
 
 	/*
 	 * If sync wakeup then subtract the (maximum possible)
-- 
1.7.12


^ permalink raw reply related	[flat|nested] 44+ messages in thread

* Re: [patch v3 0/8] sched: use runnable avg in load balance
  2013-04-02  3:23 [patch v3 0/8] sched: use runnable avg in load balance Alex Shi
                   ` (7 preceding siblings ...)
  2013-04-02  3:23 ` [patch v3 8/8] sched: use instant load for burst wake up Alex Shi
@ 2013-04-02  7:23 ` Michael Wang
  2013-04-02  8:34   ` Mike Galbraith
                     ` (2 more replies)
  2013-04-09  5:08 ` Alex Shi
  9 siblings, 3 replies; 44+ messages in thread
From: Michael Wang @ 2013-04-02  7:23 UTC (permalink / raw)
  To: Alex Shi
  Cc: mingo, peterz, tglx, akpm, arjan, bp, pjt, namhyung, efault,
	morten.rasmussen, vincent.guittot, gregkh, preeti, viresh.kumar,
	linux-kernel, len.brown, rafael.j.wysocki, jkosina,
	clark.williams, tony.luck, keescook, mgorman, riel

On 04/02/2013 11:23 AM, Alex Shi wrote:
[snip]
> 
> [patch v3 1/8] Revert "sched: Introduce temporary FAIR_GROUP_SCHED
> [patch v3 2/8] sched: set initial value of runnable avg for new
> [patch v3 3/8] sched: only count runnable avg on cfs_rq's nr_running
> [patch v3 4/8] sched: update cpu load after task_tick.
> [patch v3 5/8] sched: compute runnable load avg in cpu_load and
> [patch v3 6/8] sched: consider runnable load average in move_tasks
> [patch v3 7/8] sched: consider runnable load average in
> [patch v3 8/8] sched: use instant load for burst wake up

I've tested the patch set on a 12-cpu x86 box with 3.9.0-rc2, and pgbench
shows a regression on the high end this time.

| db_size | clients |  tps  |   |  tps  |
+---------+---------+-------+   +-------+
| 22 MB   |       1 | 10662 |   | 10446 |
| 22 MB   |       2 | 21483 |   | 20887 |
| 22 MB   |       4 | 42046 |   | 41266 |
| 22 MB   |       8 | 55807 |   | 51987 |
| 22 MB   |      12 | 50768 |   | 50974 |
| 22 MB   |      16 | 49880 |   | 49510 |
| 22 MB   |      24 | 45904 |   | 42398 |
| 22 MB   |      32 | 43420 |   | 40995 |
| 7484 MB |       1 |  7965 |   |  7376 |
| 7484 MB |       2 | 19354 |   | 19149 |
| 7484 MB |       4 | 37552 |   | 37458 |
| 7484 MB |       8 | 48655 |   | 46618 |
| 7484 MB |      12 | 45778 |   | 45756 |
| 7484 MB |      16 | 45659 |   | 44911 |
| 7484 MB |      24 | 42192 |   | 37185 |	-11.87%
| 7484 MB |      32 | 36385 |   | 34447 |
| 15 GB   |       1 |  7677 |   |  7359 |
| 15 GB   |       2 | 19227 |   | 19049 |
| 15 GB   |       4 | 37335 |   | 36947 |
| 15 GB   |       8 | 48130 |   | 46898 |
| 15 GB   |      12 | 45393 |   | 43986 |
| 15 GB   |      16 | 45110 |   | 45719 |
| 15 GB   |      24 | 41415 |   | 36813 |	-11.11%
| 15 GB   |      32 | 35988 |   | 34025 |

The reason may be wake_affine()'s higher overhead, and pgbench is
really sensitive to this stuff...

Regards,
Michael Wang

> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> 


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [patch v3 0/8] sched: use runnable avg in load balance
  2013-04-02  7:23 ` [patch v3 0/8] sched: use runnable avg in load balance Michael Wang
@ 2013-04-02  8:34   ` Mike Galbraith
  2013-04-02  9:13     ` Michael Wang
  2013-04-02  8:35   ` Alex Shi
  2013-04-03  8:46   ` Alex Shi
  2 siblings, 1 reply; 44+ messages in thread
From: Mike Galbraith @ 2013-04-02  8:34 UTC (permalink / raw)
  To: Michael Wang
  Cc: Alex Shi, mingo, peterz, tglx, akpm, arjan, bp, pjt, namhyung,
	morten.rasmussen, vincent.guittot, gregkh, preeti, viresh.kumar,
	linux-kernel, len.brown, rafael.j.wysocki, jkosina,
	clark.williams, tony.luck, keescook, mgorman, riel

On Tue, 2013-04-02 at 15:23 +0800, Michael Wang wrote: 
> On 04/02/2013 11:23 AM, Alex Shi wrote:
> [snip]
> > 
> > [patch v3 1/8] Revert "sched: Introduce temporary FAIR_GROUP_SCHED
> > [patch v3 2/8] sched: set initial value of runnable avg for new
> > [patch v3 3/8] sched: only count runnable avg on cfs_rq's nr_running
> > [patch v3 4/8] sched: update cpu load after task_tick.
> > [patch v3 5/8] sched: compute runnable load avg in cpu_load and
> > [patch v3 6/8] sched: consider runnable load average in move_tasks
> > [patch v3 7/8] sched: consider runnable load average in
> > [patch v3 8/8] sched: use instant load for burst wake up
> 
> I've tested the patch set on 12 cpu X86 box with 3.9.0-rc2, and pgbench
> show regression on high-end this time.
> 
> | db_size | clients |  tps  |   |  tps  |
> +---------+---------+-------+   +-------+
> | 22 MB   |       1 | 10662 |   | 10446 |
> | 22 MB   |       2 | 21483 |   | 20887 |
> | 22 MB   |       4 | 42046 |   | 41266 |
> | 22 MB   |       8 | 55807 |   | 51987 |
> | 22 MB   |      12 | 50768 |   | 50974 |
> | 22 MB   |      16 | 49880 |   | 49510 |
> | 22 MB   |      24 | 45904 |   | 42398 |
> | 22 MB   |      32 | 43420 |   | 40995 |
> | 7484 MB |       1 |  7965 |   |  7376 |
> | 7484 MB |       2 | 19354 |   | 19149 |
> | 7484 MB |       4 | 37552 |   | 37458 |
> | 7484 MB |       8 | 48655 |   | 46618 |
> | 7484 MB |      12 | 45778 |   | 45756 |
> | 7484 MB |      16 | 45659 |   | 44911 |
> | 7484 MB |      24 | 42192 |   | 37185 |	-11.87%
> | 7484 MB |      32 | 36385 |   | 34447 |
> | 15 GB   |       1 |  7677 |   |  7359 |
> | 15 GB   |       2 | 19227 |   | 19049 |
> | 15 GB   |       4 | 37335 |   | 36947 |
> | 15 GB   |       8 | 48130 |   | 46898 |
> | 15 GB   |      12 | 45393 |   | 43986 |
> | 15 GB   |      16 | 45110 |   | 45719 |
> | 15 GB   |      24 | 41415 |   | 36813 |	-11.11%
> | 15 GB   |      32 | 35988 |   | 34025 |
> 
> The reason may caused by wake_affine()'s higher overhead, and pgbench is
> really sensitive to this stuff...

For grins, you could try running the whole thing SCHED_BATCH.  (/me sees
singing/dancing red herring whenever wake_affine() and pgbench appear in
the same sentence;)

-Mike


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [patch v3 0/8] sched: use runnable avg in load balance
  2013-04-02  7:23 ` [patch v3 0/8] sched: use runnable avg in load balance Michael Wang
  2013-04-02  8:34   ` Mike Galbraith
@ 2013-04-02  8:35   ` Alex Shi
  2013-04-02  9:45     ` Michael Wang
  2013-04-03  2:46     ` Michael Wang
  2013-04-03  8:46   ` Alex Shi
  2 siblings, 2 replies; 44+ messages in thread
From: Alex Shi @ 2013-04-02  8:35 UTC (permalink / raw)
  To: Michael Wang
  Cc: mingo, peterz, tglx, akpm, arjan, bp, pjt, namhyung, efault,
	morten.rasmussen, vincent.guittot, gregkh, preeti, viresh.kumar,
	linux-kernel, len.brown, rafael.j.wysocki, jkosina,
	clark.williams, tony.luck, keescook, mgorman, riel

On 04/02/2013 03:23 PM, Michael Wang wrote:
>> > [patch v3 8/8] sched: use instant load for burst wake up
> I've tested the patch set on 12 cpu X86 box with 3.9.0-rc2, and pgbench
> show regression on high-end this time.
> 
> | db_size | clients |  tps  |   |  tps  |
> +---------+---------+-------+   +-------+
> | 22 MB   |       1 | 10662 |   | 10446 |
> | 22 MB   |       2 | 21483 |   | 20887 |
> | 22 MB   |       4 | 42046 |   | 41266 |
> | 22 MB   |       8 | 55807 |   | 51987 |
> | 22 MB   |      12 | 50768 |   | 50974 |
> | 22 MB   |      16 | 49880 |   | 49510 |
> | 22 MB   |      24 | 45904 |   | 42398 |
> | 22 MB   |      32 | 43420 |   | 40995 |
> | 7484 MB |       1 |  7965 |   |  7376 |
> | 7484 MB |       2 | 19354 |   | 19149 |
> | 7484 MB |       4 | 37552 |   | 37458 |
> | 7484 MB |       8 | 48655 |   | 46618 |
> | 7484 MB |      12 | 45778 |   | 45756 |
> | 7484 MB |      16 | 45659 |   | 44911 |
> | 7484 MB |      24 | 42192 |   | 37185 |	-11.87%
> | 7484 MB |      32 | 36385 |   | 34447 |
> | 15 GB   |       1 |  7677 |   |  7359 |
> | 15 GB   |       2 | 19227 |   | 19049 |
> | 15 GB   |       4 | 37335 |   | 36947 |
> | 15 GB   |       8 | 48130 |   | 46898 |
> | 15 GB   |      12 | 45393 |   | 43986 |
> | 15 GB   |      16 | 45110 |   | 45719 |
> | 15 GB   |      24 | 41415 |   | 36813 |	-11.11%
> | 15 GB   |      32 | 35988 |   | 34025 |
> 
> The reason may caused by wake_affine()'s higher overhead, and pgbench is
> really sensitive to this stuff...

Thanks for testing. Could you remove the last patch and test it
again? I want to know whether the last patch has an effect on pgbench.

-- 
Thanks Alex

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [patch v3 0/8] sched: use runnable avg in load balance
  2013-04-02  8:34   ` Mike Galbraith
@ 2013-04-02  9:13     ` Michael Wang
  0 siblings, 0 replies; 44+ messages in thread
From: Michael Wang @ 2013-04-02  9:13 UTC (permalink / raw)
  To: Mike Galbraith
  Cc: Alex Shi, mingo, peterz, tglx, akpm, arjan, bp, pjt, namhyung,
	morten.rasmussen, vincent.guittot, gregkh, preeti, viresh.kumar,
	linux-kernel, len.brown, rafael.j.wysocki, jkosina,
	clark.williams, tony.luck, keescook, mgorman, riel

On 04/02/2013 04:34 PM, Mike Galbraith wrote:
[snip]
>> The reason may caused by wake_affine()'s higher overhead, and pgbench is
>> really sensitive to this stuff...
> 
> For grins, you could try running the whole thing SCHED_BATCH.  (/me sees
> singing/dancing red herring whenever wake_affine() and pgbench appear in
> the same sentence;)

I saw the patch touched wake_affine(), so I was just interested in what
would happen ;-)

The patch changes the overhead of wake_affine() and also influences its
result; I used to think the latter might help pgbench...

Regards,
Michael Wang

> 
> -Mike
> 


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [patch v3 0/8] sched: use runnable avg in load balance
  2013-04-02  8:35   ` Alex Shi
@ 2013-04-02  9:45     ` Michael Wang
  2013-04-03  2:46     ` Michael Wang
  1 sibling, 0 replies; 44+ messages in thread
From: Michael Wang @ 2013-04-02  9:45 UTC (permalink / raw)
  To: Alex Shi
  Cc: mingo, peterz, tglx, akpm, arjan, bp, pjt, namhyung, efault,
	morten.rasmussen, vincent.guittot, gregkh, preeti, viresh.kumar,
	linux-kernel, len.brown, rafael.j.wysocki, jkosina,
	clark.williams, tony.luck, keescook, mgorman, riel

On 04/02/2013 04:35 PM, Alex Shi wrote:
> On 04/02/2013 03:23 PM, Michael Wang wrote:
[snip]
>>
>> The reason may caused by wake_affine()'s higher overhead, and pgbench is
>> really sensitive to this stuff...
> 
> Thanks for testing. Could you like to remove the last patch and test it
> again? I want to know if the last patch has effect on pgbench.

Amazing: without the last one, pgbench shows a very good improvement,
higher than with the 10ms throttle, lower than with the 100ms throttle. I
need to confirm this with an overnight test run.

I will look into those patches in detail later. I think they address part
of the wake_affine() issue (making the decision more accurate), that's
nice ;-)

Regards,
Michael Wang

> 


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [patch v3 0/8] sched: use runnable avg in load balance
  2013-04-02  8:35   ` Alex Shi
  2013-04-02  9:45     ` Michael Wang
@ 2013-04-03  2:46     ` Michael Wang
  2013-04-03  2:56       ` Alex Shi
  1 sibling, 1 reply; 44+ messages in thread
From: Michael Wang @ 2013-04-03  2:46 UTC (permalink / raw)
  To: Alex Shi
  Cc: mingo, peterz, tglx, akpm, arjan, bp, pjt, namhyung, efault,
	morten.rasmussen, vincent.guittot, gregkh, preeti, viresh.kumar,
	linux-kernel, len.brown, rafael.j.wysocki, jkosina,
	clark.williams, tony.luck, keescook, mgorman, riel

On 04/02/2013 04:35 PM, Alex Shi wrote:
[snip]
>>
>> The reason may caused by wake_affine()'s higher overhead, and pgbench is
>> really sensitive to this stuff...
> 
> Thanks for testing. Could you like to remove the last patch and test it
> again? I want to know if the last patch has effect on pgbench.

Done, here are the results of pgbench without the last patch on my box:

| db_size | clients |  tps  |   |  tps  |
+---------+---------+-------+   +-------+
| 22 MB   |       1 | 10662 |   | 10679 |
| 22 MB   |       2 | 21483 |   | 21471 |
| 22 MB   |       4 | 42046 |   | 41957 |
| 22 MB   |       8 | 55807 |   | 55684 |
| 22 MB   |      12 | 50768 |   | 52074 |
| 22 MB   |      16 | 49880 |   | 52879 |
| 22 MB   |      24 | 45904 |   | 53406 |
| 22 MB   |      32 | 43420 |   | 54088 |	+24.57%
| 7484 MB |       1 |  7965 |   |  7725 |
| 7484 MB |       2 | 19354 |   | 19405 |
| 7484 MB |       4 | 37552 |   | 37246 |
| 7484 MB |       8 | 48655 |   | 50613 |
| 7484 MB |      12 | 45778 |   | 47639 |
| 7484 MB |      16 | 45659 |   | 48707 |
| 7484 MB |      24 | 42192 |   | 46469 |
| 7484 MB |      32 | 36385 |   | 46346 |	+27.38%
| 15 GB   |       1 |  7677 |   |  7727 |
| 15 GB   |       2 | 19227 |   | 19199 |
| 15 GB   |       4 | 37335 |   | 37372 |
| 15 GB   |       8 | 48130 |   | 50333 |
| 15 GB   |      12 | 45393 |   | 47590 |
| 15 GB   |      16 | 45110 |   | 48091 |
| 15 GB   |      24 | 41415 |   | 47415 |
| 15 GB   |      32 | 35988 |   | 45749 |	+27.12%

Very nice improvement. I'd like to test it with the wake-affine throttle
patch later, let's see what will happen ;-)

Any idea why the last one caused the regression?

Regards,
Michael Wang

> 


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [patch v3 0/8] sched: use runnable avg in load balance
  2013-04-03  2:46     ` Michael Wang
@ 2013-04-03  2:56       ` Alex Shi
  2013-04-03  3:23         ` Michael Wang
  0 siblings, 1 reply; 44+ messages in thread
From: Alex Shi @ 2013-04-03  2:56 UTC (permalink / raw)
  To: Michael Wang
  Cc: mingo, peterz, tglx, akpm, arjan, bp, pjt, namhyung, efault,
	morten.rasmussen, vincent.guittot, gregkh, preeti, viresh.kumar,
	linux-kernel, len.brown, rafael.j.wysocki, jkosina,
	clark.williams, tony.luck, keescook, mgorman, riel

On 04/03/2013 10:46 AM, Michael Wang wrote:
> | 15 GB   |      16 | 45110 |   | 48091 |
> | 15 GB   |      24 | 41415 |   | 47415 |
> | 15 GB   |      32 | 35988 |   | 45749 |	+27.12%
> 
> Very nice improvement, I'd like to test it with the wake-affine throttle
> patch later, let's see what will happen ;-)
> 
> Any idea on why the last one caused the regression?

You can change the burst threshold, sysctl_sched_migration_cost, to see
what happens with different values, or create a similar knob and tune it.
+
+	if (cpu_rq(this_cpu)->avg_idle < sysctl_sched_migration_cost)
+		burst_this = 1;
+	if (cpu_rq(prev_cpu)->avg_idle < sysctl_sched_migration_cost)
+		burst_prev = 1;
+


BTW, what is the thread behaviour of pgbench? I guess it does a lot of
wakeups. What is the work/sleep ratio of pgbench?
-- 
Thanks Alex

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [patch v3 3/8] sched: only count runnable avg on cfs_rq's nr_running
  2013-04-02  3:23 ` [patch v3 3/8] sched: only count runnable avg on cfs_rq's nr_running Alex Shi
@ 2013-04-03  3:19   ` Alex Shi
  0 siblings, 0 replies; 44+ messages in thread
From: Alex Shi @ 2013-04-03  3:19 UTC (permalink / raw)
  To: Alex Shi
  Cc: mingo, peterz, tglx, akpm, arjan, bp, pjt, namhyung, efault,
	morten.rasmussen, vincent.guittot, gregkh, preeti, viresh.kumar,
	linux-kernel, len.brown, rafael.j.wysocki, jkosina,
	clark.williams, tony.luck, keescook, mgorman, riel

On 04/02/2013 11:23 AM, Alex Shi wrote:
> Old function count the runnable avg on rq's nr_running even there is
> only rt task in rq. That is incorrect, so correct it to cfs_rq's
> nr_running.

This patch is incorrect and has been removed from
	https://github.com/alexshi/power-scheduling.git runnablelb


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [patch v3 0/8] sched: use runnable avg in load balance
  2013-04-03  2:56       ` Alex Shi
@ 2013-04-03  3:23         ` Michael Wang
  2013-04-03  4:28           ` Alex Shi
  0 siblings, 1 reply; 44+ messages in thread
From: Michael Wang @ 2013-04-03  3:23 UTC (permalink / raw)
  To: Alex Shi
  Cc: mingo, peterz, tglx, akpm, arjan, bp, pjt, namhyung, efault,
	morten.rasmussen, vincent.guittot, gregkh, preeti, viresh.kumar,
	linux-kernel, len.brown, rafael.j.wysocki, jkosina,
	clark.williams, tony.luck, keescook, mgorman, riel

On 04/03/2013 10:56 AM, Alex Shi wrote:
> On 04/03/2013 10:46 AM, Michael Wang wrote:
>> | 15 GB   |      16 | 45110 |   | 48091 |
>> | 15 GB   |      24 | 41415 |   | 47415 |
>> | 15 GB   |      32 | 35988 |   | 45749 |	+27.12%
>>
>> Very nice improvement, I'd like to test it with the wake-affine throttle
>> patch later, let's see what will happen ;-)
>>
>> Any idea on why the last one caused the regression?
> 
> you can change the burst threshold: sysctl_sched_migration_cost, to see
> what's happen with different value. create a similar knob and tune it.
> +
> +	if (cpu_rq(this_cpu)->avg_idle < sysctl_sched_migration_cost)
> +		burst_this = 1;
> +	if (cpu_rq(prev_cpu)->avg_idle < sysctl_sched_migration_cost)
> +		burst_prev = 1;
> +
> 
> 

This changes how often cpu_rq(cpu)->load.weight is adopted, correct?

So if the rq is busy, is cpu_rq(cpu)->load.weight good enough to stand
for the load status of the rq? What is the real idea here?

> BTW, what's the job thread behaviour of pgbench, guess it has lots of
> wakeup. what's the work and sleep ratio of pgbench?

I won't do a summary unless I have reviewed its code :) What I know is
that it's a database benchmark, with several processes operating on a
database; see below for details:

pgbench is a simple program for running benchmark tests on PostgreSQL.
It runs the same sequence of SQL commands over and over, possibly in
multiple concurrent database sessions, and then calculates the average
transaction rate (transactions per second). By default, pgbench tests a
scenario that is loosely based on TPC-B, involving five SELECT, UPDATE,
and INSERT commands per transaction. However, it is easy to test other
cases by writing your own transaction script files.

Regards,
Michael Wang

> 


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [patch v3 0/8] sched: use runnable avg in load balance
  2013-04-03  3:23         ` Michael Wang
@ 2013-04-03  4:28           ` Alex Shi
  2013-04-03  5:38             ` Michael Wang
  2013-04-03  6:22             ` Michael Wang
  0 siblings, 2 replies; 44+ messages in thread
From: Alex Shi @ 2013-04-03  4:28 UTC (permalink / raw)
  To: Michael Wang
  Cc: mingo, peterz, tglx, akpm, arjan, bp, pjt, namhyung, efault,
	morten.rasmussen, vincent.guittot, gregkh, preeti, viresh.kumar,
	linux-kernel, len.brown, rafael.j.wysocki, jkosina,
	clark.williams, tony.luck, keescook, mgorman, riel

On 04/03/2013 11:23 AM, Michael Wang wrote:
> On 04/03/2013 10:56 AM, Alex Shi wrote:
>> On 04/03/2013 10:46 AM, Michael Wang wrote:
>>> | 15 GB   |      16 | 45110 |   | 48091 |
>>> | 15 GB   |      24 | 41415 |   | 47415 |
>>> | 15 GB   |      32 | 35988 |   | 45749 |	+27.12%
>>>
>>> Very nice improvement, I'd like to test it with the wake-affine throttle
>>> patch later, let's see what will happen ;-)
>>>
>>> Any idea on why the last one caused the regression?
>>
>> you can change the burst threshold: sysctl_sched_migration_cost, to see
>> what's happen with different value. create a similar knob and tune it.
>> +
>> +	if (cpu_rq(this_cpu)->avg_idle < sysctl_sched_migration_cost)
>> +		burst_this = 1;
>> +	if (cpu_rq(prev_cpu)->avg_idle < sysctl_sched_migration_cost)
>> +		burst_prev = 1;
>> +
>>
>>
> 
> This changing the rate of adopt cpu_rq(cpu)->load.weight, correct?
> 
> So if rq is busy, cpu_rq(cpu)->load.weight is capable enough to stand
> for the load status of rq? what's the really idea here?

This patch tries to resolve the aim7-like benchmark regression.
If many tasks sleep for a long time, their runnable load becomes zero. And
then if they are woken up in a burst, the overly light runnable load causes
a big imbalance in select_task_rq. So benchmarks such as aim9 drop 5~7%.

This patch tries to detect the burst and, if detected, uses the load weight
directly instead of the (near-zero) runnable load avg to avoid the imbalance.
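
To make the "zero runnable load after a long sleep" point concrete, a
small stand-alone sketch (assuming the roughly 32ms half-life of the
per-entity load-tracking decay; not kernel code):

	/* sleep_decay_sketch.c - how quickly a sleeping task's load
	 * contribution fades; the real decay is a per-1ms geometric series,
	 * the halving per 32ms below is a coarse approximation. */
	#include <stdio.h>

	int main(void)
	{
		unsigned long contrib = 1024;	/* nice-0 task, fully runnable */
		int ms;

		for (ms = 32; ms <= 320; ms += 32)
			printf("asleep %3dms -> contribution ~%lu\n",
			       ms, contrib >> (ms / 32));
		return 0;
	}

After a few hundred milliseconds of sleep the contribution is essentially
zero, so a burst of such wakeups looks like an idle system to a purely
runnable-load-based select_task_rq/wake_affine.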

But the patch may cause some unfairness if this/prev cpu do not burst at
the same time. So could you try the following patch?


From 4722a7567dccfb19aa5afbb49982ffb6d65e6ae5 Mon Sep 17 00:00:00 2001
From: Alex Shi <alex.shi@intel.com>
Date: Tue, 2 Apr 2013 10:27:45 +0800
Subject: [PATCH] sched: use instant load for burst wake up

If many tasks sleep for a long time, their runnable load becomes zero. And
if they are woken up in a burst, the overly light runnable load causes a
big imbalance among CPUs. So benchmarks such as aim9 drop 5~7%.

With this patch the loss is recovered, and the result is even slightly
better.

Signed-off-by: Alex Shi <alex.shi@intel.com>
---
 kernel/sched/fair.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dbaa8ca..25ac437 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3103,12 +3103,24 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 	unsigned long weight;
 	int balanced;
 	int runnable_avg;
+	int burst = 0;
 
 	idx	  = sd->wake_idx;
 	this_cpu  = smp_processor_id();
 	prev_cpu  = task_cpu(p);
-	load	  = source_load(prev_cpu, idx);
-	this_load = target_load(this_cpu, idx);
+
+	if (cpu_rq(this_cpu)->avg_idle < sysctl_sched_migration_cost ||
+		cpu_rq(prev_cpu)->avg_idle < sysctl_sched_migration_cost)
+		burst= 1;
+
+	/* use instant load for bursty waking up */
+	if (!burst) {
+		load = source_load(prev_cpu, idx);
+		this_load = target_load(this_cpu, idx);
+	} else {
+		load = cpu_rq(prev_cpu)->load.weight;
+		this_load = cpu_rq(this_cpu)->load.weight;
+	}
 
 	/*
 	 * If sync wakeup then subtract the (maximum possible)
-- 
1.7.12


^ permalink raw reply related	[flat|nested] 44+ messages in thread

* Re: [patch v3 0/8] sched: use runnable avg in load balance
  2013-04-03  4:28           ` Alex Shi
@ 2013-04-03  5:38             ` Michael Wang
  2013-04-03  5:53               ` Michael Wang
  2013-04-03  6:01               ` Alex Shi
  2013-04-03  6:22             ` Michael Wang
  1 sibling, 2 replies; 44+ messages in thread
From: Michael Wang @ 2013-04-03  5:38 UTC (permalink / raw)
  To: Alex Shi
  Cc: mingo, peterz, tglx, akpm, arjan, bp, pjt, namhyung, efault,
	morten.rasmussen, vincent.guittot, gregkh, preeti, viresh.kumar,
	linux-kernel, len.brown, rafael.j.wysocki, jkosina,
	clark.williams, tony.luck, keescook, mgorman, riel

On 04/03/2013 12:28 PM, Alex Shi wrote:
[snip]
> 
> but the patch may cause some unfairness if this/prev cpu are not burst at 
> same time. So could like try the following patch?

I will try it later, some doubt below :)

[snip]
> +
> +	if (cpu_rq(this_cpu)->avg_idle < sysctl_sched_migration_cost ||
> +		cpu_rq(prev_cpu)->avg_idle < sysctl_sched_migration_cost)
> +		burst= 1;
> +
> +	/* use instant load for bursty waking up */
> +	if (!burst) {
> +		load = source_load(prev_cpu, idx);
> +		this_load = target_load(this_cpu, idx);
> +	} else {
> +		load = cpu_rq(prev_cpu)->load.weight;
> +		this_load = cpu_rq(this_cpu)->load.weight;

OK, my understanding is that we want to pull if 'prev_cpu' is bursting, and
don't want to pull if 'this_cpu' is bursting, correct?

And we do this by guessing whether the load is higher or lower, is that
right?

And I think target_load() is capable of choosing the higher load; if
'cpu_rq(cpu)->load.weight' is really higher, the result will be the same.

So what about this:

	/* prefer higher load if burst */
	load = burst_prev ?
		target_load(prev_cpu, idx) : source_load(prev_cpu, idx);

	this_load = target_load(this_cpu, idx);

Regards,
Michael Wang

> +	}
> 
>  	/*
>  	 * If sync wakeup then subtract the (maximum possible)
> 


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [patch v3 0/8] sched: use runnable avg in load balance
  2013-04-03  5:38             ` Michael Wang
@ 2013-04-03  5:53               ` Michael Wang
  2013-04-03  6:01               ` Alex Shi
  1 sibling, 0 replies; 44+ messages in thread
From: Michael Wang @ 2013-04-03  5:53 UTC (permalink / raw)
  To: Alex Shi
  Cc: mingo, peterz, tglx, akpm, arjan, bp, pjt, namhyung, efault,
	morten.rasmussen, vincent.guittot, gregkh, preeti, viresh.kumar,
	linux-kernel, len.brown, rafael.j.wysocki, jkosina,
	clark.williams, tony.luck, keescook, mgorman, riel

On 04/03/2013 01:38 PM, Michael Wang wrote:
> On 04/03/2013 12:28 PM, Alex Shi wrote:
> [snip]
>>
>> but the patch may cause some unfairness if this/prev cpu are not burst at 
>> same time. So could like try the following patch?
> 
> I will try it later, some doubt below :)
> 
> [snip]
>> +
>> +	if (cpu_rq(this_cpu)->avg_idle < sysctl_sched_migration_cost ||
>> +		cpu_rq(prev_cpu)->avg_idle < sysctl_sched_migration_cost)
>> +		burst= 1;
>> +
>> +	/* use instant load for bursty waking up */
>> +	if (!burst) {
>> +		load = source_load(prev_cpu, idx);
>> +		this_load = target_load(this_cpu, idx);
>> +	} else {
>> +		load = cpu_rq(prev_cpu)->load.weight;
>> +		this_load = cpu_rq(this_cpu)->load.weight;
> 
> Ok, my understanding is, we want pull if 'prev_cpu' is burst, and don't
> want pull if 'this_cpu' is burst, correct?
> 
> And we do this by guess the load higher or lower, is that right?
> 
> And I think target_load() is capable enough to choose the higher load,
> if 'cpu_rq(cpu)->load.weight' is really higher, the results will be the
> same.
> 
> So what about this:
> 
> 	/* prefer higher load if burst */
> 	load = burst_prev ?

And this check could also be:

	load = burst_prev && !burst_this ?

if we don't want to prefer the pull when this_cpu has also bursted.

Regards,
Michael Wang

> 		target_load(prev_cpu, idx) : source_load(prev_cpu, idx);
> 
> 	this_load = target_load(this_cpu, idx);
> 
> Regards,
> Michael Wang
> 
>> +	}
>>
>>  	/*
>>  	 * If sync wakeup then subtract the (maximum possible)
>>
> 


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [patch v3 0/8] sched: use runnable avg in load balance
  2013-04-03  5:38             ` Michael Wang
  2013-04-03  5:53               ` Michael Wang
@ 2013-04-03  6:01               ` Alex Shi
  1 sibling, 0 replies; 44+ messages in thread
From: Alex Shi @ 2013-04-03  6:01 UTC (permalink / raw)
  To: Michael Wang
  Cc: mingo, peterz, tglx, akpm, arjan, bp, pjt, namhyung, efault,
	morten.rasmussen, vincent.guittot, gregkh, preeti, viresh.kumar,
	linux-kernel, len.brown, rafael.j.wysocki, jkosina,
	clark.williams, tony.luck, keescook, mgorman, riel

On 04/03/2013 01:38 PM, Michael Wang wrote:
> On 04/03/2013 12:28 PM, Alex Shi wrote:
> [snip]
>>
>> but the patch may cause some unfairness if this/prev cpu are not burst at 
>> same time. So could like try the following patch?
> 
> I will try it later, some doubt below :)
> 
> [snip]
>> +
>> +	if (cpu_rq(this_cpu)->avg_idle < sysctl_sched_migration_cost ||
>> +		cpu_rq(prev_cpu)->avg_idle < sysctl_sched_migration_cost)
>> +		burst= 1;
>> +
>> +	/* use instant load for bursty waking up */
>> +	if (!burst) {
>> +		load = source_load(prev_cpu, idx);
>> +		this_load = target_load(this_cpu, idx);
>> +	} else {
>> +		load = cpu_rq(prev_cpu)->load.weight;
>> +		this_load = cpu_rq(this_cpu)->load.weight;
> 
> Ok, my understanding is, we want pull if 'prev_cpu' is burst, and don't
> want pull if 'this_cpu' is burst, correct?

Nope; in my view a bursting cpu doesn't necessarily have a heavier load than
the others, so we still need to compare their loads under the same condition.
> 
> And we do this by guess the load higher or lower, is that right?

ditto.
> 
> And I think target_load() is capable enough to choose the higher load,
> if 'cpu_rq(cpu)->load.weight' is really higher, the results will be the
> same.
> 
> So what about this:
> 
> 	/* prefer higher load if burst */
> 	load = burst_prev ?
> 		target_load(prev_cpu, idx) : source_load(prev_cpu, idx);
> 
> 	this_load = target_load(this_cpu, idx);

Here, the idx is zero, so source_load and target_load are the same.
> 
> Regards,
> Michael Wang
> 
>> +	}
>>
>>  	/*
>>  	 * If sync wakeup then subtract the (maximum possible)
>>
> 


-- 
Thanks Alex

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [patch v3 0/8] sched: use runnable avg in load balance
  2013-04-03  4:28           ` Alex Shi
  2013-04-03  5:38             ` Michael Wang
@ 2013-04-03  6:22             ` Michael Wang
  2013-04-03  6:53               ` Alex Shi
  1 sibling, 1 reply; 44+ messages in thread
From: Michael Wang @ 2013-04-03  6:22 UTC (permalink / raw)
  To: Alex Shi
  Cc: mingo, peterz, tglx, akpm, arjan, bp, pjt, namhyung, efault,
	morten.rasmussen, vincent.guittot, gregkh, preeti, viresh.kumar,
	linux-kernel, len.brown, rafael.j.wysocki, jkosina,
	clark.williams, tony.luck, keescook, mgorman, riel

On 04/03/2013 12:28 PM, Alex Shi wrote:
> On 04/03/2013 11:23 AM, Michael Wang wrote:
>> On 04/03/2013 10:56 AM, Alex Shi wrote:
>>> On 04/03/2013 10:46 AM, Michael Wang wrote:
[snip]
> 
> 
> From 4722a7567dccfb19aa5afbb49982ffb6d65e6ae5 Mon Sep 17 00:00:00 2001
> From: Alex Shi <alex.shi@intel.com>
> Date: Tue, 2 Apr 2013 10:27:45 +0800
> Subject: [PATCH] sched: use instant load for burst wake up
> 
> If many tasks sleep long time, their runnable load are zero. And if they
> are waked up bursty, too light runnable load causes big imbalance among
> CPU. So such benchmark, like aim9 drop 5~7%.
> 
> With this patch the losing is covered, and even is slight better.

A quick test shows the improvement disappears and the regression comes
back... after applying this one as the 8th patch, it doesn't work.

Regards,
Michael Wang

> 
> Signed-off-by: Alex Shi <alex.shi@intel.com>
> ---
>  kernel/sched/fair.c | 16 ++++++++++++++--
>  1 file changed, 14 insertions(+), 2 deletions(-)
> 
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index dbaa8ca..25ac437 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3103,12 +3103,24 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
>  	unsigned long weight;
>  	int balanced;
>  	int runnable_avg;
> +	int burst = 0;
> 
>  	idx	  = sd->wake_idx;
>  	this_cpu  = smp_processor_id();
>  	prev_cpu  = task_cpu(p);
> -	load	  = source_load(prev_cpu, idx);
> -	this_load = target_load(this_cpu, idx);
> +
> +	if (cpu_rq(this_cpu)->avg_idle < sysctl_sched_migration_cost ||
> +		cpu_rq(prev_cpu)->avg_idle < sysctl_sched_migration_cost)
> +		burst= 1;
> +
> +	/* use instant load for bursty waking up */
> +	if (!burst) {
> +		load = source_load(prev_cpu, idx);
> +		this_load = target_load(this_cpu, idx);
> +	} else {
> +		load = cpu_rq(prev_cpu)->load.weight;
> +		this_load = cpu_rq(this_cpu)->load.weight;
> +	}
> 
>  	/*
>  	 * If sync wakeup then subtract the (maximum possible)
> 


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [patch v3 0/8] sched: use runnable avg in load balance
  2013-04-03  6:22             ` Michael Wang
@ 2013-04-03  6:53               ` Alex Shi
  2013-04-03  7:18                 ` Michael Wang
  0 siblings, 1 reply; 44+ messages in thread
From: Alex Shi @ 2013-04-03  6:53 UTC (permalink / raw)
  To: Michael Wang
  Cc: mingo, peterz, tglx, akpm, arjan, bp, pjt, namhyung, efault,
	morten.rasmussen, vincent.guittot, gregkh, preeti, viresh.kumar,
	linux-kernel, len.brown, rafael.j.wysocki, jkosina,
	clark.williams, tony.luck, keescook, mgorman, riel

On 04/03/2013 02:22 PM, Michael Wang wrote:
>> > 
>> > If many tasks sleep long time, their runnable load are zero. And if they
>> > are waked up bursty, too light runnable load causes big imbalance among
>> > CPU. So such benchmark, like aim9 drop 5~7%.
>> > 
>> > With this patch the losing is covered, and even is slight better.
> A fast test show the improvement disappear and the regression back
> again...after applied this one as the 8th patch, it doesn't works.

It is always good for one benchmark and bad for another. :)

The following patch includes the renamed knob, and you can tune it under
/proc/sys/kernel/... to see the degree of impact in detail.

From e540b31b99c887d5a0c5338f7b1f224972b98932 Mon Sep 17 00:00:00 2001
From: Alex Shi <alex.shi@intel.com>
Date: Tue, 2 Apr 2013 10:27:45 +0800
Subject: [PATCH] sched: use instant load for burst wake up

If many tasks sleep for a long time, their runnable load becomes zero. And
if they are woken up in a burst, the overly light runnable load causes a
big imbalance among CPUs. So benchmarks such as aim9 drop 5~7%.

rq->avg_idle is 'used to accommodate bursty loads in a dirt simple
dirt cheap manner' -- Mike Galbraith.

With this cheap and smart burst indicator, we can detect the wakeup
burst and just use nr_running as the instant utilization.

The 'sysctl_sched_burst_threshold' knob is used for wakeup burst detection.

With this patch the loss is recovered, and the result is even slightly
better.

Signed-off-by: Alex Shi <alex.shi@intel.com>
---
 include/linux/sched/sysctl.h |  1 +
 kernel/sched/fair.c          | 17 +++++++++++++++--
 kernel/sysctl.c              |  7 +++++++
 3 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index bf8086b..a3c3d43 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -53,6 +53,7 @@ extern unsigned int sysctl_numa_balancing_settle_count;
 
 #ifdef CONFIG_SCHED_DEBUG
 extern unsigned int sysctl_sched_migration_cost;
+extern unsigned int sysctl_sched_burst_threshold;
 extern unsigned int sysctl_sched_nr_migrate;
 extern unsigned int sysctl_sched_time_avg;
 extern unsigned int sysctl_timer_migration;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dbaa8ca..f41ca67 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -91,6 +91,7 @@ unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
 unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+const_debug unsigned int sysctl_sched_burst_threshold = 1000000UL;
 
 /*
  * The exponential sliding  window over which load is averaged for shares
@@ -3103,12 +3104,24 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 	unsigned long weight;
 	int balanced;
 	int runnable_avg;
+	int burst = 0;
 
 	idx	  = sd->wake_idx;
 	this_cpu  = smp_processor_id();
 	prev_cpu  = task_cpu(p);
-	load	  = source_load(prev_cpu, idx);
-	this_load = target_load(this_cpu, idx);
+
+	if (cpu_rq(this_cpu)->avg_idle < sysctl_sched_migration_cost ||
+		cpu_rq(prev_cpu)->avg_idle < sysctl_sched_migration_cost)
+		burst= 1;
+
+	/* use instant load for bursty waking up */
+	if (!burst) {
+		load = source_load(prev_cpu, idx);
+		this_load = target_load(this_cpu, idx);
+	} else {
+		load = cpu_rq(prev_cpu)->load.weight;
+		this_load = cpu_rq(this_cpu)->load.weight;
+	}
 
 	/*
 	 * If sync wakeup then subtract the (maximum possible)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index afc1dc6..1f23457 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -327,6 +327,13 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 	{
+		.procname	= "sched_burst_threshold_ns",
+		.data		= &sysctl_sched_burst_threshold,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
 		.procname	= "sched_nr_migrate",
 		.data		= &sysctl_sched_nr_migrate,
 		.maxlen		= sizeof(unsigned int),
-- 
1.7.12


-- 
Thanks Alex

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* Re: [patch v3 0/8] sched: use runnable avg in load balance
  2013-04-03  6:53               ` Alex Shi
@ 2013-04-03  7:18                 ` Michael Wang
  2013-04-03  7:28                   ` Alex Shi
  0 siblings, 1 reply; 44+ messages in thread
From: Michael Wang @ 2013-04-03  7:18 UTC (permalink / raw)
  To: Alex Shi
  Cc: mingo, peterz, tglx, akpm, arjan, bp, pjt, namhyung, efault,
	morten.rasmussen, vincent.guittot, gregkh, preeti, viresh.kumar,
	linux-kernel, len.brown, rafael.j.wysocki, jkosina,
	clark.williams, tony.luck, keescook, mgorman, riel

On 04/03/2013 02:53 PM, Alex Shi wrote:
> On 04/03/2013 02:22 PM, Michael Wang wrote:
>>>>
>>>> If many tasks sleep long time, their runnable load are zero. And if they
>>>> are waked up bursty, too light runnable load causes big imbalance among
>>>> CPU. So such benchmark, like aim9 drop 5~7%.
>>>>
>>>> With this patch the losing is covered, and even is slight better.
>> A fast test show the improvement disappear and the regression back
>> again...after applied this one as the 8th patch, it doesn't works.
> 
> It always is good for on benchmark and bad for another. :)

That's right :)

> 
> the following patch include the renamed knob, and you can tune it under 
> /proc/sys/kernel/... to see detailed impact degree.

Could I make the conclusion that the improvement on pgbench was caused
by the new weighted_cpuload()?

The burst branch seems to just adopt the old-world load; if reducing the
rate at which we enter that branch regains the benefit, then I can
confirm my supposition.
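
(For reference, my understanding is that the runnable-average based
weighted_cpuload() from patch 5/8 boils down to something like the sketch
below; cfs.runnable_load_avg comes from the per-entity load tracking code,
and the exact form in the actual patch may differ.)

/* sketch: runnable-average cpu load vs. the old instantaneous weight */
static unsigned long weighted_cpuload(const int cpu)
{
	/* old world: return cpu_rq(cpu)->load.weight; */
	return (unsigned long)cpu_rq(cpu)->cfs.runnable_load_avg;
}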

> 
> +	if (cpu_rq(this_cpu)->avg_idle < sysctl_sched_migration_cost ||
> +		cpu_rq(prev_cpu)->avg_idle < sysctl_sched_migration_cost)

It should be 'sysctl_sched_burst_threshold' here, shouldn't it? Anyway, I
will give it a try with different rates.

Regards,
Michael Wang


> +		burst= 1;
> +
> +	/* use instant load for bursty waking up */
> +	if (!burst) {
> +		load = source_load(prev_cpu, idx);
> +		this_load = target_load(this_cpu, idx);
> +	} else {
> +		load = cpu_rq(prev_cpu)->load.weight;
> +		this_load = cpu_rq(this_cpu)->load.weight;
> +	}
> 
>  	/*
>  	 * If sync wakeup then subtract the (maximum possible)
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index afc1dc6..1f23457 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -327,6 +327,13 @@ static struct ctl_table kern_table[] = {
>  		.proc_handler	= proc_dointvec,
>  	},
>  	{
> +		.procname	= "sched_burst_threshold_ns",
> +		.data		= &sysctl_sched_burst_threshold,
> +		.maxlen		= sizeof(unsigned int),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec,
> +	},
> +	{
>  		.procname	= "sched_nr_migrate",
>  		.data		= &sysctl_sched_nr_migrate,
>  		.maxlen		= sizeof(unsigned int),
> 


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [patch v3 0/8] sched: use runnable avg in load balance
  2013-04-03  7:18                 ` Michael Wang
@ 2013-04-03  7:28                   ` Alex Shi
  0 siblings, 0 replies; 44+ messages in thread
From: Alex Shi @ 2013-04-03  7:28 UTC (permalink / raw)
  To: Michael Wang
  Cc: mingo, peterz, tglx, akpm, arjan, bp, pjt, namhyung, efault,
	morten.rasmussen, vincent.guittot, gregkh, preeti, viresh.kumar,
	linux-kernel, len.brown, rafael.j.wysocki, jkosina,
	clark.williams, tony.luck, keescook, mgorman, riel

On 04/03/2013 03:18 PM, Michael Wang wrote:
>> > the following patch include the renamed knob, and you can tune it under 
>> > /proc/sys/kernel/... to see detailed impact degree.
> Could I make the conclusion that the improvement on pgbench was caused
> by the new weighted_cpuload()?

I guess so too.
> 
> The burst branch seems to just adopt the load in old world, if reduce
> the rate to enter that branch could regain the benefit, then I could
> confirm my supposition.
> 
>> > 
>> > +	if (cpu_rq(this_cpu)->avg_idle < sysctl_sched_migration_cost ||
>> > +		cpu_rq(prev_cpu)->avg_idle < sysctl_sched_migration_cost)
> It should be 'sysctl_sched_burst_threshold' here, isn't it? anyway, I
> will take a try with different rate.

Oops, something went wrong in my git operation.

-- 
Thanks Alex

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [patch v3 0/8] sched: use runnable avg in load balance
  2013-04-02  7:23 ` [patch v3 0/8] sched: use runnable avg in load balance Michael Wang
  2013-04-02  8:34   ` Mike Galbraith
  2013-04-02  8:35   ` Alex Shi
@ 2013-04-03  8:46   ` Alex Shi
  2013-04-03  9:37     ` Michael Wang
  2013-04-07  3:09     ` Michael Wang
  2 siblings, 2 replies; 44+ messages in thread
From: Alex Shi @ 2013-04-03  8:46 UTC (permalink / raw)
  To: Michael Wang
  Cc: mingo, peterz, tglx, akpm, arjan, bp, pjt, namhyung, efault,
	morten.rasmussen, vincent.guittot, gregkh, preeti, viresh.kumar,
	linux-kernel, len.brown, rafael.j.wysocki, jkosina,
	clark.williams, tony.luck, keescook, mgorman, riel

On 04/02/2013 03:23 PM, Michael Wang wrote:
> | 15 GB   |      12 | 45393 |   | 43986 |
> | 15 GB   |      16 | 45110 |   | 45719 |
> | 15 GB   |      24 | 41415 |   | 36813 |	-11.11%
> | 15 GB   |      32 | 35988 |   | 34025 |
> 
> The reason may caused by wake_affine()'s higher overhead, and pgbench is
> really sensitive to this stuff...

Michael:
I changed the threshold to 0.1ms; it has the same effect on aim7.
So could you try the following on pgbench?


diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index bf8086b..a3c3d43 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -53,6 +53,7 @@ extern unsigned int sysctl_numa_balancing_settle_count;
 
 #ifdef CONFIG_SCHED_DEBUG
 extern unsigned int sysctl_sched_migration_cost;
+extern unsigned int sysctl_sched_burst_threshold;
 extern unsigned int sysctl_sched_nr_migrate;
 extern unsigned int sysctl_sched_time_avg;
 extern unsigned int sysctl_timer_migration;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dbaa8ca..dd5a324 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -91,6 +91,7 @@ unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
 unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+const_debug unsigned int sysctl_sched_burst_threshold = 100000UL;
 
 /*
  * The exponential sliding  window over which load is averaged for shares
@@ -3103,12 +3104,24 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 	unsigned long weight;
 	int balanced;
 	int runnable_avg;
+	int burst = 0;
 
 	idx	  = sd->wake_idx;
 	this_cpu  = smp_processor_id();
 	prev_cpu  = task_cpu(p);
-	load	  = source_load(prev_cpu, idx);
-	this_load = target_load(this_cpu, idx);
+
+	if (cpu_rq(this_cpu)->avg_idle < sysctl_sched_burst_threshold ||
+		cpu_rq(prev_cpu)->avg_idle < sysctl_sched_burst_threshold)
+		burst= 1;
+
+	/* use instant load for bursty waking up */
+	if (!burst) {
+		load = source_load(prev_cpu, idx);
+		this_load = target_load(this_cpu, idx);
+	} else {
+		load = cpu_rq(prev_cpu)->load.weight;
+		this_load = cpu_rq(this_cpu)->load.weight;
+	}
 
 	/*
 	 * If sync wakeup then subtract the (maximum possible)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index afc1dc6..1f23457 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -327,6 +327,13 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 	{
+		.procname	= "sched_burst_threshold_ns",
+		.data		= &sysctl_sched_burst_threshold,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
 		.procname	= "sched_nr_migrate",
 		.data		= &sysctl_sched_nr_migrate,
 		.maxlen		= sizeof(unsigned int),
-- 
Thanks Alex

^ permalink raw reply related	[flat|nested] 44+ messages in thread

* Re: [patch v3 0/8] sched: use runnable avg in load balance
  2013-04-03  8:46   ` Alex Shi
@ 2013-04-03  9:37     ` Michael Wang
  2013-04-03 11:17       ` Alex Shi
  2013-04-07  3:09     ` Michael Wang
  1 sibling, 1 reply; 44+ messages in thread
From: Michael Wang @ 2013-04-03  9:37 UTC (permalink / raw)
  To: Alex Shi
  Cc: mingo, peterz, tglx, akpm, arjan, bp, pjt, namhyung, efault,
	morten.rasmussen, vincent.guittot, gregkh, preeti, viresh.kumar,
	linux-kernel, len.brown, rafael.j.wysocki, jkosina,
	clark.williams, tony.luck, keescook, mgorman, riel

On 04/03/2013 04:46 PM, Alex Shi wrote:
> On 04/02/2013 03:23 PM, Michael Wang wrote:
>> | 15 GB   |      12 | 45393 |   | 43986 |
>> | 15 GB   |      16 | 45110 |   | 45719 |
>> | 15 GB   |      24 | 41415 |   | 36813 |	-11.11%
>> | 15 GB   |      32 | 35988 |   | 34025 |
>>
>> The reason may caused by wake_affine()'s higher overhead, and pgbench is
>> really sensitive to this stuff...
> 
> Michael:
> I changed the threshold to 0.1ms it has same effect on aim7.
> So could you try the following on pgbench?

Hi, Alex

I've done some rough tests and the change point should be in 60000~120000.
I'm currently running an automated test with the values 500000, 250000, 120000,
60000, 30000, 15000, 6000, 3000 and 1500; it will take some time to finish
the test, and we will then have detailed info for analysis.

BTW, you know we have a festival in China, so the report may be delayed;
forgive me for that ;-)

Regards,
Michael Wang

> 
> 
> diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
> index bf8086b..a3c3d43 100644
> --- a/include/linux/sched/sysctl.h
> +++ b/include/linux/sched/sysctl.h
> @@ -53,6 +53,7 @@ extern unsigned int sysctl_numa_balancing_settle_count;
> 
>  #ifdef CONFIG_SCHED_DEBUG
>  extern unsigned int sysctl_sched_migration_cost;
> +extern unsigned int sysctl_sched_burst_threshold;
>  extern unsigned int sysctl_sched_nr_migrate;
>  extern unsigned int sysctl_sched_time_avg;
>  extern unsigned int sysctl_timer_migration;
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index dbaa8ca..dd5a324 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -91,6 +91,7 @@ unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
>  unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
> 
>  const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
> +const_debug unsigned int sysctl_sched_burst_threshold = 100000UL;
> 
>  /*
>   * The exponential sliding  window over which load is averaged for shares
> @@ -3103,12 +3104,24 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
>  	unsigned long weight;
>  	int balanced;
>  	int runnable_avg;
> +	int burst = 0;
> 
>  	idx	  = sd->wake_idx;
>  	this_cpu  = smp_processor_id();
>  	prev_cpu  = task_cpu(p);
> -	load	  = source_load(prev_cpu, idx);
> -	this_load = target_load(this_cpu, idx);
> +
> +	if (cpu_rq(this_cpu)->avg_idle < sysctl_sched_burst_threshold ||
> +		cpu_rq(prev_cpu)->avg_idle < sysctl_sched_burst_threshold)
> +		burst= 1;
> +
> +	/* use instant load for bursty waking up */
> +	if (!burst) {
> +		load = source_load(prev_cpu, idx);
> +		this_load = target_load(this_cpu, idx);
> +	} else {
> +		load = cpu_rq(prev_cpu)->load.weight;
> +		this_load = cpu_rq(this_cpu)->load.weight;
> +	}
> 
>  	/*
>  	 * If sync wakeup then subtract the (maximum possible)
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index afc1dc6..1f23457 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -327,6 +327,13 @@ static struct ctl_table kern_table[] = {
>  		.proc_handler	= proc_dointvec,
>  	},
>  	{
> +		.procname	= "sched_burst_threshold_ns",
> +		.data		= &sysctl_sched_burst_threshold,
> +		.maxlen		= sizeof(unsigned int),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec,
> +	},
> +	{
>  		.procname	= "sched_nr_migrate",
>  		.data		= &sysctl_sched_nr_migrate,
>  		.maxlen		= sizeof(unsigned int),
> 


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [patch v3 0/8] sched: use runnable avg in load balance
  2013-04-03  9:37     ` Michael Wang
@ 2013-04-03 11:17       ` Alex Shi
  0 siblings, 0 replies; 44+ messages in thread
From: Alex Shi @ 2013-04-03 11:17 UTC (permalink / raw)
  To: Michael Wang
  Cc: mingo, peterz, tglx, akpm, arjan, bp, pjt, namhyung, efault,
	morten.rasmussen, vincent.guittot, gregkh, preeti, viresh.kumar,
	linux-kernel, len.brown, rafael.j.wysocki, jkosina,
	clark.williams, tony.luck, keescook, mgorman, riel

On 04/03/2013 05:37 PM, Michael Wang wrote:
>> > 
>> > Michael:
>> > I changed the threshold to 0.1ms it has same effect on aim7.
>> > So could you try the following on pgbench?
> Hi, Alex
> 
> I've done some rough test and the change point should in 60000~120000,

Sounds good.
> I'm currently running a auto test with value 500000, 250000, 120000,
> 60000, 30000, 15000, 6000, 3000, 1500, it will take some time to finish
> the test, and we will gain detail info for analysis.

Thanks
> 
> BTW, you know we have festival in China, so the report may be delayed,
> forgive me on that ;-)

That's all right. I am located in China too. :)

-- 
Thanks
    Alex

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [patch v3 0/8] sched: use runnable avg in load balance
  2013-04-03  8:46   ` Alex Shi
  2013-04-03  9:37     ` Michael Wang
@ 2013-04-07  3:09     ` Michael Wang
  2013-04-07  7:30       ` Alex Shi
  1 sibling, 1 reply; 44+ messages in thread
From: Michael Wang @ 2013-04-07  3:09 UTC (permalink / raw)
  To: Alex Shi
  Cc: mingo, peterz, tglx, akpm, arjan, bp, pjt, namhyung, efault,
	morten.rasmussen, vincent.guittot, gregkh, preeti, viresh.kumar,
	linux-kernel, len.brown, rafael.j.wysocki, jkosina,
	clark.williams, tony.luck, keescook, mgorman, riel

On 04/03/2013 04:46 PM, Alex Shi wrote:
> On 04/02/2013 03:23 PM, Michael Wang wrote:
>> | 15 GB   |      12 | 45393 |   | 43986 |
>> | 15 GB   |      16 | 45110 |   | 45719 |
>> | 15 GB   |      24 | 41415 |   | 36813 |	-11.11%
>> | 15 GB   |      32 | 35988 |   | 34025 |
>>
>> The reason may caused by wake_affine()'s higher overhead, and pgbench is
>> really sensitive to this stuff...
> 
> Michael:
> I changed the threshold to 0.1ms it has same effect on aim7.
> So could you try the following on pgbench?

Here are the results for different thresholds. There is too much data, so I
just list the 22 MB, 32 clients item:

threshold(us)			tps

base				43420
500				40694
250				40591
120				41468		-4.50%
60				47785		+10.05%
30				51389
15				52844
6				54539
3				52674
1.5				52885

Since 120~60us seems to contain the inflection, I made more tests in this range:

threshold(us)			tps
base				43420
110				41772
100				42246
90				43471		0%
80				44920
70				46341

According to these data, 90us == 90000 is the inflection point on my box
for the 22 MB, 32 clients item; other test items show some variation, so
80~90us is the conclusion.

Now the concern is how to deal with this issue: the results may change on
different deployments, and a static value is not acceptable, so do we need
another new knob here?

I'm not sure whether you have taken a look at the wake-affine throttle
patch I sent weeks ago; its purpose is to throttle wake-affine so that it
does not run too frequently.

And since the aim7 problem is caused by the imbalance that is a
side-effect of frequently succeeding wake-affine, maybe the throttle patch
could help to address that issue too; if it does, then we only need to add
one new knob.
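
A rough sketch of what such a throttle could look like is below; the knob
and the per-task timestamp field are invented here just for illustration
and do not reproduce my actual patch:

const_debug unsigned int sysctl_sched_wake_affine_interval = 1UL;	/* ms */

/* hypothetical: skip wake_affine() if it pulled the wakee too recently */
static inline int wake_affine_throttled(struct task_struct *p)
{
	return time_before(jiffies, p->se.wake_affine_stamp +
			msecs_to_jiffies(sysctl_sched_wake_affine_interval));
}

static inline void wake_affine_hit(struct task_struct *p)
{
	/* called when wake_affine() decides to pull the wakee to this_cpu */
	p->se.wake_affine_stamp = jiffies;
}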

BTW, the benefit your patch set brings does not conflict with the
wake-affine throttle patch, which means that with your patch set the 1ms
throttle now also shows a 25% improvement (it used to be <5%), and it also
increases the maximum benefit from 40% to 45% on my box ;-)

Regards,
Michael Wang

> 
> 
> diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
> index bf8086b..a3c3d43 100644
> --- a/include/linux/sched/sysctl.h
> +++ b/include/linux/sched/sysctl.h
> @@ -53,6 +53,7 @@ extern unsigned int sysctl_numa_balancing_settle_count;
> 
>  #ifdef CONFIG_SCHED_DEBUG
>  extern unsigned int sysctl_sched_migration_cost;
> +extern unsigned int sysctl_sched_burst_threshold;
>  extern unsigned int sysctl_sched_nr_migrate;
>  extern unsigned int sysctl_sched_time_avg;
>  extern unsigned int sysctl_timer_migration;
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index dbaa8ca..dd5a324 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -91,6 +91,7 @@ unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
>  unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
> 
>  const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
> +const_debug unsigned int sysctl_sched_burst_threshold = 100000UL;
> 
>  /*
>   * The exponential sliding  window over which load is averaged for shares
> @@ -3103,12 +3104,24 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
>  	unsigned long weight;
>  	int balanced;
>  	int runnable_avg;
> +	int burst = 0;
> 
>  	idx	  = sd->wake_idx;
>  	this_cpu  = smp_processor_id();
>  	prev_cpu  = task_cpu(p);
> -	load	  = source_load(prev_cpu, idx);
> -	this_load = target_load(this_cpu, idx);
> +
> +	if (cpu_rq(this_cpu)->avg_idle < sysctl_sched_burst_threshold ||
> +		cpu_rq(prev_cpu)->avg_idle < sysctl_sched_burst_threshold)
> +		burst= 1;
> +
> +	/* use instant load for bursty waking up */
> +	if (!burst) {
> +		load = source_load(prev_cpu, idx);
> +		this_load = target_load(this_cpu, idx);
> +	} else {
> +		load = cpu_rq(prev_cpu)->load.weight;
> +		this_load = cpu_rq(this_cpu)->load.weight;
> +	}
> 
>  	/*
>  	 * If sync wakeup then subtract the (maximum possible)
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index afc1dc6..1f23457 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -327,6 +327,13 @@ static struct ctl_table kern_table[] = {
>  		.proc_handler	= proc_dointvec,
>  	},
>  	{
> +		.procname	= "sched_burst_threshold_ns",
> +		.data		= &sysctl_sched_burst_threshold,
> +		.maxlen		= sizeof(unsigned int),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec,
> +	},
> +	{
>  		.procname	= "sched_nr_migrate",
>  		.data		= &sysctl_sched_nr_migrate,
>  		.maxlen		= sizeof(unsigned int),
> 


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [patch v3 0/8] sched: use runnable avg in load balance
  2013-04-07  3:09     ` Michael Wang
@ 2013-04-07  7:30       ` Alex Shi
  2013-04-07  8:56         ` Michael Wang
  0 siblings, 1 reply; 44+ messages in thread
From: Alex Shi @ 2013-04-07  7:30 UTC (permalink / raw)
  To: Michael Wang
  Cc: mingo, peterz, tglx, akpm, arjan, bp, pjt, namhyung, efault,
	morten.rasmussen, vincent.guittot, gregkh, preeti, viresh.kumar,
	linux-kernel, len.brown, rafael.j.wysocki, jkosina,
	clark.williams, tony.luck, keescook, mgorman, riel

> 
> According to these data, 90us == 90000 is the inflection point on my box
> for 22 MB 32 clients item, other test items show different float, so
> 80~90us is the conclusion.

Thanks a lot for the testing!
> 
> Now the concern is how to deal with this issue, the results may changed
> on different deployment, static value is not acceptable, so we need
> another new knob here?
> 
> I'm not sure whether you have take a look at the wake-affine throttle
> patch I sent weeks ago, it's purpose is throttle the wake-affine to not
> work too frequently.

Yes. In your patch you directly set the target cpu to this_cpu when
wake_affine is skipped. Maybe that is the key point, not the wake_affine
cost, that gives the improvement. Basically I agree with this, but if so,
it is a bit blind. Still, the interesting point is that the blind target
cpu setting has the best performance in our current testing. :)

> 
> And since the aim problem is caused by the imbalance which is the
> side-effect of frequently succeed wake-affine, may be the throttle patch
> could help to address that issue too, if it is, then we only need to add
> one new knob.

As to the aim7 problem, I need to apologise to you all!
The aim7 regression existed with the v2 patchset, which was based on the
3.8 kernel, not with this v3 version based on 3.9.

After the lock-stealing rwsem patch was introduced in the 3.9 kernel, aim7
recovered from the cpu task imbalance. So on a balanced 3.9 kernel this v3
version doesn't bring extra imbalance on aim7: no clear regression on
aim7, no extra imbalance on aim7.

So I referenced an old testing result without double-checking, and tried
to resolve a problem that had disappeared. I am sorry and apologize to you all.

This burst patch is therefore not needed on the 3.9 kernel. Patches
1,2,4,5,6,7 are enough and valid.

-- 
Thanks Alex

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [patch v3 0/8] sched: use runnable avg in load balance
  2013-04-07  7:30       ` Alex Shi
@ 2013-04-07  8:56         ` Michael Wang
  0 siblings, 0 replies; 44+ messages in thread
From: Michael Wang @ 2013-04-07  8:56 UTC (permalink / raw)
  To: Alex Shi
  Cc: mingo, peterz, tglx, akpm, arjan, bp, pjt, namhyung, efault,
	morten.rasmussen, vincent.guittot, gregkh, preeti, viresh.kumar,
	linux-kernel, len.brown, rafael.j.wysocki, jkosina,
	clark.williams, tony.luck, keescook, mgorman, riel

On 04/07/2013 03:30 PM, Alex Shi wrote:
>>
>> According to these data, 90us == 90000 is the inflection point on my box
>> for 22 MB 32 clients item, other test items show different float, so
>> 80~90us is the conclusion.
> 
> Thanks a lot for the testing!
>>
>> Now the concern is how to deal with this issue, the results may changed
>> on different deployment, static value is not acceptable, so we need
>> another new knob here?
>>
>> I'm not sure whether you have take a look at the wake-affine throttle
>> patch I sent weeks ago, it's purpose is throttle the wake-affine to not
>> work too frequently.
> 
> Yes. In the patch your directly set the target cpu to this_cpu when no
> wake_affine. Maybe this is the key points, not the wake_affine cost give
> the improvement. Basically I agree with this. but if so, it is a bit
> blind. but, but, The interesting point is the blind target cpu setting
> has the best performance in our current testing. :)

IMHO, the wake-affine stuff is blind by nature, so this throttle knob
should have been added at the very beginning along with it; what we need
to do now is just add that missing knob.

I do believe that when the wake-affine stuff was first added there was no
regression, but since the world changed, the regression started to
accumulate and became so big that we cannot ignore it now.

The throttle idea just tries to provide a way to stop the blind
judgement, easy and efficient :)

> 
>>
>> And since the aim problem is caused by the imbalance which is the
>> side-effect of frequently succeed wake-affine, may be the throttle patch
>> could help to address that issue too, if it is, then we only need to add
>> one new knob.
> 
> As to the aim7 problem, I need apologise to you all!
> The aim7 regression exist with the patch v2 that base on 3.8 kernel, not
> with this v3 version base on 3.9.
> 
> After the lock-stealing RW sem patch introduced in 3.9 kernel, the aim7
> has recovered the cpu task imbalance, So on balanced 3.9 kernel, this v3
> version won't bring extra imbalance on aim7. no clear regression on
> aim7, no extra imbalance on aim7.
> 
> So, I referenced a old testing result without double confirming, tried
> to resolve a disappeared problem. I am sorry and applogize to you all.

That's all right, and it's good to know we can ignore the last patch. I
really like the benefit patches 1~7 bring; combined with the throttle
idea, pgbench is satisfied a lot ;-)

Regards,
Michael Wang

> 
> And this burst patch doesn't need on 3.9 kernel. Patch 1,2,4,5,6,7 are
> enough and valid.
> 


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [patch v3 0/8] sched: use runnable avg in load balance
  2013-04-02  3:23 [patch v3 0/8] sched: use runnable avg in load balance Alex Shi
                   ` (8 preceding siblings ...)
  2013-04-02  7:23 ` [patch v3 0/8] sched: use runnable avg in load balance Michael Wang
@ 2013-04-09  5:08 ` Alex Shi
  2013-04-10 13:12   ` Alex Shi
  9 siblings, 1 reply; 44+ messages in thread
From: Alex Shi @ 2013-04-09  5:08 UTC (permalink / raw)
  To: Alex Shi
  Cc: mingo, peterz, tglx, akpm, arjan, bp, pjt, namhyung, efault,
	morten.rasmussen, vincent.guittot, gregkh, preeti, viresh.kumar,
	linux-kernel, len.brown, rafael.j.wysocki, jkosina,
	clark.williams, tony.luck, keescook, mgorman, riel

On 04/02/2013 11:23 AM, Alex Shi wrote:
> This version resolved the aim7 liked benchmark issue by patch 8th.
> Thanks for MikeG's avg_idle that is a good bursty wakeup indicator.
> 
> The first 3 patches were also include in my power aware scheduling patchset.
> 
> Morten, you can rebase your patch on this new version that bases on latest
> Linus tree. :)
> 
> a git tree for this patchset:
> 	https://github.com/alexshi/power-scheduling.git runnablelb

I removed the 3rd and 8th patches, leaving 1,2,4,5,6,7, and updated them in
the above git tree.

I tested kbuild, specjbb2005, aim9, fileio-cfq, hackbench and dbench on
my NHM EP and 2-socket SNB EP boxes. hackbench increased 3~5% on both
machines, kbuild has a suspicious 2% increase, and the others show no
clear change.

Any more comments or concerns about this patchset?
> 
> Thanks
> Alex
> 
> [patch v3 1/8] Revert "sched: Introduce temporary FAIR_GROUP_SCHED
> [patch v3 2/8] sched: set initial value of runnable avg for new
> [patch v3 3/8] sched: only count runnable avg on cfs_rq's nr_running
> [patch v3 4/8] sched: update cpu load after task_tick.
> [patch v3 5/8] sched: compute runnable load avg in cpu_load and
> [patch v3 6/8] sched: consider runnable load average in move_tasks
> [patch v3 7/8] sched: consider runnable load average in
> [patch v3 8/8] sched: use instant load for burst wake up
> 


-- 
Thanks Alex

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [patch v3 6/8] sched: consider runnable load average in move_tasks
  2013-04-02  3:23 ` [patch v3 6/8] sched: consider runnable load average in move_tasks Alex Shi
@ 2013-04-09  7:08   ` Vincent Guittot
  2013-04-09  8:05     ` Alex Shi
  2013-04-10  6:07     ` Michael Wang
  0 siblings, 2 replies; 44+ messages in thread
From: Vincent Guittot @ 2013-04-09  7:08 UTC (permalink / raw)
  To: Alex Shi
  Cc: mingo, Peter Zijlstra, Thomas Gleixner, Andrew Morton,
	Arjan van de Ven, Borislav Petkov, Paul Turner, Namhyung Kim,
	Mike Galbraith, Morten Rasmussen, gregkh, Preeti U Murthy,
	Viresh Kumar, linux-kernel, Len Brown, rafael.j.wysocki, jkosina,
	clark.williams, tony.luck, keescook, mgorman, riel

On 2 April 2013 05:23, Alex Shi <alex.shi@intel.com> wrote:
> Except using runnable load average in background, move_tasks is also
> the key functions in load balance. We need consider the runnable load
> average in it in order to the apple to apple load comparison.
>
> Signed-off-by: Alex Shi <alex.shi@intel.com>
> ---
>  kernel/sched/fair.c | 11 ++++++++++-
>  1 file changed, 10 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 1f9026e..bf4e0d4 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3966,6 +3966,15 @@ static unsigned long task_h_load(struct task_struct *p);
>
>  static const unsigned int sched_nr_migrate_break = 32;
>
> +static unsigned long task_h_load_avg(struct task_struct *p)
> +{
> +       u32 period = p->se.avg.runnable_avg_period;
> +       if (!period)
> +               return 0;
> +
> +       return task_h_load(p) * p->se.avg.runnable_avg_sum / period;

How do you ensure that runnable_avg_period and runnable_avg_sum are
coherent ? an update of the statistic can occur in the middle of your
sequence.

Vincent

> +}
> +
>  /*
>   * move_tasks tries to move up to imbalance weighted load from busiest to
>   * this_rq, as part of a balancing operation within domain "sd".
> @@ -4001,7 +4010,7 @@ static int move_tasks(struct lb_env *env)
>                 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
>                         goto next;
>
> -               load = task_h_load(p);
> +               load = task_h_load_avg(p);
>
>                 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
>                         goto next;
> --
> 1.7.12
>

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [patch v3 6/8] sched: consider runnable load average in move_tasks
  2013-04-09  7:08   ` Vincent Guittot
@ 2013-04-09  8:05     ` Alex Shi
  2013-04-09  8:58       ` Vincent Guittot
  2013-04-10  6:07     ` Michael Wang
  1 sibling, 1 reply; 44+ messages in thread
From: Alex Shi @ 2013-04-09  8:05 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: mingo, Peter Zijlstra, Thomas Gleixner, Andrew Morton,
	Arjan van de Ven, Borislav Petkov, Paul Turner, Namhyung Kim,
	Mike Galbraith, Morten Rasmussen, gregkh, Preeti U Murthy,
	Viresh Kumar, linux-kernel, Len Brown, rafael.j.wysocki, jkosina,
	clark.williams, tony.luck, keescook, mgorman, riel

On 04/09/2013 03:08 PM, Vincent Guittot wrote:
> On 2 April 2013 05:23, Alex Shi <alex.shi@intel.com> wrote:
>> Except using runnable load average in background, move_tasks is also
>> the key functions in load balance. We need consider the runnable load
>> average in it in order to the apple to apple load comparison.
>>
>> Signed-off-by: Alex Shi <alex.shi@intel.com>
>> ---
>>  kernel/sched/fair.c | 11 ++++++++++-
>>  1 file changed, 10 insertions(+), 1 deletion(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 1f9026e..bf4e0d4 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -3966,6 +3966,15 @@ static unsigned long task_h_load(struct task_struct *p);
>>
>>  static const unsigned int sched_nr_migrate_break = 32;
>>
>> +static unsigned long task_h_load_avg(struct task_struct *p)
>> +{
>> +       u32 period = p->se.avg.runnable_avg_period;
>> +       if (!period)
>> +               return 0;
>> +
>> +       return task_h_load(p) * p->se.avg.runnable_avg_sum / period;
> 
> How do you ensure that runnable_avg_period and runnable_avg_sum are
> coherent ? an update of the statistic can occur in the middle of your
> sequence.

Thanks for your question, Vincent!
The runnable_avg_period and runnable_avg_sum are only updated in
__update_entity_runnable_avg().
Yes, I didn't see any locks ensuring their coherency, but they are updated
closely together, and it is not a big deal even if their values are a
little off. These data are collected periodically and don't need to be
very precise every time.
Am I right? :)

-- 
Thanks Alex

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [patch v3 6/8] sched: consider runnable load average in move_tasks
  2013-04-09  8:05     ` Alex Shi
@ 2013-04-09  8:58       ` Vincent Guittot
  2013-04-09 10:38         ` Alex Shi
  0 siblings, 1 reply; 44+ messages in thread
From: Vincent Guittot @ 2013-04-09  8:58 UTC (permalink / raw)
  To: Alex Shi
  Cc: mingo, Peter Zijlstra, Thomas Gleixner, Andrew Morton,
	Arjan van de Ven, Borislav Petkov, Paul Turner, Namhyung Kim,
	Mike Galbraith, Morten Rasmussen, gregkh, Preeti U Murthy,
	Viresh Kumar, linux-kernel, Len Brown, rafael.j.wysocki, jkosina,
	clark.williams, tony.luck, keescook, mgorman, riel

On 9 April 2013 10:05, Alex Shi <alex.shi@intel.com> wrote:
> On 04/09/2013 03:08 PM, Vincent Guittot wrote:
>> On 2 April 2013 05:23, Alex Shi <alex.shi@intel.com> wrote:
>>> Except using runnable load average in background, move_tasks is also
>>> the key functions in load balance. We need consider the runnable load
>>> average in it in order to the apple to apple load comparison.
>>>
>>> Signed-off-by: Alex Shi <alex.shi@intel.com>
>>> ---
>>>  kernel/sched/fair.c | 11 ++++++++++-
>>>  1 file changed, 10 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index 1f9026e..bf4e0d4 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -3966,6 +3966,15 @@ static unsigned long task_h_load(struct task_struct *p);
>>>
>>>  static const unsigned int sched_nr_migrate_break = 32;
>>>
>>> +static unsigned long task_h_load_avg(struct task_struct *p)
>>> +{
>>> +       u32 period = p->se.avg.runnable_avg_period;
>>> +       if (!period)
>>> +               return 0;
>>> +
>>> +       return task_h_load(p) * p->se.avg.runnable_avg_sum / period;
>>
>> How do you ensure that runnable_avg_period and runnable_avg_sum are
>> coherent ? an update of the statistic can occur in the middle of your
>> sequence.
>
> Thanks for your question, Vincent!
> the runnable_avg_period and runnable_avg_sum, only updated in
> __update_entity_runnable_avg().
> Yes, I didn't see some locks to ensure the coherent of them. but they
> are updated closely, and it is not big deal even a little incorrect to
> their value. These data are collected periodically, don't need very very
> precise at every time.
> Am I right? :)

The problem mainly appears during starting phase (the 1st 345ms) when
runnable_avg_period has not reached the max value yet so you can have
avg.runnable_avg_sum greater than avg.runnable_avg_period. In a worst
case, runnable_avg_sum could be twice runnable_avg_period

Vincent

>
> --
> Thanks Alex

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [patch v3 6/8] sched: consider runnable load average in move_tasks
  2013-04-09  8:58       ` Vincent Guittot
@ 2013-04-09 10:38         ` Alex Shi
  2013-04-09 11:56           ` Vincent Guittot
  0 siblings, 1 reply; 44+ messages in thread
From: Alex Shi @ 2013-04-09 10:38 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: mingo, Peter Zijlstra, Thomas Gleixner, Andrew Morton,
	Arjan van de Ven, Borislav Petkov, Paul Turner, Namhyung Kim,
	Mike Galbraith, Morten Rasmussen, gregkh, Preeti U Murthy,
	Viresh Kumar, linux-kernel, Len Brown, rafael.j.wysocki, jkosina,
	clark.williams, tony.luck, keescook, mgorman, riel

On 04/09/2013 04:58 PM, Vincent Guittot wrote:
>>> >> How do you ensure that runnable_avg_period and runnable_avg_sum are
>>> >> coherent ? an update of the statistic can occur in the middle of your
>>> >> sequence.
>> >
>> > Thanks for your question, Vincent!
>> > the runnable_avg_period and runnable_avg_sum, only updated in
>> > __update_entity_runnable_avg().
>> > Yes, I didn't see some locks to ensure the coherent of them. but they
>> > are updated closely, and it is not big deal even a little incorrect to
>> > their value. These data are collected periodically, don't need very very
>> > precise at every time.
>> > Am I right? :)
> The problem mainly appears during starting phase (the 1st 345ms) when
> runnable_avg_period has not reached the max value yet so you can have
> avg.runnable_avg_sum greater than avg.runnable_avg_period. In a worst
> case, runnable_avg_sum could be twice runnable_avg_period

Oh, that's a serious problem. Did you catch it in the real world or in the code?
Could you explain in more detail?

-- 
Thanks
    Alex

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [patch v3 6/8] sched: consider runnable load average in move_tasks
  2013-04-09 10:38         ` Alex Shi
@ 2013-04-09 11:56           ` Vincent Guittot
  2013-04-09 14:48             ` Alex Shi
  0 siblings, 1 reply; 44+ messages in thread
From: Vincent Guittot @ 2013-04-09 11:56 UTC (permalink / raw)
  To: Alex Shi
  Cc: mingo, Peter Zijlstra, Thomas Gleixner, Andrew Morton,
	Arjan van de Ven, Borislav Petkov, Paul Turner, Namhyung Kim,
	Mike Galbraith, Morten Rasmussen, gregkh, Preeti U Murthy,
	Viresh Kumar, linux-kernel, Len Brown, rafael.j.wysocki, jkosina,
	Clark Williams, tony.luck, keescook, Mel Gorman, riel

On 9 April 2013 12:38, Alex Shi <alex.shi@intel.com> wrote:
> On 04/09/2013 04:58 PM, Vincent Guittot wrote:
>>>> >> How do you ensure that runnable_avg_period and runnable_avg_sum are
>>>> >> coherent ? an update of the statistic can occur in the middle of your
>>>> >> sequence.
>>> >
>>> > Thanks for your question, Vincent!
>>> > the runnable_avg_period and runnable_avg_sum, only updated in
>>> > __update_entity_runnable_avg().
>>> > Yes, I didn't see some locks to ensure the coherent of them. but they
>>> > are updated closely, and it is not big deal even a little incorrect to
>>> > their value. These data are collected periodically, don't need very very
>>> > precise at every time.
>>> > Am I right? :)
>> The problem mainly appears during starting phase (the 1st 345ms) when
>> runnable_avg_period has not reached the max value yet so you can have
>> avg.runnable_avg_sum greater than avg.runnable_avg_period. In a worst
>> case, runnable_avg_sum could be twice runnable_avg_period
>
> Oh, That's a serious problem. Do you catch it in real word or in code?

I don't have a trace that shows this issue, but nothing prevents an update
from occurring while you read the values, so you can get a mix of old and
new values.

> Could you explain more for details?

Both fields of a new task increase simultaneously but if you get the
old value for runnable_avg_period and the new one for
runnable_avg_sum, runnable_avg_sum will be greater than
runnable_avg_period during this starting phase.

The worst case appears 2ms after the creation of the task, when
runnable_avg_period and runnable_avg_sum should both go from 1024 to 2046.
So task_h_load_avg will be 199% of task_h_load if you read
runnable_avg_period as 1024 and runnable_avg_sum as 2046.

A simple solution is to check that runnable_avg_period is always greater
than or equal to runnable_avg_sum, as I have done in my packing-small-tasks
patches.
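
Applied to the task_h_load_avg() helper in this patch, the clamp would
look roughly like this (illustrative sketch only, not a tested patch):

static unsigned long task_h_load_avg(struct task_struct *p)
{
	u32 period = p->se.avg.runnable_avg_period;
	u32 sum = p->se.avg.runnable_avg_sum;

	if (!period)
		return 0;

	/* a mixed read during the ramp-up phase can yield sum > period */
	if (sum > period)
		sum = period;

	return task_h_load(p) * sum / period;
}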

Vincent
>
> --
> Thanks
>     Alex

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [patch v3 6/8] sched: consider runnable load average in move_tasks
  2013-04-09 11:56           ` Vincent Guittot
@ 2013-04-09 14:48             ` Alex Shi
  2013-04-09 15:16               ` Vincent Guittot
  0 siblings, 1 reply; 44+ messages in thread
From: Alex Shi @ 2013-04-09 14:48 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: mingo, Peter Zijlstra, Thomas Gleixner, Andrew Morton,
	Arjan van de Ven, Borislav Petkov, Paul Turner, Namhyung Kim,
	Mike Galbraith, Morten Rasmussen, gregkh, Preeti U Murthy,
	Viresh Kumar, linux-kernel, Len Brown, rafael.j.wysocki, jkosina,
	Clark Williams, tony.luck, keescook, Mel Gorman, riel

On 04/09/2013 07:56 PM, Vincent Guittot wrote:
> On 9 April 2013 12:38, Alex Shi <alex.shi@intel.com> wrote:
>> On 04/09/2013 04:58 PM, Vincent Guittot wrote:
>>>>>>> How do you ensure that runnable_avg_period and runnable_avg_sum are
>>>>>>> coherent ? an update of the statistic can occur in the middle of your
>>>>>>> sequence.
>>>>>
>>>>> Thanks for your question, Vincent!
>>>>> the runnable_avg_period and runnable_avg_sum, only updated in
>>>>> __update_entity_runnable_avg().
>>>>> Yes, I didn't see some locks to ensure the coherent of them. but they
>>>>> are updated closely, and it is not big deal even a little incorrect to
>>>>> their value. These data are collected periodically, don't need very very
>>>>> precise at every time.
>>>>> Am I right? :)
>>> The problem mainly appears during starting phase (the 1st 345ms) when
>>> runnable_avg_period has not reached the max value yet so you can have
>>> avg.runnable_avg_sum greater than avg.runnable_avg_period. In a worst
>>> case, runnable_avg_sum could be twice runnable_avg_period
>>
>> Oh, That's a serious problem. Do you catch it in real word or in code?
> 
> I haven't trace that shows this issue but nothing prevent an update to
> occur while you get values so you can have a mix of old and new
> values.
> 
>> Could you explain more for details?
> 
> Both fields of a new task increase simultaneously but if you get the
> old value for runnable_avg_period and the new one for
> runnable_avg_sum, runnable_avg_sum will be greater than
> runnable_avg_period during this starting phase.
> 
> The worst case appears 2ms after the creation of the task,
> runnable_avg_period and runnable_avg_sum should go from 1024 to 2046.
> So the  task_h_load_avg will be 199% of task_h_load If you have
> runnable_avg_period with 1024 and runnable_avg_sum with 2046.

Thanks a lot for sharing the info, Vincent!

But I checked rq->avg and task->se.avg, and it seems neither of them can be
updated on different CPUs at the same time. So my printk didn't catch this
with the kbuild and aim7 benchmarks on my SNB EP box.

Then I found these words in your commit log:
"If a CPU accesses the runnable_avg_sum and runnable_avg_period fields
of its buddy CPU while the latter updates it, it can get the new version
of a field and the old version of the other one."
So is it possibly caused by the buddy cpu's accesses?
Would you like to recheck this without your patch?


-- 
Thanks
    Alex

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [patch v3 6/8] sched: consider runnable load average in move_tasks
  2013-04-09 14:48             ` Alex Shi
@ 2013-04-09 15:16               ` Vincent Guittot
  2013-04-10  2:31                 ` Alex Shi
  0 siblings, 1 reply; 44+ messages in thread
From: Vincent Guittot @ 2013-04-09 15:16 UTC (permalink / raw)
  To: Alex Shi
  Cc: mingo, Peter Zijlstra, Thomas Gleixner, Andrew Morton,
	Arjan van de Ven, Borislav Petkov, Paul Turner, Namhyung Kim,
	Mike Galbraith, Morten Rasmussen, gregkh, Preeti U Murthy,
	Viresh Kumar, linux-kernel, Len Brown, rafael.j.wysocki, jkosina,
	Clark Williams, tony.luck, keescook, Mel Gorman, riel

On 9 April 2013 16:48, Alex Shi <alex.shi@intel.com> wrote:
> On 04/09/2013 07:56 PM, Vincent Guittot wrote:
>> On 9 April 2013 12:38, Alex Shi <alex.shi@intel.com> wrote:
>>> On 04/09/2013 04:58 PM, Vincent Guittot wrote:
>>>>>>>> How do you ensure that runnable_avg_period and runnable_avg_sum are
>>>>>>>> coherent ? an update of the statistic can occur in the middle of your
>>>>>>>> sequence.
>>>>>>
>>>>>> Thanks for your question, Vincent!
>>>>>> the runnable_avg_period and runnable_avg_sum, only updated in
>>>>>> __update_entity_runnable_avg().
>>>>>> Yes, I didn't see some locks to ensure the coherent of them. but they
>>>>>> are updated closely, and it is not big deal even a little incorrect to
>>>>>> their value. These data are collected periodically, don't need very very
>>>>>> precise at every time.
>>>>>> Am I right? :)
>>>> The problem mainly appears during starting phase (the 1st 345ms) when
>>>> runnable_avg_period has not reached the max value yet so you can have
>>>> avg.runnable_avg_sum greater than avg.runnable_avg_period. In a worst
>>>> case, runnable_avg_sum could be twice runnable_avg_period
>>>
>>> Oh, That's a serious problem. Do you catch it in real word or in code?
>>
>> I haven't trace that shows this issue but nothing prevent an update to
>> occur while you get values so you can have a mix of old and new
>> values.
>>
>>> Could you explain more for details?
>>
>> Both fields of a new task increase simultaneously but if you get the
>> old value for runnable_avg_period and the new one for
>> runnable_avg_sum, runnable_avg_sum will be greater than
>> runnable_avg_period during this starting phase.
>>
>> The worst case appears 2ms after the creation of the task,
>> runnable_avg_period and runnable_avg_sum should go from 1024 to 2046.
>> So the  task_h_load_avg will be 199% of task_h_load If you have
>> runnable_avg_period with 1024 and runnable_avg_sum with 2046.
>
> Thanks a lot for info sharing! Vincent.
>
> But I checked the rq->avg and task->se.avg, seems none of them are
> possible be updated on different CPU at the same time. So my printk
> didn't catch this with benchmark kbuild and aim7 on my SNB EP box.

The problem can happen because reader and writer are accessing the
variable asynchronously and on different CPUs

CPUA write runnable_avg_sum
CPUB read runnable_avg_sum
CPUB read runnable_avg_period
CPUA write runnable_avg_period

I agree that the time window, during which this can occur, is short
but not impossible

Vincent
>
> Then I find some words in your commit log:
> "If a CPU accesses the runnable_avg_sum and runnable_avg_period fields
> of its buddy CPU while the latter updates it, it can get the new version
> of a field and the old version of the other one."
> So is it possible caused by the buddy cpu's accessing?
> Could you like to recheck this without your patch?
>
>
> --
> Thanks
>     Alex

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [patch v3 6/8] sched: consider runnable load average in move_tasks
  2013-04-09 15:16               ` Vincent Guittot
@ 2013-04-10  2:31                 ` Alex Shi
  0 siblings, 0 replies; 44+ messages in thread
From: Alex Shi @ 2013-04-10  2:31 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: mingo, Peter Zijlstra, Thomas Gleixner, Andrew Morton,
	Arjan van de Ven, Borislav Petkov, Paul Turner, Namhyung Kim,
	Mike Galbraith, Morten Rasmussen, gregkh, Preeti U Murthy,
	Viresh Kumar, linux-kernel, Len Brown, rafael.j.wysocki, jkosina,
	Clark Williams, tony.luck, keescook, Mel Gorman, riel

On 04/09/2013 11:16 PM, Vincent Guittot wrote:
>> > Thanks a lot for info sharing! Vincent.
>> >
>> > But I checked the rq->avg and task->se.avg, seems none of them are
>> > possible be updated on different CPU at the same time. So my printk
>> > didn't catch this with benchmark kbuild and aim7 on my SNB EP box.
> The problem can happen because reader and writer are accessing the
> variable asynchronously and on different CPUs
> 
> CPUA write runnable_avg_sum
> CPUB read runnable_avg_sum
> CPUB read runnable_avg_period
> CPUA write runnable_avg_period
> 
> I agree that the time window, during which this can occur, is short
> but not impossible

Maybe I didn't say it clearly, Vincent.

Members of the rq struct, including avg, should be protected by rq->lock
when another cpu wants to access them, or be accessed only by the local
cpu. So the above situation should not happen for them.
And a per-task avg cannot be updated on different cpus at the same time,
since the task cannot run on two cpus at once, so there is no such problem
there either.

I thought the problem might exist for middle-level entities, but it seems
a task group has its own entity on every cpu, so there is no such problem
there too.

So you had better hold rq->lock when checking the buddy cpu info.

Correct me if something is wrong. :)
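
For example, a consistent snapshot of a buddy cpu's rq->avg could be taken
under that rq's lock, roughly like the sketch below (illustrative only):

/* sketch: snapshot a remote cpu's rq->avg under its rq->lock */
static void snapshot_rq_avg(int cpu, u32 *sum, u32 *period)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	raw_spin_lock_irqsave(&rq->lock, flags);
	*sum	= rq->avg.runnable_avg_sum;
	*period	= rq->avg.runnable_avg_period;
	raw_spin_unlock_irqrestore(&rq->lock, flags);
}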

-- 
Thanks Alex

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [patch v3 6/8] sched: consider runnable load average in move_tasks
  2013-04-09  7:08   ` Vincent Guittot
  2013-04-09  8:05     ` Alex Shi
@ 2013-04-10  6:07     ` Michael Wang
  2013-04-10  6:55       ` Vincent Guittot
  1 sibling, 1 reply; 44+ messages in thread
From: Michael Wang @ 2013-04-10  6:07 UTC (permalink / raw)
  To: Vincent Guittot
  Cc: Alex Shi, mingo, Peter Zijlstra, Thomas Gleixner, Andrew Morton,
	Arjan van de Ven, Borislav Petkov, Paul Turner, Namhyung Kim,
	Mike Galbraith, Morten Rasmussen, gregkh, Preeti U Murthy,
	Viresh Kumar, linux-kernel, Len Brown, rafael.j.wysocki, jkosina,
	clark.williams, tony.luck, keescook, mgorman, riel

On 04/09/2013 03:08 PM, Vincent Guittot wrote:
> On 2 April 2013 05:23, Alex Shi <alex.shi@intel.com> wrote:
>> Except using runnable load average in background, move_tasks is also
>> the key functions in load balance. We need consider the runnable load
>> average in it in order to the apple to apple load comparison.
>>
>> Signed-off-by: Alex Shi <alex.shi@intel.com>
>> ---
>>  kernel/sched/fair.c | 11 ++++++++++-
>>  1 file changed, 10 insertions(+), 1 deletion(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 1f9026e..bf4e0d4 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -3966,6 +3966,15 @@ static unsigned long task_h_load(struct task_struct *p);
>>
>>  static const unsigned int sched_nr_migrate_break = 32;
>>
>> +static unsigned long task_h_load_avg(struct task_struct *p)
>> +{
>> +       u32 period = p->se.avg.runnable_avg_period;
>> +       if (!period)
>> +               return 0;
>> +
>> +       return task_h_load(p) * p->se.avg.runnable_avg_sum / period;
> 
> How do you ensure that runnable_avg_period and runnable_avg_sum are
> coherent ? an update of the statistic can occur in the middle of your
> sequence.

Hi, Vincent

Don't we have the 'rq->lock' to protect it?

move_tasks() is invoked with both rq locks held, so for all the se on the
src and dst rq no update should happen, should it?
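
Roughly, the relevant part of load_balance() is (a simplified sketch, the
surrounding details may differ):

	local_irq_save(flags);
	double_rq_lock(env.dst_rq, busiest);

	/*
	 * With both rq locks held, no se on the src or dst rq can have its
	 * runnable_avg_sum/runnable_avg_period updated underneath us.
	 */
	cur_ld_moved = move_tasks(&env);

	double_rq_unlock(env.dst_rq, busiest);
	local_irq_restore(flags);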

Regards,
Michael Wang

> 
> Vincent
> 
>> +}
>> +
>>  /*
>>   * move_tasks tries to move up to imbalance weighted load from busiest to
>>   * this_rq, as part of a balancing operation within domain "sd".
>> @@ -4001,7 +4010,7 @@ static int move_tasks(struct lb_env *env)
>>                 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
>>                         goto next;
>>
>> -               load = task_h_load(p);
>> +               load = task_h_load_avg(p);
>>
>>                 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
>>                         goto next;
>> --
>> 1.7.12
>>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> 


^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [patch v3 6/8] sched: consider runnable load average in move_tasks
  2013-04-10  6:07     ` Michael Wang
@ 2013-04-10  6:55       ` Vincent Guittot
  0 siblings, 0 replies; 44+ messages in thread
From: Vincent Guittot @ 2013-04-10  6:55 UTC (permalink / raw)
  To: Michael Wang, Alex Shi
  Cc: mingo, Peter Zijlstra, Thomas Gleixner, Andrew Morton,
	Arjan van de Ven, Borislav Petkov, Paul Turner, Namhyung Kim,
	Mike Galbraith, Morten Rasmussen, gregkh, Preeti U Murthy,
	Viresh Kumar, linux-kernel, Len Brown, rafael.j.wysocki, jkosina,
	Clark Williams, tony.luck, keescook, Mel Gorman, riel

On 10 April 2013 08:07, Michael Wang <wangyun@linux.vnet.ibm.com> wrote:
> On 04/09/2013 03:08 PM, Vincent Guittot wrote:
>> On 2 April 2013 05:23, Alex Shi <alex.shi@intel.com> wrote:
>>> Except using runnable load average in background, move_tasks is also
>>> the key functions in load balance. We need consider the runnable load
>>> average in it in order to the apple to apple load comparison.
>>>
>>> Signed-off-by: Alex Shi <alex.shi@intel.com>
>>> ---
>>>  kernel/sched/fair.c | 11 ++++++++++-
>>>  1 file changed, 10 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index 1f9026e..bf4e0d4 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -3966,6 +3966,15 @@ static unsigned long task_h_load(struct task_struct *p);
>>>
>>>  static const unsigned int sched_nr_migrate_break = 32;
>>>
>>> +static unsigned long task_h_load_avg(struct task_struct *p)
>>> +{
>>> +       u32 period = p->se.avg.runnable_avg_period;
>>> +       if (!period)
>>> +               return 0;
>>> +
>>> +       return task_h_load(p) * p->se.avg.runnable_avg_sum / period;
>>
>> How do you ensure that runnable_avg_period and runnable_avg_sum are
>> coherent ? an update of the statistic can occur in the middle of your
>> sequence.
>
> Hi, Vincent
>
> Don't we have the 'rq->lock' to protect it?
>
> move_tasks() was invoked with double locked, for all the se on src and
> dst rq, no update should happen, isn't it?

You're right, the double lock protects against concurrent accesses; my
remark doesn't apply here.

Regards,
Vincent

>
> Regards,
> Michael Wang
>
>>
>> Vincent
>>
>>> +}
>>> +
>>>  /*
>>>   * move_tasks tries to move up to imbalance weighted load from busiest to
>>>   * this_rq, as part of a balancing operation within domain "sd".
>>> @@ -4001,7 +4010,7 @@ static int move_tasks(struct lb_env *env)
>>>                 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
>>>                         goto next;
>>>
>>> -               load = task_h_load(p);
>>> +               load = task_h_load_avg(p);
>>>
>>>                 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
>>>                         goto next;
>>> --
>>> 1.7.12
>>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at  http://www.tux.org/lkml/
>>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/

^ permalink raw reply	[flat|nested] 44+ messages in thread

* Re: [patch v3 0/8] sched: use runnable avg in load balance
  2013-04-09  5:08 ` Alex Shi
@ 2013-04-10 13:12   ` Alex Shi
  0 siblings, 0 replies; 44+ messages in thread
From: Alex Shi @ 2013-04-10 13:12 UTC (permalink / raw)
  To: Alex Shi, peterz
  Cc: mingo, tglx, akpm, arjan, bp, pjt, namhyung, efault,
	morten.rasmussen, vincent.guittot, gregkh, preeti, viresh.kumar,
	linux-kernel, len.brown, rafael.j.wysocki, jkosina,
	clark.williams, tony.luck, keescook, mgorman, riel

On 04/09/2013 01:08 PM, Alex Shi wrote:
> On 04/02/2013 11:23 AM, Alex Shi wrote:
>> > This version resolved the aim7 liked benchmark issue by patch 8th.
>> > Thanks for MikeG's avg_idle that is a good bursty wakeup indicator.
>> > 
>> > The first 3 patches were also include in my power aware scheduling patchset.
>> > 
>> > Morten, you can rebase your patch on this new version that bases on latest
>> > Linus tree. :)
>> > 
>> > a git tree for this patchset:
>> > 	https://github.com/alexshi/power-scheduling.git runnablelb
> I removed the 3rd and 8th patches, left 1,2,4,5,6,7. and updated them in
> above git tree.
> 
> I tested the kbuild, specjbb2005, aim9, fileio-cfq, hackbench and
> dbench. on my NHM EP and 2 sockets SNB EP box. hackbench increased 3~5%
> on both machines. kbuild has suspicious 2% increase. others has no clear
> change.

This patchset has been tested by the ARM guys and Michael. No regression was found.

Peter, would you like to give some comments?

-- 
Thanks
    Alex

^ permalink raw reply	[flat|nested] 44+ messages in thread

end of thread, other threads:[~2013-04-10 13:12 UTC | newest]

Thread overview: 44+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-04-02  3:23 [patch v3 0/8] sched: use runnable avg in load balance Alex Shi
2013-04-02  3:23 ` [patch v3 1/8] Revert "sched: Introduce temporary FAIR_GROUP_SCHED dependency for load-tracking" Alex Shi
2013-04-02  3:23 ` [patch v3 2/8] sched: set initial value of runnable avg for new forked task Alex Shi
2013-04-02  3:23 ` [patch v3 3/8] sched: only count runnable avg on cfs_rq's nr_running Alex Shi
2013-04-03  3:19   ` Alex Shi
2013-04-02  3:23 ` [patch v3 4/8] sched: update cpu load after task_tick Alex Shi
2013-04-02  3:23 ` [patch v3 5/8] sched: compute runnable load avg in cpu_load and cpu_avg_load_per_task Alex Shi
2013-04-02  3:23 ` [patch v3 6/8] sched: consider runnable load average in move_tasks Alex Shi
2013-04-09  7:08   ` Vincent Guittot
2013-04-09  8:05     ` Alex Shi
2013-04-09  8:58       ` Vincent Guittot
2013-04-09 10:38         ` Alex Shi
2013-04-09 11:56           ` Vincent Guittot
2013-04-09 14:48             ` Alex Shi
2013-04-09 15:16               ` Vincent Guittot
2013-04-10  2:31                 ` Alex Shi
2013-04-10  6:07     ` Michael Wang
2013-04-10  6:55       ` Vincent Guittot
2013-04-02  3:23 ` [patch v3 7/8] sched: consider runnable load average in effective_load Alex Shi
2013-04-02  3:23 ` [patch v3 8/8] sched: use instant load for burst wake up Alex Shi
2013-04-02  7:23 ` [patch v3 0/8] sched: use runnable avg in load balance Michael Wang
2013-04-02  8:34   ` Mike Galbraith
2013-04-02  9:13     ` Michael Wang
2013-04-02  8:35   ` Alex Shi
2013-04-02  9:45     ` Michael Wang
2013-04-03  2:46     ` Michael Wang
2013-04-03  2:56       ` Alex Shi
2013-04-03  3:23         ` Michael Wang
2013-04-03  4:28           ` Alex Shi
2013-04-03  5:38             ` Michael Wang
2013-04-03  5:53               ` Michael Wang
2013-04-03  6:01               ` Alex Shi
2013-04-03  6:22             ` Michael Wang
2013-04-03  6:53               ` Alex Shi
2013-04-03  7:18                 ` Michael Wang
2013-04-03  7:28                   ` Alex Shi
2013-04-03  8:46   ` Alex Shi
2013-04-03  9:37     ` Michael Wang
2013-04-03 11:17       ` Alex Shi
2013-04-07  3:09     ` Michael Wang
2013-04-07  7:30       ` Alex Shi
2013-04-07  8:56         ` Michael Wang
2013-04-09  5:08 ` Alex Shi
2013-04-10 13:12   ` Alex Shi
