* [RFC][PATCH] sched: Optimize cgroup pick_next_task_fair
@ 2012-02-11  5:05 Peter Zijlstra
  2012-02-11  6:56 ` Mike Galbraith
                   ` (5 more replies)
  0 siblings, 6 replies; 14+ messages in thread
From: Peter Zijlstra @ 2012-02-11  5:05 UTC (permalink / raw)
  To: Paul Turner, Venkatesh Pallipadi; +Cc: Ingo Molnar, Mike Galbraith, LKML

Since commit 2f36825b1 ("sched: Next buddy hint on sleep and preempt
path") it is likely we pick a new task from the same cgroup; doing a put
and then a set on all intermediate entities is a waste of time, so try
to avoid this.
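
Roughly, the idea is to walk prev's entity chain and the newly picked
entity chain up to their first common cfs_rq and only put/set the part
of the hierarchy that actually changed; a sketch of that inner loop
(mirroring the fair.c hunk below, with se the new pick and pse ==
&prev->se):

	while (!(cfs_rq = is_same_group(se, pse))) {
		int se_depth = se->depth;
		int pse_depth = pse->depth;

		/* put the deeper (or equally deep) prev entity, walk it up */
		if (se_depth <= pse_depth) {
			put_prev_entity(cfs_rq_of(pse), pse);
			pse = parent_entity(pse);
		}
		/* set the deeper (or equally deep) new entity, walk it up */
		if (se_depth >= pse_depth) {
			set_next_entity(cfs_rq_of(se), se);
			se = parent_entity(se);
		}
	}
	/* finally swap prev for next on the common cfs_rq itself */
	put_prev_entity(cfs_rq, pse);
	set_next_entity(cfs_rq, se);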

XXX check put_prev_task()'s update_rq_clock() magic..

Compile tested only.. inspired by pjt's fast switch stories.

Not-quite-signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/sched.h    |    4 +-
 kernel/sched/core.c      |   11 ++---
 kernel/sched/fair.c      |   94 +++++++++++++++++++++++++++++-----------------
 kernel/sched/idle_task.c |    6 ++-
 kernel/sched/rt.c        |   10 ++++-
 kernel/sched/stop_task.c |    6 ++-
 6 files changed, 85 insertions(+), 46 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index a5be381..37e990b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1117,7 +1117,8 @@ struct sched_class {
 
 	void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
 
-	struct task_struct * (*pick_next_task) (struct rq *rq);
+	struct task_struct * (*pick_next_task) (struct rq *rq, 
+						struct task_struct *prev);
 	void (*put_prev_task) (struct rq *rq, struct task_struct *p);
 
 #ifdef CONFIG_SMP
@@ -1210,6 +1211,7 @@ struct sched_entity {
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+	int			depth;
 	struct sched_entity	*parent;
 	/* rq on which this entity is (to be) queued: */
 	struct cfs_rq		*cfs_rq;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d7c4322..7628d25 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3120,7 +3120,7 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev)
  * Pick up the highest-prio task:
  */
 static inline struct task_struct *
-pick_next_task(struct rq *rq)
+pick_next_task(struct rq *rq, struct task_struct *prev)
 {
 	const struct sched_class *class;
 	struct task_struct *p;
@@ -3130,13 +3130,13 @@ pick_next_task(struct rq *rq)
 	 * the fair class we can call that function directly:
 	 */
 	if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
-		p = fair_sched_class.pick_next_task(rq);
+		p = fair_sched_class.pick_next_task(rq, prev);
 		if (likely(p))
 			return p;
 	}
 
 	for_each_class(class) {
-		p = class->pick_next_task(rq);
+		p = class->pick_next_task(rq, prev);
 		if (p)
 			return p;
 	}
@@ -3197,8 +3197,7 @@ static void __sched __schedule(void)
 	if (unlikely(!rq->nr_running))
 		idle_balance(cpu, rq);
 
-	put_prev_task(rq, prev);
-	next = pick_next_task(rq);
+	next = pick_next_task(rq, prev);
 	clear_tsk_need_resched(prev);
 	rq->skip_clock_update = 0;
 
@@ -5102,7 +5101,7 @@ static void migrate_tasks(unsigned int dead_cpu)
 		if (rq->nr_running == 1)
 			break;
 
-		next = pick_next_task(rq);
+		next = pick_next_task(rq, NULL);
 		BUG_ON(!next);
 		next->sched_class->put_prev_task(rq, next);
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4ab60a2..4ea6f45 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -294,13 +294,13 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
 
 /* Do the two (enqueued) entities belong to the same group ? */
-static inline int
+static inline struct cfs_rq *
 is_same_group(struct sched_entity *se, struct sched_entity *pse)
 {
 	if (se->cfs_rq == pse->cfs_rq)
-		return 1;
+		return se->cfs_rq;
 
-	return 0;
+	return NULL;
 }
 
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
@@ -308,17 +308,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
 	return se->parent;
 }
 
-/* return depth at which a sched entity is present in the hierarchy */
-static inline int depth_se(struct sched_entity *se)
-{
-	int depth = 0;
-
-	for_each_sched_entity(se)
-		depth++;
-
-	return depth;
-}
-
 static void
 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 {
@@ -332,8 +321,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 	 */
 
 	/* First walk up until both entities are at same depth */
-	se_depth = depth_se(*se);
-	pse_depth = depth_se(*pse);
+	se_depth = (*se)->depth;
+	pse_depth = (*pse)->depth;
 
 	while (se_depth > pse_depth) {
 		se_depth--;
@@ -398,10 +387,10 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
 		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
 
-static inline int
+static inline struct cfs_rq *
 is_same_group(struct sched_entity *se, struct sched_entity *pse)
 {
-	return 1;
+	return cfs_rq_of(se); /* always the same rq */
 }
 
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
@@ -1134,10 +1123,10 @@ static void __clear_buddies_last(struct sched_entity *se)
 {
 	for_each_sched_entity(se) {
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
-		if (cfs_rq->last == se)
-			cfs_rq->last = NULL;
-		else
+		if (cfs_rq->last != se)
 			break;
+
+		cfs_rq->last = NULL;
 	}
 }
 
@@ -1145,10 +1134,10 @@ static void __clear_buddies_next(struct sched_entity *se)
 {
 	for_each_sched_entity(se) {
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
-		if (cfs_rq->next == se)
-			cfs_rq->next = NULL;
-		else
+		if (cfs_rq->next != se)
 			break;
+
+		cfs_rq->next = NULL;
 	}
 }
 
@@ -1156,10 +1145,10 @@ static void __clear_buddies_skip(struct sched_entity *se)
 {
 	for_each_sched_entity(se) {
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
-		if (cfs_rq->skip == se)
-			cfs_rq->skip = NULL;
-		else
+		if (cfs_rq->skip != se)
 			break;
+
+		cfs_rq->skip = NULL;
 	}
 }
 
@@ -2991,22 +2980,51 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 		set_last_buddy(se);
 }
 
-static struct task_struct *pick_next_task_fair(struct rq *rq)
+static struct task_struct *
+pick_next_task_fair(struct rq *rq, struct task_struct *prev)
 {
 	struct task_struct *p;
 	struct cfs_rq *cfs_rq = &rq->cfs;
-	struct sched_entity *se;
+	struct sched_entity *se, *pse;
 
 	if (!cfs_rq->nr_running)
 		return NULL;
 
+	if (prev && prev->sched_class != &fair_sched_class) {
+		prev->sched_class->put_prev_task(rq, prev);
+		prev = NULL;
+	}
+
 	do {
 		se = pick_next_entity(cfs_rq);
-		set_next_entity(cfs_rq, se);
+		if (!prev)
+			set_next_entity(cfs_rq, se);
 		cfs_rq = group_cfs_rq(se);
 	} while (cfs_rq);
 
 	p = task_of(se);
+
+	if (prev) {
+		pse = &prev->se;
+
+		while (!(cfs_rq = is_same_group(se, pse))) {
+			int se_depth = se->depth;
+			int pse_depth = pse->depth;
+
+			if (se_depth <= pse_depth) {
+				put_prev_entity(cfs_rq_of(pse), pse);
+				pse = parent_entity(pse);
+			}
+			if (se_depth >= pse_depth) {
+				set_next_entity(cfs_rq_of(se), se);
+				se = parent_entity(se);
+			}
+		}
+
+		put_prev_entity(cfs_rq, pse);
+		set_next_entity(cfs_rq, se);
+	}
+
 	if (hrtick_enabled(rq))
 		hrtick_start_fair(rq, p);
 
@@ -5358,6 +5376,8 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static void task_move_group_fair(struct task_struct *p, int on_rq)
 {
+	struct sched_entity *se = &p->se;
+
 	/*
 	 * If the task was not on the rq at the time of this cgroup movement
 	 * it must have been asleep, sleeping tasks keep their ->vruntime
@@ -5383,14 +5403,15 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
 	 * To prevent boost or penalty in the new cfs_rq caused by delta
 	 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
 	 */
-	if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
+	if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING))
 		on_rq = 1;
 
 	if (!on_rq)
-		p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
+		se->vruntime -= cfs_rq_of(se)->min_vruntime;
 	set_task_rq(p, task_cpu(p));
+	se->depth = se->parent->depth + 1;
 	if (!on_rq)
-		p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
+		se->vruntime += cfs_rq_of(se)->min_vruntime;
 }
 
 void free_fair_sched_group(struct task_group *tg)
@@ -5488,10 +5509,13 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 	if (!se)
 		return;
 
-	if (!parent)
+	if (!parent) {
 		se->cfs_rq = &rq->cfs;
-	else
+		se->depth = 0;
+	} else {
 		se->cfs_rq = parent->my_q;
+		se->depth = parent->depth + 1;
+	}
 
 	se->my_q = cfs_rq;
 	update_load_set(&se->load, 0);
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 91b4c95..7f5d232 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -22,8 +22,12 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
 	resched_task(rq->idle);
 }
 
-static struct task_struct *pick_next_task_idle(struct rq *rq)
+static struct task_struct *
+pick_next_task_idle(struct rq *rq, struct task_struct *prev)
 {
+	if (prev)
+		prev->sched_class->put_prev_task(rq, prev);
+
 	schedstat_inc(rq, sched_goidle);
 	calc_load_account_idle(rq);
 	return rq->idle;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index f42ae7f..0f6c316 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1343,9 +1343,15 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
 	return p;
 }
 
-static struct task_struct *pick_next_task_rt(struct rq *rq)
+static struct task_struct *
+pick_next_task_rt(struct rq *rq, struct task_struct *prev)
 {
-	struct task_struct *p = _pick_next_task_rt(rq);
+	struct task_struct *p;
+
+	if (prev)
+		prev->sched_class->put_prev_task(rq, prev);
+       
+	p = _pick_next_task_rt(rq);
 
 	/* The running task is never eligible for pushing */
 	if (p)
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 7b386e8..441cc57 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -23,10 +23,14 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
 	/* we're never preempted */
 }
 
-static struct task_struct *pick_next_task_stop(struct rq *rq)
+static struct task_struct *
+pick_next_task_stop(struct rq *rq, struct task_struct *prev)
 {
 	struct task_struct *stop = rq->stop;
 
+	if (prev)
+		prev->sched_class->put_prev_task(rq, prev);
+
 	if (stop && stop->on_rq)
 		return stop;
 




* Re: [RFC][PATCH] sched: Optimize cgroup pick_next_task_fair
  2012-02-11  5:05 [RFC][PATCH] sched: Optimize cgroup pick_next_task_fair Peter Zijlstra
@ 2012-02-11  6:56 ` Mike Galbraith
  2012-02-11 15:02   ` Peter Zijlstra
  2012-02-16 23:20 ` Peter Zijlstra
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 14+ messages in thread
From: Mike Galbraith @ 2012-02-11  6:56 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Paul Turner, Venkatesh Pallipadi, Ingo Molnar, LKML

On Sat, 2012-02-11 at 06:05 +0100, Peter Zijlstra wrote: 
> Since commit 2f36825b1 ("sched: Next buddy hint on sleep and preempt
> path") it is likely we pick a new task from the same cgroup; doing a put
> and then a set on all intermediate entities is a waste of time, so try
> to avoid this.

Good idea, we need to lose some weight.

> XXX check put_prev_task()'s update_rq_clock() magic..

I made that even more lovable ;-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d7c4322..7c1cfa6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3175,6 +3175,7 @@ need_resched:
 		} else {
 			deactivate_task(rq, prev, DEQUEUE_SLEEP);
 			prev->on_rq = 0;
+			rq->skip_clock_update = 1;
 
 			/*
 			 * If a worker went to sleep, notify and ask workqueue
@@ -3200,9 +3201,9 @@ need_resched:
 	put_prev_task(rq, prev);
 	next = pick_next_task(rq);
 	clear_tsk_need_resched(prev);
-	rq->skip_clock_update = 0;
 
 	if (likely(prev != next)) {
+		rq->skip_clock_update = 0;
 		rq->nr_switches++;
 		rq->curr = next;
 		++*switch_count;
@@ -3220,6 +3221,7 @@ need_resched:
 		raw_spin_unlock_irq(&rq->lock);
 
 	post_schedule(rq);
+	rq->skip_clock_update = 0;
 
 	preempt_enable_no_resched();
 	if (need_resched())

> Compile tested only.. inspired by pjt's fast switch stories.

I was looking forward to seeing those fast switch patches, nice spot to
cut lard.

> Not-quite-signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>

I'll see if I can give it some runtime.

-Mike



* Re: [RFC][PATCH] sched: Optimize cgroup pick_next_task_fair
  2012-02-11  6:56 ` Mike Galbraith
@ 2012-02-11 15:02   ` Peter Zijlstra
  0 siblings, 0 replies; 14+ messages in thread
From: Peter Zijlstra @ 2012-02-11 15:02 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Paul Turner, Venkatesh Pallipadi, Ingo Molnar, LKML

On Sat, 2012-02-11 at 07:56 +0100, Mike Galbraith wrote:
> On Sat, 2012-02-11 at 06:05 +0100, Peter Zijlstra wrote: 
> > Since commit 2f36825b1 ("sched: Next buddy hint on sleep and preempt
> > path") it is likely we pick a new task from the same cgroup; doing a put
> > and then a set on all intermediate entities is a waste of time, so try
> > to avoid this.
> 
> Good idea, we need to lose some weight. 

I actually set out to do something else, but then ended up here.. weird
how that works ;-)

My initial idea was to try and get rid of the pick_next_entity()
downward walk by keeping that data up-to-date when we dequeue/enqueue.

That all got a tad convoluted, I'll have to try it again. Anyway, that
ended up needing to rework the put/set stuff, and that part wasn't as
bad.



* Re: [RFC][PATCH] sched: Optimize cgroup pick_next_task_fair
  2012-02-11  5:05 [RFC][PATCH] sched: Optimize cgroup pick_next_task_fair Peter Zijlstra
  2012-02-11  6:56 ` Mike Galbraith
@ 2012-02-16 23:20 ` Peter Zijlstra
  2012-02-16 23:28   ` Peter Zijlstra
  2014-02-11 12:16 ` [tip:sched/core] sched/fair: Track cgroup depth tip-bot for Peter Zijlstra
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 14+ messages in thread
From: Peter Zijlstra @ 2012-02-16 23:20 UTC (permalink / raw)
  To: Paul Turner; +Cc: Venkatesh Pallipadi, Ingo Molnar, Mike Galbraith, LKML

On Sat, 2012-02-11 at 06:05 +0100, Peter Zijlstra wrote:
> Since commit 2f36825b1 ("sched: Next buddy hint on sleep and preempt
> path") it is likely we pick a new task from the same cgroup; doing a put
> and then a set on all intermediate entities is a waste of time, so try
> to avoid this.
> 
> XXX check put_prev_task()'s update_rq_clock() magic..
> 
> Compile tested only.. inspired by pjt's fast switch stories.
> 
> Not-quite-signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> 

Here's one that actually boots..

---
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1117,7 +1117,8 @@ struct sched_class {
 
 	void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
 
-	struct task_struct * (*pick_next_task) (struct rq *rq);
+	struct task_struct * (*pick_next_task) (struct rq *rq,
+						struct task_struct *prev);
 	void (*put_prev_task) (struct rq *rq, struct task_struct *p);
 
 #ifdef CONFIG_SMP
@@ -1210,6 +1211,7 @@ struct sched_entity {
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+	int			depth;
 	struct sched_entity	*parent;
 	/* rq on which this entity is (to be) queued: */
 	struct cfs_rq		*cfs_rq;
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3119,7 +3119,7 @@ static void put_prev_task(struct rq *rq,
  * Pick up the highest-prio task:
  */
 static inline struct task_struct *
-pick_next_task(struct rq *rq)
+pick_next_task(struct rq *rq, struct task_struct *prev)
 {
 	const struct sched_class *class;
 	struct task_struct *p;
@@ -3129,13 +3129,13 @@ pick_next_task(struct rq *rq)
 	 * the fair class we can call that function directly:
 	 */
 	if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
-		p = fair_sched_class.pick_next_task(rq);
+		p = fair_sched_class.pick_next_task(rq, prev);
 		if (likely(p))
 			return p;
 	}
 
 	for_each_class(class) {
-		p = class->pick_next_task(rq);
+		p = class->pick_next_task(rq, prev);
 		if (p)
 			return p;
 	}
@@ -3196,8 +3196,9 @@ static void __sched __schedule(void)
 	if (unlikely(!rq->nr_running))
 		idle_balance(cpu, rq);
 
-	put_prev_task(rq, prev);
-	next = pick_next_task(rq);
+	if (prev->on_rq || rq->skip_clock_update < 0)
+		update_rq_clock(rq);
+	next = pick_next_task(rq, prev);
 	clear_tsk_need_resched(prev);
 	rq->skip_clock_update = 0;
 
@@ -5101,7 +5102,7 @@ static void migrate_tasks(unsigned int d
 		if (rq->nr_running == 1)
 			break;
 
-		next = pick_next_task(rq);
+		next = pick_next_task(rq, NULL);
 		BUG_ON(!next);
 		next->sched_class->put_prev_task(rq, next);
 
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -294,13 +294,13 @@ static inline void list_del_leaf_cfs_rq(
 	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
 
 /* Do the two (enqueued) entities belong to the same group ? */
-static inline int
+static inline struct cfs_rq *
 is_same_group(struct sched_entity *se, struct sched_entity *pse)
 {
 	if (se->cfs_rq == pse->cfs_rq)
-		return 1;
+		return se->cfs_rq;
 
-	return 0;
+	return NULL;
 }
 
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
@@ -308,17 +308,6 @@ static inline struct sched_entity *paren
 	return se->parent;
 }
 
-/* return depth at which a sched entity is present in the hierarchy */
-static inline int depth_se(struct sched_entity *se)
-{
-	int depth = 0;
-
-	for_each_sched_entity(se)
-		depth++;
-
-	return depth;
-}
-
 static void
 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 {
@@ -332,8 +321,8 @@ find_matching_se(struct sched_entity **s
 	 */
 
 	/* First walk up until both entities are at same depth */
-	se_depth = depth_se(*se);
-	pse_depth = depth_se(*pse);
+	se_depth = (*se)->depth;
+	pse_depth = (*pse)->depth;
 
 	while (se_depth > pse_depth) {
 		se_depth--;
@@ -398,10 +387,10 @@ static inline void list_del_leaf_cfs_rq(
 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
 		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
 
-static inline int
+static inline struct cfs_rq *
 is_same_group(struct sched_entity *se, struct sched_entity *pse)
 {
-	return 1;
+	return cfs_rq_of(se); /* always the same rq */
 }
 
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
@@ -1136,10 +1125,10 @@ static void __clear_buddies_last(struct 
 {
 	for_each_sched_entity(se) {
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
-		if (cfs_rq->last == se)
-			cfs_rq->last = NULL;
-		else
+		if (cfs_rq->last != se)
 			break;
+
+		cfs_rq->last = NULL;
 	}
 }
 
@@ -1147,10 +1136,10 @@ static void __clear_buddies_next(struct 
 {
 	for_each_sched_entity(se) {
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
-		if (cfs_rq->next == se)
-			cfs_rq->next = NULL;
-		else
+		if (cfs_rq->next != se)
 			break;
+
+		cfs_rq->next = NULL;
 	}
 }
 
@@ -1158,10 +1147,10 @@ static void __clear_buddies_skip(struct 
 {
 	for_each_sched_entity(se) {
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
-		if (cfs_rq->skip == se)
-			cfs_rq->skip = NULL;
-		else
+		if (cfs_rq->skip != se)
 			break;
+
+		cfs_rq->skip = NULL;
 	}
 }
 
@@ -2993,22 +2982,52 @@ static void check_preempt_wakeup(struct 
 		set_last_buddy(se);
 }
 
-static struct task_struct *pick_next_task_fair(struct rq *rq)
+static struct task_struct *
+pick_next_task_fair(struct rq *rq, struct task_struct *prev)
 {
 	struct task_struct *p;
 	struct cfs_rq *cfs_rq = &rq->cfs;
-	struct sched_entity *se;
+	struct sched_entity *se, *pse;
 
 	if (!cfs_rq->nr_running)
 		return NULL;
 
+	if (prev && (prev->sched_class != &fair_sched_class ||
+				cfs_rq->nr_running == 1)) {
+		prev->sched_class->put_prev_task(rq, prev);
+		prev = NULL;
+	}
+
 	do {
 		se = pick_next_entity(cfs_rq);
-		set_next_entity(cfs_rq, se);
+		if (!prev)
+			set_next_entity(cfs_rq, se);
 		cfs_rq = group_cfs_rq(se);
 	} while (cfs_rq);
 
 	p = task_of(se);
+
+	if (prev) {
+		pse = &prev->se;
+
+		while (!(cfs_rq = is_same_group(se, pse))) {
+			int se_depth = se->depth;
+			int pse_depth = pse->depth;
+
+			if (se_depth <= pse_depth) {
+				put_prev_entity(cfs_rq_of(pse), pse);
+				pse = parent_entity(pse);
+			}
+			if (se_depth >= pse_depth) {
+				set_next_entity(cfs_rq_of(se), se);
+				se = parent_entity(se);
+			}
+		}
+
+		put_prev_entity(cfs_rq, pse);
+		set_next_entity(cfs_rq, se);
+	}
+
 	if (hrtick_enabled(rq))
 		hrtick_start_fair(rq, p);
 
@@ -5360,6 +5379,8 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static void task_move_group_fair(struct task_struct *p, int on_rq)
 {
+	struct sched_entity *se = &p->se;
+
 	/*
 	 * If the task was not on the rq at the time of this cgroup movement
 	 * it must have been asleep, sleeping tasks keep their ->vruntime
@@ -5385,14 +5406,16 @@ static void task_move_group_fair(struct 
 	 * To prevent boost or penalty in the new cfs_rq caused by delta
 	 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
 	 */
-	if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
+	if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING))
 		on_rq = 1;
 
 	if (!on_rq)
-		p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
+		se->vruntime -= cfs_rq_of(se)->min_vruntime;
 	set_task_rq(p, task_cpu(p));
+	if (se->parent)
+		se->depth = se->parent->depth + 1;
 	if (!on_rq)
-		p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
+		se->vruntime += cfs_rq_of(se)->min_vruntime;
 }
 
 void free_fair_sched_group(struct task_group *tg)
@@ -5490,10 +5513,13 @@ void init_tg_cfs_entry(struct task_group
 	if (!se)
 		return;
 
-	if (!parent)
+	if (!parent) {
 		se->cfs_rq = &rq->cfs;
-	else
+		se->depth = 0;
+	} else {
 		se->cfs_rq = parent->my_q;
+		se->depth = parent->depth + 1;
+	}
 
 	se->my_q = cfs_rq;
 	update_load_set(&se->load, 0);
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -22,8 +22,12 @@ static void check_preempt_curr_idle(stru
 	resched_task(rq->idle);
 }
 
-static struct task_struct *pick_next_task_idle(struct rq *rq)
+static struct task_struct *
+pick_next_task_idle(struct rq *rq, struct task_struct *prev)
 {
+	if (prev)
+		prev->sched_class->put_prev_task(rq, prev);
+
 	schedstat_inc(rq, sched_goidle);
 	calc_load_account_idle(rq);
 	return rq->idle;
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1321,15 +1321,7 @@ static struct task_struct *_pick_next_ta
 {
 	struct sched_rt_entity *rt_se;
 	struct task_struct *p;
-	struct rt_rq *rt_rq;
-
-	rt_rq = &rq->rt;
-
-	if (!rt_rq->rt_nr_running)
-		return NULL;
-
-	if (rt_rq_throttled(rt_rq))
-		return NULL;
+	struct rt_rq *rt_rq  = &rq->rt;
 
 	do {
 		rt_se = pick_next_rt_entity(rq, rt_rq);
@@ -1343,9 +1335,22 @@ static struct task_struct *_pick_next_ta
 	return p;
 }
 
-static struct task_struct *pick_next_task_rt(struct rq *rq)
+static struct task_struct *
+pick_next_task_rt(struct rq *rq, struct task_struct *prev)
 {
-	struct task_struct *p = _pick_next_task_rt(rq);
+	struct task_struct *p;
+	struct rt_rq *rt_rq = &rq->rt;
+
+	if (!rt_rq->rt_nr_running)
+		return NULL;
+
+	if (rt_rq_throttled(rt_rq))
+		return NULL;
+
+	if (prev)
+		prev->sched_class->put_prev_task(rq, prev);
+
+	p = _pick_next_task_rt(rq);
 
 	/* The running task is never eligible for pushing */
 	if (p)
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -23,12 +23,17 @@ check_preempt_curr_stop(struct rq *rq, s
 	/* we're never preempted */
 }
 
-static struct task_struct *pick_next_task_stop(struct rq *rq)
+static struct task_struct *
+pick_next_task_stop(struct rq *rq, struct task_struct *prev)
 {
 	struct task_struct *stop = rq->stop;
 
-	if (stop && stop->on_rq)
+	if (stop && stop->on_rq) {
+		if (prev)
+			prev->sched_class->put_prev_task(rq, prev);
+
 		return stop;
+	}
 
 	return NULL;
 }



* Re: [RFC][PATCH] sched: Optimize cgroup pick_next_task_fair
  2012-02-16 23:20 ` Peter Zijlstra
@ 2012-02-16 23:28   ` Peter Zijlstra
  0 siblings, 0 replies; 14+ messages in thread
From: Peter Zijlstra @ 2012-02-16 23:28 UTC (permalink / raw)
  To: Paul Turner; +Cc: Venkatesh Pallipadi, Ingo Molnar, Mike Galbraith, LKML

On Fri, 2012-02-17 at 00:20 +0100, Peter Zijlstra wrote:
> +       if (prev && (prev->sched_class != &fair_sched_class ||
> +                               cfs_rq->nr_running == 1)) { 

this ==1 check is way ugly and a problem that needs addressing. Not
putting cfs_rq->curr back in the tree perturbs pick_next_entity()'s view
of things.

Probably easiest to pick leftmost and ->curr and go from there, but
that's for tomorrow or so.
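
Something along these lines, assuming entity_before() can be used to
compare ->curr against the leftmost queued entity (rough sketch only,
not the final code):

static struct sched_entity *
pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
	struct sched_entity *left = __pick_first_entity(cfs_rq);

	/*
	 * curr was not put back into the tree; if the tree is empty, or
	 * curr runs before the leftmost queued entity, curr is the pick.
	 */
	if (!left || (curr && entity_before(curr, left)))
		left = curr;

	return left;
}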


* [tip:sched/core] sched/fair: Track cgroup depth
  2012-02-11  5:05 [RFC][PATCH] sched: Optimize cgroup pick_next_task_fair Peter Zijlstra
  2012-02-11  6:56 ` Mike Galbraith
  2012-02-16 23:20 ` Peter Zijlstra
@ 2014-02-11 12:16 ` tip-bot for Peter Zijlstra
  2014-02-11 12:16 ` [tip:sched/core] sched: Push put_prev_task() into pick_next_task() tip-bot for Peter Zijlstra
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 14+ messages in thread
From: tip-bot for Peter Zijlstra @ 2014-02-11 12:16 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: linux-kernel, hpa, mingo, peterz, tglx

Commit-ID:  fed14d45f945042a15b09de48d7d3d58d9455fc4
Gitweb:     http://git.kernel.org/tip/fed14d45f945042a15b09de48d7d3d58d9455fc4
Author:     Peter Zijlstra <peterz@infradead.org>
AuthorDate: Sat, 11 Feb 2012 06:05:00 +0100
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Mon, 10 Feb 2014 16:17:10 +0100

sched/fair: Track cgroup depth

Track the depth in the cgroup tree; this is useful for things like
find_matching_se(), where you need to get to a common parent of two
sched entities.

Keeping the depth avoids having to calculate it on the spot, which
saves a number of possible cache-misses.
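
With ->depth maintained, find_matching_se() reduces to a pair of cheap
integer compares plus a lock-step walk; roughly (a sketch, the shape of
the function otherwise unchanged):

static void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
	int se_depth = (*se)->depth;
	int pse_depth = (*pse)->depth;

	/* First walk up until both entities are at the same depth */
	while (se_depth > pse_depth) {
		se_depth--;
		*se = parent_entity(*se);
	}
	while (pse_depth > se_depth) {
		pse_depth--;
		*pse = parent_entity(*pse);
	}

	/* Then walk both up in lock-step until they share a cfs_rq */
	while (!is_same_group(*se, *pse)) {
		*se = parent_entity(*se);
		*pse = parent_entity(*pse);
	}
}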

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1328936700.2476.17.camel@laptop
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h |  1 +
 kernel/sched/fair.c   | 47 +++++++++++++++++++++--------------------------
 2 files changed, 22 insertions(+), 26 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index e3d5564..555e27d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1078,6 +1078,7 @@ struct sched_entity {
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+	int			depth;
 	struct sched_entity	*parent;
 	/* rq on which this entity is (to be) queued: */
 	struct cfs_rq		*cfs_rq;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 04fea77..748a7ac 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -322,13 +322,13 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
 
 /* Do the two (enqueued) entities belong to the same group ? */
-static inline int
+static inline struct cfs_rq *
 is_same_group(struct sched_entity *se, struct sched_entity *pse)
 {
 	if (se->cfs_rq == pse->cfs_rq)
-		return 1;
+		return se->cfs_rq;
 
-	return 0;
+	return NULL;
 }
 
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
@@ -336,17 +336,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
 	return se->parent;
 }
 
-/* return depth at which a sched entity is present in the hierarchy */
-static inline int depth_se(struct sched_entity *se)
-{
-	int depth = 0;
-
-	for_each_sched_entity(se)
-		depth++;
-
-	return depth;
-}
-
 static void
 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 {
@@ -360,8 +349,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 	 */
 
 	/* First walk up until both entities are at same depth */
-	se_depth = depth_se(*se);
-	pse_depth = depth_se(*pse);
+	se_depth = (*se)->depth;
+	pse_depth = (*pse)->depth;
 
 	while (se_depth > pse_depth) {
 		se_depth--;
@@ -426,10 +415,10 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
 		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
 
-static inline int
+static inline struct cfs_rq *
 is_same_group(struct sched_entity *se, struct sched_entity *pse)
 {
-	return 1;
+	return cfs_rq_of(se); /* always the same rq */
 }
 
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
@@ -7262,7 +7251,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static void task_move_group_fair(struct task_struct *p, int on_rq)
 {
+	struct sched_entity *se = &p->se;
 	struct cfs_rq *cfs_rq;
+
 	/*
 	 * If the task was not on the rq at the time of this cgroup movement
 	 * it must have been asleep, sleeping tasks keep their ->vruntime
@@ -7288,23 +7279,24 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
 	 * To prevent boost or penalty in the new cfs_rq caused by delta
 	 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
 	 */
-	if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
+	if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING))
 		on_rq = 1;
 
 	if (!on_rq)
-		p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
+		se->vruntime -= cfs_rq_of(se)->min_vruntime;
 	set_task_rq(p, task_cpu(p));
+	se->depth = se->parent ? se->parent->depth + 1 : 0;
 	if (!on_rq) {
-		cfs_rq = cfs_rq_of(&p->se);
-		p->se.vruntime += cfs_rq->min_vruntime;
+		cfs_rq = cfs_rq_of(se);
+		se->vruntime += cfs_rq->min_vruntime;
 #ifdef CONFIG_SMP
 		/*
 		 * migrate_task_rq_fair() will have removed our previous
 		 * contribution, but we must synchronize for ongoing future
 		 * decay.
 		 */
-		p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
-		cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
+		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
+		cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
 #endif
 	}
 }
@@ -7400,10 +7392,13 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 	if (!se)
 		return;
 
-	if (!parent)
+	if (!parent) {
 		se->cfs_rq = &rq->cfs;
-	else
+		se->depth = 0;
+	} else {
 		se->cfs_rq = parent->my_q;
+		se->depth = parent->depth + 1;
+	}
 
 	se->my_q = cfs_rq;
 	/* guarantee group entities always have weight */


* [tip:sched/core] sched: Push put_prev_task() into pick_next_task()
  2012-02-11  5:05 [RFC][PATCH] sched: Optimize cgroup pick_next_task_fair Peter Zijlstra
                   ` (2 preceding siblings ...)
  2014-02-11 12:16 ` [tip:sched/core] sched/fair: Track cgroup depth tip-bot for Peter Zijlstra
@ 2014-02-11 12:16 ` tip-bot for Peter Zijlstra
  2014-02-12  7:00   ` Kirill Tkhai
  2014-02-11 12:16 ` [tip:sched/core] sched/fair: Clean up the __clear_buddies_*() functions tip-bot for Peter Zijlstra
  2014-02-11 12:16 ` [tip:sched/core] sched/fair: Optimize cgroup pick_next_task_fair() tip-bot for Peter Zijlstra
  5 siblings, 1 reply; 14+ messages in thread
From: tip-bot for Peter Zijlstra @ 2014-02-11 12:16 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: linux-kernel, hpa, mingo, peterz, tglx

Commit-ID:  606dba2e289446600a0b68422ed2019af5355c12
Gitweb:     http://git.kernel.org/tip/606dba2e289446600a0b68422ed2019af5355c12
Author:     Peter Zijlstra <peterz@infradead.org>
AuthorDate: Sat, 11 Feb 2012 06:05:00 +0100
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Mon, 10 Feb 2014 16:17:13 +0100

sched: Push put_prev_task() into pick_next_task()

In order to avoid having to do put/set on a whole cgroup hierarchy
when we context switch, push the put into pick_next_task() so that
both operations are in the same function. Further changes then allow
us to possibly optimize away redundant work.
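
The convention each class now follows, sketched on the stop class
(assuming the same shape as the hunk below): return NULL without
touching @prev when the class has nothing to run, and put @prev only on
the path that actually returns a task.

static struct task_struct *
pick_next_task_stop(struct rq *rq, struct task_struct *prev)
{
	struct task_struct *stop = rq->stop;

	if (!stop || !stop->on_rq)
		return NULL;	/* leave @prev for the next class to consider */

	if (prev)
		prev->sched_class->put_prev_task(rq, prev);

	return stop;
}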

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1328936700.2476.17.camel@laptop
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c      | 21 ++++++++-------------
 kernel/sched/deadline.c  |  5 ++++-
 kernel/sched/fair.c      |  6 +++++-
 kernel/sched/idle_task.c |  6 +++++-
 kernel/sched/rt.c        | 27 ++++++++++++++++-----------
 kernel/sched/sched.h     |  8 +++++++-
 kernel/sched/stop_task.c | 16 ++++++++++------
 7 files changed, 55 insertions(+), 34 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 417cf65..dedb5f0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2579,18 +2579,11 @@ static inline void schedule_debug(struct task_struct *prev)
 	schedstat_inc(this_rq(), sched_count);
 }
 
-static void put_prev_task(struct rq *rq, struct task_struct *prev)
-{
-	if (prev->on_rq || rq->skip_clock_update < 0)
-		update_rq_clock(rq);
-	prev->sched_class->put_prev_task(rq, prev);
-}
-
 /*
  * Pick up the highest-prio task:
  */
 static inline struct task_struct *
-pick_next_task(struct rq *rq)
+pick_next_task(struct rq *rq, struct task_struct *prev)
 {
 	const struct sched_class *class;
 	struct task_struct *p;
@@ -2600,13 +2593,13 @@ pick_next_task(struct rq *rq)
 	 * the fair class we can call that function directly:
 	 */
 	if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
-		p = fair_sched_class.pick_next_task(rq);
+		p = fair_sched_class.pick_next_task(rq, prev);
 		if (likely(p))
 			return p;
 	}
 
 	for_each_class(class) {
-		p = class->pick_next_task(rq);
+		p = class->pick_next_task(rq, prev);
 		if (p)
 			return p;
 	}
@@ -2714,8 +2707,10 @@ need_resched:
 			rq->idle_stamp = 0;
 	}
 
-	put_prev_task(rq, prev);
-	next = pick_next_task(rq);
+	if (prev->on_rq || rq->skip_clock_update < 0)
+		update_rq_clock(rq);
+
+	next = pick_next_task(rq, prev);
 	clear_tsk_need_resched(prev);
 	clear_preempt_need_resched();
 	rq->skip_clock_update = 0;
@@ -4748,7 +4743,7 @@ static void migrate_tasks(unsigned int dead_cpu)
 		if (rq->nr_running == 1)
 			break;
 
-		next = pick_next_task(rq);
+		next = pick_next_task(rq, NULL);
 		BUG_ON(!next);
 		next->sched_class->put_prev_task(rq, next);
 
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index b5700bc..50797d5 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -990,7 +990,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
 	return rb_entry(left, struct sched_dl_entity, rb_node);
 }
 
-struct task_struct *pick_next_task_dl(struct rq *rq)
+struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
 {
 	struct sched_dl_entity *dl_se;
 	struct task_struct *p;
@@ -1001,6 +1001,9 @@ struct task_struct *pick_next_task_dl(struct rq *rq)
 	if (unlikely(!dl_rq->dl_nr_running))
 		return NULL;
 
+	if (prev)
+		prev->sched_class->put_prev_task(rq, prev);
+
 	dl_se = pick_next_dl_entity(rq, dl_rq);
 	BUG_ON(!dl_se);
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 748a7ac..c4bb0ac 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4655,7 +4655,8 @@ preempt:
 		set_last_buddy(se);
 }
 
-static struct task_struct *pick_next_task_fair(struct rq *rq)
+static struct task_struct *
+pick_next_task_fair(struct rq *rq, struct task_struct *prev)
 {
 	struct task_struct *p;
 	struct cfs_rq *cfs_rq = &rq->cfs;
@@ -4664,6 +4665,9 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
 	if (!cfs_rq->nr_running)
 		return NULL;
 
+	if (prev)
+		prev->sched_class->put_prev_task(rq, prev);
+
 	do {
 		se = pick_next_entity(cfs_rq);
 		set_next_entity(cfs_rq, se);
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 516c3d9..e5c922a 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -33,8 +33,12 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
 	resched_task(rq->idle);
 }
 
-static struct task_struct *pick_next_task_idle(struct rq *rq)
+static struct task_struct *
+pick_next_task_idle(struct rq *rq, struct task_struct *prev)
 {
+	if (prev)
+		prev->sched_class->put_prev_task(rq, prev);
+
 	schedstat_inc(rq, sched_goidle);
 #ifdef CONFIG_SMP
 	/* Trigger the post schedule to do an idle_enter for CFS */
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index a2740b7..a15ca1c 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1310,15 +1310,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
 {
 	struct sched_rt_entity *rt_se;
 	struct task_struct *p;
-	struct rt_rq *rt_rq;
-
-	rt_rq = &rq->rt;
-
-	if (!rt_rq->rt_nr_running)
-		return NULL;
-
-	if (rt_rq_throttled(rt_rq))
-		return NULL;
+	struct rt_rq *rt_rq  = &rq->rt;
 
 	do {
 		rt_se = pick_next_rt_entity(rq, rt_rq);
@@ -1332,9 +1324,22 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
 	return p;
 }
 
-static struct task_struct *pick_next_task_rt(struct rq *rq)
+static struct task_struct *
+pick_next_task_rt(struct rq *rq, struct task_struct *prev)
 {
-	struct task_struct *p = _pick_next_task_rt(rq);
+	struct task_struct *p;
+	struct rt_rq *rt_rq = &rq->rt;
+
+	if (!rt_rq->rt_nr_running)
+		return NULL;
+
+	if (rt_rq_throttled(rt_rq))
+		return NULL;
+
+	if (prev)
+		prev->sched_class->put_prev_task(rq, prev);
+
+	p = _pick_next_task_rt(rq);
 
 	/* The running task is never eligible for pushing */
 	if (p)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index bb89991..c534cf4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1105,7 +1105,13 @@ struct sched_class {
 
 	void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
 
-	struct task_struct * (*pick_next_task) (struct rq *rq);
+	/*
+	 * It is the responsibility of the pick_next_task() method that will
+	 * return the next task to call put_prev_task() on the @prev task or
+	 * something equivalent.
+	 */
+	struct task_struct * (*pick_next_task) (struct rq *rq,
+						struct task_struct *prev);
 	void (*put_prev_task) (struct rq *rq, struct task_struct *p);
 
 #ifdef CONFIG_SMP
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index fdb6bb0..a4147c9 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -23,16 +23,20 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
 	/* we're never preempted */
 }
 
-static struct task_struct *pick_next_task_stop(struct rq *rq)
+static struct task_struct *
+pick_next_task_stop(struct rq *rq, struct task_struct *prev)
 {
 	struct task_struct *stop = rq->stop;
 
-	if (stop && stop->on_rq) {
-		stop->se.exec_start = rq_clock_task(rq);
-		return stop;
-	}
+	if (!stop || !stop->on_rq)
+		return NULL;
 
-	return NULL;
+	if (prev)
+		prev->sched_class->put_prev_task(rq, prev);
+
+	stop->se.exec_start = rq_clock_task(rq);
+
+	return stop;
 }
 
 static void


* [tip:sched/core] sched/fair: Clean up the __clear_buddies_*() functions
  2012-02-11  5:05 [RFC][PATCH] sched: Optimize cgroup pick_next_task_fair Peter Zijlstra
                   ` (3 preceding siblings ...)
  2014-02-11 12:16 ` [tip:sched/core] sched: Push put_prev_task() into pick_next_task() tip-bot for Peter Zijlstra
@ 2014-02-11 12:16 ` tip-bot for Peter Zijlstra
  2014-02-11 12:16 ` [tip:sched/core] sched/fair: Optimize cgroup pick_next_task_fair() tip-bot for Peter Zijlstra
  5 siblings, 0 replies; 14+ messages in thread
From: tip-bot for Peter Zijlstra @ 2014-02-11 12:16 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: linux-kernel, hpa, mingo, peterz, tglx

Commit-ID:  f10447998a59b97747c16258a9c6e6a1512f27f3
Gitweb:     http://git.kernel.org/tip/f10447998a59b97747c16258a9c6e6a1512f27f3
Author:     Peter Zijlstra <peterz@infradead.org>
AuthorDate: Sat, 11 Feb 2012 06:05:00 +0100
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Mon, 10 Feb 2014 16:17:16 +0100

sched/fair: Clean up the __clear_buddies_*() functions

Slightly easier code flow, no functional changes.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1328936700.2476.17.camel@laptop
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c4bb0ac..8461721 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2739,10 +2739,10 @@ static void __clear_buddies_last(struct sched_entity *se)
 {
 	for_each_sched_entity(se) {
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
-		if (cfs_rq->last == se)
-			cfs_rq->last = NULL;
-		else
+		if (cfs_rq->last != se)
 			break;
+
+		cfs_rq->last = NULL;
 	}
 }
 
@@ -2750,10 +2750,10 @@ static void __clear_buddies_next(struct sched_entity *se)
 {
 	for_each_sched_entity(se) {
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
-		if (cfs_rq->next == se)
-			cfs_rq->next = NULL;
-		else
+		if (cfs_rq->next != se)
 			break;
+
+		cfs_rq->next = NULL;
 	}
 }
 
@@ -2761,10 +2761,10 @@ static void __clear_buddies_skip(struct sched_entity *se)
 {
 	for_each_sched_entity(se) {
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
-		if (cfs_rq->skip == se)
-			cfs_rq->skip = NULL;
-		else
+		if (cfs_rq->skip != se)
 			break;
+
+		cfs_rq->skip = NULL;
 	}
 }
 


* [tip:sched/core] sched/fair: Optimize cgroup pick_next_task_fair()
  2012-02-11  5:05 [RFC][PATCH] sched: Optimize cgroup pick_next_task_fair Peter Zijlstra
                   ` (4 preceding siblings ...)
  2014-02-11 12:16 ` [tip:sched/core] sched/fair: Clean up the __clear_buddies_*() functions tip-bot for Peter Zijlstra
@ 2014-02-11 12:16 ` tip-bot for Peter Zijlstra
  5 siblings, 0 replies; 14+ messages in thread
From: tip-bot for Peter Zijlstra @ 2014-02-11 12:16 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, hpa, mingo, torvalds, peterz, akpm, tj, tglx

Commit-ID:  678d5718d8d099421b0dd54c01b0528f4aaf5919
Gitweb:     http://git.kernel.org/tip/678d5718d8d099421b0dd54c01b0528f4aaf5919
Author:     Peter Zijlstra <peterz@infradead.org>
AuthorDate: Sat, 11 Feb 2012 06:05:00 +0100
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Mon, 10 Feb 2014 16:17:19 +0100

sched/fair: Optimize cgroup pick_next_task_fair()

Since commit 2f36825b1 ("sched: Next buddy hint on sleep and preempt
path") it is likely we pick a new task from the same cgroup; doing a put
and then a set on all intermediate entities is a waste of time, so try
to avoid this.

Measured using:

  mount nodev /cgroup -t cgroup -o cpu
  cd /cgroup
  mkdir a; cd a
  mkdir b; cd b
  mkdir c; cd c
  echo $$ > tasks
  perf stat --repeat 10 -- taskset 1 perf bench sched pipe

PRE :      4.542422684 seconds time elapsed   ( +-  0.33% )
POST:      4.389409991 seconds time elapsed   ( +-  0.32% )

Which shows a significant improvement of ~3.5%

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tejun Heo <tj@kernel.org>
Link: http://lkml.kernel.org/r/1328936700.2476.17.camel@laptop
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 122 ++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 110 insertions(+), 12 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8461721..a81b241 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2907,17 +2907,36 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
  * 3) pick the "last" process, for cache locality
  * 4) do not run the "skip" process, if something else is available
  */
-static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
+static struct sched_entity *
+pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 {
-	struct sched_entity *se = __pick_first_entity(cfs_rq);
-	struct sched_entity *left = se;
+	struct sched_entity *left = __pick_first_entity(cfs_rq);
+	struct sched_entity *se;
+
+	/*
+	 * If curr is set we have to see if its left of the leftmost entity
+	 * still in the tree, provided there was anything in the tree at all.
+	 */
+	if (!left || (curr && entity_before(curr, left)))
+		left = curr;
+
+	se = left; /* ideally we run the leftmost entity */
 
 	/*
 	 * Avoid running the skip buddy, if running something else can
 	 * be done without getting too unfair.
 	 */
 	if (cfs_rq->skip == se) {
-		struct sched_entity *second = __pick_next_entity(se);
+		struct sched_entity *second;
+
+		if (se == curr) {
+			second = __pick_first_entity(cfs_rq);
+		} else {
+			second = __pick_next_entity(se);
+			if (!second || (curr && entity_before(curr, second)))
+				second = curr;
+		}
+
 		if (second && wakeup_preempt_entity(second, left) < 1)
 			se = second;
 	}
@@ -2939,7 +2958,7 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
 	return se;
 }
 
-static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
 
 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 {
@@ -3594,22 +3613,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
 }
 
 /* conditionally throttle active cfs_rq's from put_prev_entity() */
-static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
 	if (!cfs_bandwidth_used())
-		return;
+		return false;
 
 	if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
-		return;
+		return false;
 
 	/*
 	 * it's possible for a throttled entity to be forced into a running
 	 * state (e.g. set_curr_task), in this case we're finished.
 	 */
 	if (cfs_rq_throttled(cfs_rq))
-		return;
+		return true;
 
 	throttle_cfs_rq(cfs_rq);
+	return true;
 }
 
 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
@@ -3719,7 +3739,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
 }
 
 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
-static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 
@@ -4658,9 +4678,86 @@ preempt:
 static struct task_struct *
 pick_next_task_fair(struct rq *rq, struct task_struct *prev)
 {
-	struct task_struct *p;
 	struct cfs_rq *cfs_rq = &rq->cfs;
 	struct sched_entity *se;
+	struct task_struct *p;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	if (!cfs_rq->nr_running)
+		return NULL;
+
+	if (!prev || prev->sched_class != &fair_sched_class)
+		goto simple;
+
+	/*
+	 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
+	 * likely that a next task is from the same cgroup as the current.
+	 *
+	 * Therefore attempt to avoid putting and setting the entire cgroup
+	 * hierarchy, only change the part that actually changes.
+	 */
+
+	do {
+		struct sched_entity *curr = cfs_rq->curr;
+
+		/*
+		 * Since we got here without doing put_prev_entity() we also
+		 * have to consider cfs_rq->curr. If it is still a runnable
+		 * entity, update_curr() will update its vruntime, otherwise
+		 * forget we've ever seen it.
+		 */
+		if (curr && curr->on_rq)
+			update_curr(cfs_rq);
+		else
+			curr = NULL;
+
+		/*
+		 * This call to check_cfs_rq_runtime() will do the throttle and
+		 * dequeue its entity in the parent(s). Therefore the 'simple'
+		 * nr_running test will indeed be correct.
+		 */
+		if (unlikely(check_cfs_rq_runtime(cfs_rq)))
+			goto simple;
+
+		se = pick_next_entity(cfs_rq, curr);
+		cfs_rq = group_cfs_rq(se);
+	} while (cfs_rq);
+
+	p = task_of(se);
+
+	/*
+	 * Since we haven't yet done put_prev_entity and if the selected task
+	 * is a different task than we started out with, try and touch the
+	 * least amount of cfs_rqs.
+	 */
+	if (prev != p) {
+		struct sched_entity *pse = &prev->se;
+
+		while (!(cfs_rq = is_same_group(se, pse))) {
+			int se_depth = se->depth;
+			int pse_depth = pse->depth;
+
+			if (se_depth <= pse_depth) {
+				put_prev_entity(cfs_rq_of(pse), pse);
+				pse = parent_entity(pse);
+			}
+			if (se_depth >= pse_depth) {
+				set_next_entity(cfs_rq_of(se), se);
+				se = parent_entity(se);
+			}
+		}
+
+		put_prev_entity(cfs_rq, pse);
+		set_next_entity(cfs_rq, se);
+	}
+
+	if (hrtick_enabled(rq))
+		hrtick_start_fair(rq, p);
+
+	return p;
+simple:
+	cfs_rq = &rq->cfs;
+#endif
 
 	if (!cfs_rq->nr_running)
 		return NULL;
@@ -4669,12 +4766,13 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev)
 		prev->sched_class->put_prev_task(rq, prev);
 
 	do {
-		se = pick_next_entity(cfs_rq);
+		se = pick_next_entity(cfs_rq, NULL);
 		set_next_entity(cfs_rq, se);
 		cfs_rq = group_cfs_rq(se);
 	} while (cfs_rq);
 
 	p = task_of(se);
+
 	if (hrtick_enabled(rq))
 		hrtick_start_fair(rq, p);
 


* Re: [tip:sched/core] sched: Push put_prev_task() into pick_next_task()
  2014-02-11 12:16 ` [tip:sched/core] sched: Push put_prev_task() into pick_next_task() tip-bot for Peter Zijlstra
@ 2014-02-12  7:00   ` Kirill Tkhai
  2014-02-12 11:43     ` Peter Zijlstra
  2014-02-12 14:06     ` Peter Zijlstra
  0 siblings, 2 replies; 14+ messages in thread
From: Kirill Tkhai @ 2014-02-12  7:00 UTC (permalink / raw)
  To: mingo, hpa, linux-kernel, peterz, tglx, linux-tip-commits



11.02.2014, 16:17, "tip-bot for Peter Zijlstra" <tipbot@zytor.com>:
> Commit-ID:  606dba2e289446600a0b68422ed2019af5355c12
> Gitweb:     http://git.kernel.org/tip/606dba2e289446600a0b68422ed2019af5355c12
> Author:     Peter Zijlstra <peterz@infradead.org>
> AuthorDate: Sat, 11 Feb 2012 06:05:00 +0100
> Committer:  Ingo Molnar <mingo@kernel.org>
> CommitDate: Mon, 10 Feb 2014 16:17:13 +0100
>
> sched: Push put_prev_task() into pick_next_task()
>
> In order to avoid having to do put/set on a whole cgroup hierarchy
> when we context switch, push the put into pick_next_task() so that
> both operations are in the same function. Further changes then allow
> us to possibly optimize away redundant work.
>
> Signed-off-by: Peter Zijlstra <peterz@infradead.org>
> Link: http://lkml.kernel.org/r/1328936700.2476.17.camel@laptop
> Signed-off-by: Ingo Molnar <mingo@kernel.org>
> ---
>  kernel/sched/core.c      | 21 ++++++++-------------
>  kernel/sched/deadline.c  |  5 ++++-
>  kernel/sched/fair.c      |  6 +++++-
>  kernel/sched/idle_task.c |  6 +++++-
>  kernel/sched/rt.c        | 27 ++++++++++++++++-----------
>  kernel/sched/sched.h     |  8 +++++++-
>  kernel/sched/stop_task.c | 16 ++++++++++------
>  7 files changed, 55 insertions(+), 34 deletions(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 417cf65..dedb5f0 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2579,18 +2579,11 @@ static inline void schedule_debug(struct task_struct *prev)
>          schedstat_inc(this_rq(), sched_count);
>  }
>
> -static void put_prev_task(struct rq *rq, struct task_struct *prev)
> -{
> - if (prev->on_rq || rq->skip_clock_update < 0)
> - update_rq_clock(rq);
> - prev->sched_class->put_prev_task(rq, prev);
> -}
> -
>  /*
>   * Pick up the highest-prio task:
>   */
>  static inline struct task_struct *
> -pick_next_task(struct rq *rq)
> +pick_next_task(struct rq *rq, struct task_struct *prev)
>  {
>          const struct sched_class *class;
>          struct task_struct *p;
> @@ -2600,13 +2593,13 @@ pick_next_task(struct rq *rq)
>           * the fair class we can call that function directly:
>           */
>          if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
> - p = fair_sched_class.pick_next_task(rq);
> + p = fair_sched_class.pick_next_task(rq, prev);
>                  if (likely(p))
>                          return p;
>          }
>
>          for_each_class(class) {
> - p = class->pick_next_task(rq);
> + p = class->pick_next_task(rq, prev);
>                  if (p)
>                          return p;
>          }
> @@ -2714,8 +2707,10 @@ need_resched:
>                          rq->idle_stamp = 0;
>          }
>
> - put_prev_task(rq, prev);
> - next = pick_next_task(rq);
> + if (prev->on_rq || rq->skip_clock_update < 0)
> + update_rq_clock(rq);
> +
> + next = pick_next_task(rq, prev);
>          clear_tsk_need_resched(prev);
>          clear_preempt_need_resched();
>          rq->skip_clock_update = 0;
> @@ -4748,7 +4743,7 @@ static void migrate_tasks(unsigned int dead_cpu)
>                  if (rq->nr_running == 1)
>                          break;
>
> - next = pick_next_task(rq);
> + next = pick_next_task(rq, NULL);

pick_next_task() firstly checks for prev->sched_class, doesn't it ???

The same for pick_next_task_rt().

>                  BUG_ON(!next);
>                  next->sched_class->put_prev_task(rq, next);
>
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index b5700bc..50797d5 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -990,7 +990,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
>          return rb_entry(left, struct sched_dl_entity, rb_node);
>  }
>
> -struct task_struct *pick_next_task_dl(struct rq *rq)
> +struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
>  {
>          struct sched_dl_entity *dl_se;
>          struct task_struct *p;
> @@ -1001,6 +1001,9 @@ struct task_struct *pick_next_task_dl(struct rq *rq)
>          if (unlikely(!dl_rq->dl_nr_running))
>                  return NULL;
>
> + if (prev)
> + prev->sched_class->put_prev_task(rq, prev);
> +
>          dl_se = pick_next_dl_entity(rq, dl_rq);
>          BUG_ON(!dl_se);
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 748a7ac..c4bb0ac 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4655,7 +4655,8 @@ preempt:
>                  set_last_buddy(se);
>  }
>
> -static struct task_struct *pick_next_task_fair(struct rq *rq)
> +static struct task_struct *
> +pick_next_task_fair(struct rq *rq, struct task_struct *prev)
>  {
>          struct task_struct *p;
>          struct cfs_rq *cfs_rq = &rq->cfs;
> @@ -4664,6 +4665,9 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
>          if (!cfs_rq->nr_running)
>                  return NULL;
>
> + if (prev)
> + prev->sched_class->put_prev_task(rq, prev);
> +
>          do {
>                  se = pick_next_entity(cfs_rq);
>                  set_next_entity(cfs_rq, se);
> diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
> index 516c3d9..e5c922a 100644
> --- a/kernel/sched/idle_task.c
> +++ b/kernel/sched/idle_task.c
> @@ -33,8 +33,12 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
>          resched_task(rq->idle);
>  }
>
> -static struct task_struct *pick_next_task_idle(struct rq *rq)
> +static struct task_struct *
> +pick_next_task_idle(struct rq *rq, struct task_struct *prev)
>  {
> + if (prev)
> + prev->sched_class->put_prev_task(rq, prev);
> +
>          schedstat_inc(rq, sched_goidle);
>  #ifdef CONFIG_SMP
>          /* Trigger the post schedule to do an idle_enter for CFS */
> diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
> index a2740b7..a15ca1c 100644
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -1310,15 +1310,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
>  {
>          struct sched_rt_entity *rt_se;
>          struct task_struct *p;
> - struct rt_rq *rt_rq;
> -
> - rt_rq = &rq->rt;
> -
> - if (!rt_rq->rt_nr_running)
> - return NULL;
> -
> - if (rt_rq_throttled(rt_rq))
> - return NULL;
> + struct rt_rq *rt_rq  = &rq->rt;
>
>          do {
>                  rt_se = pick_next_rt_entity(rq, rt_rq);
> @@ -1332,9 +1324,22 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
>          return p;
>  }
>
> -static struct task_struct *pick_next_task_rt(struct rq *rq)
> +static struct task_struct *
> +pick_next_task_rt(struct rq *rq, struct task_struct *prev)
>  {
> - struct task_struct *p = _pick_next_task_rt(rq);
> + struct task_struct *p;
> + struct rt_rq *rt_rq = &rq->rt;
> +
> + if (!rt_rq->rt_nr_running)
> + return NULL;
> +
> + if (rt_rq_throttled(rt_rq))
> + return NULL;
> +
> + if (prev)
> + prev->sched_class->put_prev_task(rq, prev);
> +
> + p = _pick_next_task_rt(rq);
>
>          /* The running task is never eligible for pushing */
>          if (p)
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index bb89991..c534cf4 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1105,7 +1105,13 @@ struct sched_class {
>
>          void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
>
> -        struct task_struct * (*pick_next_task) (struct rq *rq);
> +        /*
> +         * It is the responsibility of the pick_next_task() method that will
> +         * return the next task to call put_prev_task() on the @prev task or
> +         * something equivalent.
> +         */
> +        struct task_struct * (*pick_next_task) (struct rq *rq,
> +                                                struct task_struct *prev);
>          void (*put_prev_task) (struct rq *rq, struct task_struct *p);
>
>  #ifdef CONFIG_SMP
> diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
> index fdb6bb0..a4147c9 100644
> --- a/kernel/sched/stop_task.c
> +++ b/kernel/sched/stop_task.c
> @@ -23,16 +23,20 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
>          /* we're never preempted */
>  }
>
> -static struct task_struct *pick_next_task_stop(struct rq *rq)
> +static struct task_struct *
> +pick_next_task_stop(struct rq *rq, struct task_struct *prev)
>  {
>          struct task_struct *stop = rq->stop;
>
> -        if (stop && stop->on_rq) {
> -                stop->se.exec_start = rq_clock_task(rq);
> -                return stop;
> -        }
> +        if (!stop || !stop->on_rq)
> +                return NULL;
>
> -        return NULL;
> +        if (prev)
> +                prev->sched_class->put_prev_task(rq, prev);
> +
> +        stop->se.exec_start = rq_clock_task(rq);
> +
> +        return stop;
>  }
>
>  static void

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [tip:sched/core] sched: Push put_prev_task() into pick_next_task( )
  2014-02-12  7:00   ` Kirill Tkhai
@ 2014-02-12 11:43     ` Peter Zijlstra
  2014-02-12 14:06     ` Peter Zijlstra
  1 sibling, 0 replies; 14+ messages in thread
From: Peter Zijlstra @ 2014-02-12 11:43 UTC (permalink / raw)
  To: Kirill Tkhai; +Cc: mingo, hpa, linux-kernel, tglx, linux-tip-commits

On Wed, Feb 12, 2014 at 11:00:53AM +0400, Kirill Tkhai wrote:
> > @@ -4748,7 +4743,7 @@ static void migrate_tasks(unsigned int dead_cpu)
> >                  if (rq->nr_running == 1)
> >                          break;
> >
> > -                next = pick_next_task(rq);
> > +                next = pick_next_task(rq, NULL);
> 
> pick_next_task() firstly checks for prev->sched_class, doesn't it ???
> 
> The same for pick_next_task_rt().

Right you are... and pick_next_task_dl(). Smatch caught the _{rt,dl}
ones; it failed to spot this one, though.
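
For context, both smatch warnings boil down to the same shape: prev is
dereferenced by the pushed-down pre_schedule() logic before it reaches the
NULL check that guards put_prev_task(). A rough, abridged reconstruction of
the RT case (illustration only, assuming the old pre_schedule_rt() condition;
not a quote of the actual tree):

static struct task_struct *
pick_next_task_rt(struct rq *rq, struct task_struct *prev)
{
        struct rt_rq *rt_rq = &rq->rt;
        struct task_struct *p;

        /* pushed-down pre_schedule(): dereferences prev->prio ... */
        if (rq->rt.highest_prio.curr > prev->prio)
                pull_rt_task(rq);

        if (!rt_rq->rt_nr_running)
                return NULL;

        if (rt_rq_throttled(rt_rq))
                return NULL;

        /* ... while prev is only NULL-checked down here */
        if (prev)
                prev->sched_class->put_prev_task(rq, prev);

        p = _pick_next_task_rt(rq);
        return p;
}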



^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [tip:sched/core] sched: Push put_prev_task() into pick_next_task( )
  2014-02-12  7:00   ` Kirill Tkhai
  2014-02-12 11:43     ` Peter Zijlstra
@ 2014-02-12 14:06     ` Peter Zijlstra
  2014-02-12 14:24       ` Kirill Tkhai
  1 sibling, 1 reply; 14+ messages in thread
From: Peter Zijlstra @ 2014-02-12 14:06 UTC (permalink / raw)
  To: Kirill Tkhai
  Cc: mingo, hpa, linux-kernel, tglx, linux-tip-commits,
	Steven Rostedt, Juri Lelli, Dan Carpenter

On Wed, Feb 12, 2014 at 11:00:53AM +0400, Kirill Tkhai wrote:
> > @@ -4748,7 +4743,7 @@ static void migrate_tasks(unsigned int dead_cpu)
> >                  if (rq->nr_running == 1)
> >                          break;
> >
> > -                next = pick_next_task(rq);
> > +                next = pick_next_task(rq, NULL);
> 
> pick_next_task() firstly checks for prev->sched_class, doesn't it ???
> 
> The same for pick_next_task_rt().


OK, how about something like this?

---
Subject: sched: Fix hotplug task migration
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 12 Feb 2014 10:49:30 +0100

Dan Carpenter reported:

> kernel/sched/rt.c:1347 pick_next_task_rt() warn: variable dereferenced before check 'prev' (see line 1338)
> kernel/sched/deadline.c:1011 pick_next_task_dl() warn: variable dereferenced before check 'prev' (see line 1005)

Kirill also spotted that migrate_tasks() will have an instant NULL
deref because pick_next_task() will immediately deref prev.

Instead of fixing up all the corner cases that arise because
migrate_tasks() can pass in a NULL prev task in the unlikely case of
hot-unplug, provide a fake task so that we can remove all the NULL
checks from the far more common paths.

A further problem, not previously spotted, is that because we pushed
pre_schedule() and idle_balance() into pick_next_task() we now need to
avoid those getting called and pulling more tasks onto our dying CPU.

We avoid pull_{dl,rt}_task() by setting fake_task.prio to MAX_PRIO+1.
We also note that since we call pick_next_task() exactly as many times
as there are runnable tasks present, we should never land in
idle_balance().

Fixes: 38033c37faab ("sched: Push down pre_schedule() and idle_balance()")
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Juri Lelli <juri.lelli@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Reported-by: Kirill Tkhai <tkhai@yandex.ru>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20140212094930.GB3545@laptop.programming.kicks-ass.net
---
 kernel/sched/core.c      |   18 +++++++++++++++++-
 kernel/sched/deadline.c  |    3 +--
 kernel/sched/fair.c      |    5 ++---
 kernel/sched/idle_task.c |    3 +--
 kernel/sched/rt.c        |    3 +--
 kernel/sched/stop_task.c |    3 +--
 6 files changed, 23 insertions(+), 12 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4681,6 +4681,22 @@ static void calc_load_migrate(struct rq
 		atomic_long_add(delta, &calc_load_tasks);
 }
 
+static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
+{
+}
+
+static const struct sched_class fake_sched_class = {
+	.put_prev_task = put_prev_task_fake,
+};
+
+static struct task_struct fake_task = {
+	/*
+	 * Avoid pull_{rt,dl}_task()
+	 */
+	.prio = MAX_PRIO + 1,
+	.sched_class = &fake_sched_class,
+};
+
 /*
  * Migrate all tasks from the rq, sleeping tasks will be migrated by
  * try_to_wake_up()->select_task_rq().
@@ -4721,7 +4737,7 @@ static void migrate_tasks(unsigned int d
 		if (rq->nr_running == 1)
 			break;
 
-		next = pick_next_task(rq, NULL);
+		next = pick_next_task(rq, &fake_task);
 		BUG_ON(!next);
 		next->sched_class->put_prev_task(rq, next);
 
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1008,8 +1008,7 @@ struct task_struct *pick_next_task_dl(st
 	if (unlikely(!dl_rq->dl_nr_running))
 		return NULL;
 
-	if (prev)
-		prev->sched_class->put_prev_task(rq, prev);
+	prev->sched_class->put_prev_task(rq, prev);
 
 	dl_se = pick_next_dl_entity(rq, dl_rq);
 	BUG_ON(!dl_se);
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4690,7 +4690,7 @@ pick_next_task_fair(struct rq *rq, struc
 	if (!cfs_rq->nr_running)
 		goto idle;
 
-	if (!prev || prev->sched_class != &fair_sched_class)
+	if (prev->sched_class != &fair_sched_class)
 		goto simple;
 
 	/*
@@ -4766,8 +4766,7 @@ pick_next_task_fair(struct rq *rq, struc
 	if (!cfs_rq->nr_running)
 		goto idle;
 
-	if (prev)
-		prev->sched_class->put_prev_task(rq, prev);
+	prev->sched_class->put_prev_task(rq, prev);
 
 	do {
 		se = pick_next_entity(cfs_rq, NULL);
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -26,8 +26,7 @@ static void check_preempt_curr_idle(stru
 static struct task_struct *
 pick_next_task_idle(struct rq *rq, struct task_struct *prev)
 {
-	if (prev)
-		prev->sched_class->put_prev_task(rq, prev);
+	prev->sched_class->put_prev_task(rq, prev);
 
 	schedstat_inc(rq, sched_goidle);
 #ifdef CONFIG_SMP
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1344,8 +1344,7 @@ pick_next_task_rt(struct rq *rq, struct
 	if (rt_rq_throttled(rt_rq))
 		return NULL;
 
-	if (prev)
-		prev->sched_class->put_prev_task(rq, prev);
+	prev->sched_class->put_prev_task(rq, prev);
 
 	p = _pick_next_task_rt(rq);
 
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -31,8 +31,7 @@ pick_next_task_stop(struct rq *rq, struc
 	if (!stop || !stop->on_rq)
 		return NULL;
 
-	if (prev)
-		prev->sched_class->put_prev_task(rq, prev);
+	prev->sched_class->put_prev_task(rq, prev);
 
 	stop->se.exec_start = rq_clock_task(rq);
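
As a side note on the MAX_PRIO+1 trick in the changelog above: both pull
paths key off prev's priority, and no task sitting on a runqueue can have a
priority value above MAX_PRIO, so the pushed-down pull logic can never fire
for the fake task. A sketch of the idea, assuming pre_schedule-style
conditions rather than quoting the tree:

        /*
         * rq->rt.highest_prio.curr never exceeds MAX_PRIO, so with
         * prev == &fake_task (prio == MAX_PRIO + 1) this comparison is
         * always false and pull_rt_task() stays away from the dying CPU.
         */
        if (rq->rt.highest_prio.curr > prev->prio)
                pull_rt_task(rq);

        /*
         * Likewise, dl_task(&fake_task) is false for such a priority, so a
         * deadline pull gated on dl_task(prev) is skipped as well.
         */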
 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [tip:sched/core] sched: Push put_prev_task() into pick_next_task( )
  2014-02-12 14:06     ` Peter Zijlstra
@ 2014-02-12 14:24       ` Kirill Tkhai
  2014-02-12 14:54         ` Peter Zijlstra
  0 siblings, 1 reply; 14+ messages in thread
From: Kirill Tkhai @ 2014-02-12 14:24 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, hpa, linux-kernel, tglx, linux-tip-commits,
	Steven Rostedt, Juri Lelli, Dan Carpenter



12.02.2014, 18:06, "Peter Zijlstra" <peterz@infradead.org>:
> On Wed, Feb 12, 2014 at 11:00:53AM +0400, Kirill Tkhai wrote:
>
>>>  @@ -4748,7 +4743,7 @@ static void migrate_tasks(unsigned int dead_cpu)
>>>                   if (rq->nr_running == 1)
>>>                           break;
>>>
>>>  -                next = pick_next_task(rq);
>>>  +                next = pick_next_task(rq, NULL);
>>  pick_next_task() firstly checks for prev->sched_class, doesn't it ???
>>
>>  The same for pick_next_task_rt().
>
> OK, how about something like this?

Good for me.

Maybe pack prev->sched_class->put_prev_task() into an inline function?

Kirill


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [tip:sched/core] sched: Push put_prev_task() into pick_next_task( )
  2014-02-12 14:24       ` Kirill Tkhai
@ 2014-02-12 14:54         ` Peter Zijlstra
  0 siblings, 0 replies; 14+ messages in thread
From: Peter Zijlstra @ 2014-02-12 14:54 UTC (permalink / raw)
  To: Kirill Tkhai
  Cc: mingo, hpa, linux-kernel, tglx, linux-tip-commits,
	Steven Rostedt, Juri Lelli, Dan Carpenter

On Wed, Feb 12, 2014 at 06:24:18PM +0400, Kirill Tkhai wrote:
> Maybe pack prev->sched_class->put_prev_task() into an inline function?

Good suggestion, done.
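
For reference, the helper presumably ends up as a thin wrapper in
kernel/sched/sched.h; a sketch of the intent (not necessarily the exact
committed code):

static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
{
        /* single place that does the class indirection for "put prev" */
        prev->sched_class->put_prev_task(rq, prev);
}

With that, the per-class pick_next_task() implementations can simply call
put_prev_task(rq, prev) instead of open-coding the indirection through
prev->sched_class.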

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2014-02-12 14:54 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-02-11  5:05 [RFC][PATCH] sched: Optimize cgroup pick_next_task_fair Peter Zijlstra
2012-02-11  6:56 ` Mike Galbraith
2012-02-11 15:02   ` Peter Zijlstra
2012-02-16 23:20 ` Peter Zijlstra
2012-02-16 23:28   ` Peter Zijlstra
2014-02-11 12:16 ` [tip:sched/core] sched/fair: Track cgroup depth tip-bot for Peter Zijlstra
2014-02-11 12:16 ` [tip:sched/core] sched: Push put_prev_task() into pick_next_task( ) tip-bot for Peter Zijlstra
2014-02-12  7:00   ` Kirill Tkhai
2014-02-12 11:43     ` Peter Zijlstra
2014-02-12 14:06     ` Peter Zijlstra
2014-02-12 14:24       ` Kirill Tkhai
2014-02-12 14:54         ` Peter Zijlstra
2014-02-11 12:16 ` [tip:sched/core] sched/fair: Clean up the __clear_buddies_*() functions tip-bot for Peter Zijlstra
2014-02-11 12:16 ` [tip:sched/core] sched/fair: Optimize cgroup pick_next_task_fair( ) tip-bot for Peter Zijlstra

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).