* [RFC][PATCH] sched: Optimize cgroup pick_next_task_fair
@ 2012-02-11  5:05 Peter Zijlstra
  2012-02-11  6:56 ` Mike Galbraith
                   ` (5 more replies)
  0 siblings, 6 replies; 14+ messages in thread
From: Peter Zijlstra @ 2012-02-11  5:05 UTC (permalink / raw)
  To: Paul Turner, Venkatesh Pallipadi; +Cc: Ingo Molnar, Mike Galbraith, LKML

Since commit 2f36825b1 ("sched: Next buddy hint on sleep and preempt
path") it is likely we pick a new task from the same cgroup; doing a put
and then a set on all intermediate entities is a waste of time, so try to
avoid this.
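
For illustration: with prev and next both several cgroup levels deep, the
pre-patch path does roughly the following (a simplified sketch of
put_prev_task_fair() plus the old pick_next_task_fair() loop, not verbatim
kernel code):

	/* put the whole prev hierarchy, leaf -> root */
	for_each_sched_entity(pse)
		put_prev_entity(cfs_rq_of(pse), pse);

	/* then set the whole next hierarchy, root -> leaf */
	do {
		se = pick_next_entity(cfs_rq);
		set_next_entity(cfs_rq, se);
		cfs_rq = group_cfs_rq(se);
	} while (cfs_rq);

When both tasks live in the same cgroup most of those put/set pairs cancel
out; the patch below only touches the entities below the first common
ancestor.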

XXX check put_prev_task()'s update_rq_clock() magic..

Compile tested only.. inspired by pjt's fast switch stories.

Not-quite-signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/linux/sched.h    |    4 +-
 kernel/sched/core.c      |   11 ++---
 kernel/sched/fair.c      |   94 +++++++++++++++++++++++++++++-----------------
 kernel/sched/idle_task.c |    6 ++-
 kernel/sched/rt.c        |   10 ++++-
 kernel/sched/stop_task.c |    6 ++-
 6 files changed, 85 insertions(+), 46 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index a5be381..37e990b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1117,7 +1117,8 @@ struct sched_class {
 
 	void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
 
-	struct task_struct * (*pick_next_task) (struct rq *rq);
+	struct task_struct * (*pick_next_task) (struct rq *rq, 
+						struct task_struct *prev);
 	void (*put_prev_task) (struct rq *rq, struct task_struct *p);
 
 #ifdef CONFIG_SMP
@@ -1210,6 +1211,7 @@ struct sched_entity {
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+	int			depth;
 	struct sched_entity	*parent;
 	/* rq on which this entity is (to be) queued: */
 	struct cfs_rq		*cfs_rq;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d7c4322..7628d25 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3120,7 +3120,7 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev)
  * Pick up the highest-prio task:
  */
 static inline struct task_struct *
-pick_next_task(struct rq *rq)
+pick_next_task(struct rq *rq, struct task_struct *prev)
 {
 	const struct sched_class *class;
 	struct task_struct *p;
@@ -3130,13 +3130,13 @@ pick_next_task(struct rq *rq)
 	 * the fair class we can call that function directly:
 	 */
 	if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
-		p = fair_sched_class.pick_next_task(rq);
+		p = fair_sched_class.pick_next_task(rq, prev);
 		if (likely(p))
 			return p;
 	}
 
 	for_each_class(class) {
-		p = class->pick_next_task(rq);
+		p = class->pick_next_task(rq, prev);
 		if (p)
 			return p;
 	}
@@ -3197,8 +3197,7 @@ static void __sched __schedule(void)
 	if (unlikely(!rq->nr_running))
 		idle_balance(cpu, rq);
 
-	put_prev_task(rq, prev);
-	next = pick_next_task(rq);
+	next = pick_next_task(rq, prev);
 	clear_tsk_need_resched(prev);
 	rq->skip_clock_update = 0;
 
@@ -5102,7 +5101,7 @@ static void migrate_tasks(unsigned int dead_cpu)
 		if (rq->nr_running == 1)
 			break;
 
-		next = pick_next_task(rq);
+		next = pick_next_task(rq, NULL);
 		BUG_ON(!next);
 		next->sched_class->put_prev_task(rq, next);
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4ab60a2..4ea6f45 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -294,13 +294,13 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
 
 /* Do the two (enqueued) entities belong to the same group ? */
-static inline int
+static inline struct cfs_rq *
 is_same_group(struct sched_entity *se, struct sched_entity *pse)
 {
 	if (se->cfs_rq == pse->cfs_rq)
-		return 1;
+		return se->cfs_rq;
 
-	return 0;
+	return NULL;
 }
 
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
@@ -308,17 +308,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
 	return se->parent;
 }
 
-/* return depth at which a sched entity is present in the hierarchy */
-static inline int depth_se(struct sched_entity *se)
-{
-	int depth = 0;
-
-	for_each_sched_entity(se)
-		depth++;
-
-	return depth;
-}
-
 static void
 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 {
@@ -332,8 +321,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 	 */
 
 	/* First walk up until both entities are at same depth */
-	se_depth = depth_se(*se);
-	pse_depth = depth_se(*pse);
+	se_depth = (*se)->depth;
+	pse_depth = (*pse)->depth;
 
 	while (se_depth > pse_depth) {
 		se_depth--;
@@ -398,10 +387,10 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
 		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
 
-static inline int
+static inline struct cfs_rq *
 is_same_group(struct sched_entity *se, struct sched_entity *pse)
 {
-	return 1;
+	return cfs_rq_of(se); /* always the same rq */
 }
 
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
@@ -1134,10 +1123,10 @@ static void __clear_buddies_last(struct sched_entity *se)
 {
 	for_each_sched_entity(se) {
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
-		if (cfs_rq->last == se)
-			cfs_rq->last = NULL;
-		else
+		if (cfs_rq->last != se)
 			break;
+
+		cfs_rq->last = NULL;
 	}
 }
 
@@ -1145,10 +1134,10 @@ static void __clear_buddies_next(struct sched_entity *se)
 {
 	for_each_sched_entity(se) {
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
-		if (cfs_rq->next == se)
-			cfs_rq->next = NULL;
-		else
+		if (cfs_rq->next != se)
 			break;
+
+		cfs_rq->next = NULL;
 	}
 }
 
@@ -1156,10 +1145,10 @@ static void __clear_buddies_skip(struct sched_entity *se)
 {
 	for_each_sched_entity(se) {
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
-		if (cfs_rq->skip == se)
-			cfs_rq->skip = NULL;
-		else
+		if (cfs_rq->skip != se)
 			break;
+
+		cfs_rq->skip = NULL;
 	}
 }
 
@@ -2991,22 +2980,51 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 		set_last_buddy(se);
 }
 
-static struct task_struct *pick_next_task_fair(struct rq *rq)
+static struct task_struct *
+pick_next_task_fair(struct rq *rq, struct task_struct *prev)
 {
 	struct task_struct *p;
 	struct cfs_rq *cfs_rq = &rq->cfs;
-	struct sched_entity *se;
+	struct sched_entity *se, *pse;
 
 	if (!cfs_rq->nr_running)
 		return NULL;
 
+	if (prev && prev->sched_class != &fair_sched_class) {
+		prev->sched_class->put_prev_task(rq, prev);
+		prev = NULL;
+	}
+
 	do {
 		se = pick_next_entity(cfs_rq);
-		set_next_entity(cfs_rq, se);
+		if (!prev)
+			set_next_entity(cfs_rq, se);
 		cfs_rq = group_cfs_rq(se);
 	} while (cfs_rq);
 
 	p = task_of(se);
+
+	if (prev) {
+		pse = &prev->se;
+
+		while (!(cfs_rq = is_same_group(se, pse))) {
+			int se_depth = se->depth;
+			int pse_depth = pse->depth;
+
+			if (se_depth <= pse_depth) {
+				put_prev_entity(cfs_rq_of(pse), pse);
+				pse = parent_entity(pse);
+			}
+			if (se_depth >= pse_depth) {
+				set_next_entity(cfs_rq_of(se), se);
+				se = parent_entity(se);
+			}
+		}
+
+		put_prev_entity(cfs_rq, pse);
+		set_next_entity(cfs_rq, se);
+	}
+
 	if (hrtick_enabled(rq))
 		hrtick_start_fair(rq, p);
 
@@ -5358,6 +5376,8 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static void task_move_group_fair(struct task_struct *p, int on_rq)
 {
+	struct sched_entity *se = &p->se;
+
 	/*
 	 * If the task was not on the rq at the time of this cgroup movement
 	 * it must have been asleep, sleeping tasks keep their ->vruntime
@@ -5383,14 +5403,15 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
 	 * To prevent boost or penalty in the new cfs_rq caused by delta
 	 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
 	 */
-	if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
+	if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING))
 		on_rq = 1;
 
 	if (!on_rq)
-		p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
+		se->vruntime -= cfs_rq_of(se)->min_vruntime;
 	set_task_rq(p, task_cpu(p));
+	se->depth = se->parent->depth + 1;
 	if (!on_rq)
-		p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
+		se->vruntime += cfs_rq_of(se)->min_vruntime;
 }
 
 void free_fair_sched_group(struct task_group *tg)
@@ -5488,10 +5509,13 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 	if (!se)
 		return;
 
-	if (!parent)
+	if (!parent) {
 		se->cfs_rq = &rq->cfs;
-	else
+		se->depth = 0;
+	} else {
 		se->cfs_rq = parent->my_q;
+		se->depth = parent->depth + 1;
+	}
 
 	se->my_q = cfs_rq;
 	update_load_set(&se->load, 0);
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 91b4c95..7f5d232 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -22,8 +22,12 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
 	resched_task(rq->idle);
 }
 
-static struct task_struct *pick_next_task_idle(struct rq *rq)
+static struct task_struct *
+pick_next_task_idle(struct rq *rq, struct task_struct *prev)
 {
+	if (prev)
+		prev->sched_class->put_prev_task(rq, prev);
+
 	schedstat_inc(rq, sched_goidle);
 	calc_load_account_idle(rq);
 	return rq->idle;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index f42ae7f..0f6c316 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1343,9 +1343,15 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
 	return p;
 }
 
-static struct task_struct *pick_next_task_rt(struct rq *rq)
+static struct task_struct *
+pick_next_task_rt(struct rq *rq, struct task_struct *prev)
 {
-	struct task_struct *p = _pick_next_task_rt(rq);
+	struct task_struct *p;
+
+	if (prev)
+		prev->sched_class->put_prev_task(rq, prev);
+       
+	p = _pick_next_task_rt(rq);
 
 	/* The running task is never eligible for pushing */
 	if (p)
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 7b386e8..441cc57 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -23,10 +23,14 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
 	/* we're never preempted */
 }
 
-static struct task_struct *pick_next_task_stop(struct rq *rq)
+static struct task_struct *
+pick_next_task_stop(struct rq *rq, struct task_struct *prev)
 {
 	struct task_struct *stop = rq->stop;
 
+	if (prev)
+		prev->sched_class->put_prev_task(rq, prev);
+
 	if (stop && stop->on_rq)
 		return stop;
 




* Re: [RFC][PATCH] sched: Optimize cgroup pick_next_task_fair
  2012-02-11  5:05 [RFC][PATCH] sched: Optimize cgroup pick_next_task_fair Peter Zijlstra
@ 2012-02-11  6:56 ` Mike Galbraith
  2012-02-11 15:02   ` Peter Zijlstra
  2012-02-16 23:20 ` Peter Zijlstra
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 14+ messages in thread
From: Mike Galbraith @ 2012-02-11  6:56 UTC (permalink / raw)
  To: Peter Zijlstra; +Cc: Paul Turner, Venkatesh Pallipadi, Ingo Molnar, LKML

On Sat, 2012-02-11 at 06:05 +0100, Peter Zijlstra wrote: 
> Since commit 2f36825b1 ("sched: Next buddy hint on sleep and preempt
> path") it is likely we pick a new task from the same cgroup; doing a put
> and then a set on all intermediate entities is a waste of time, so try to
> avoid this.

Good idea, we need to lose some weight.

> XXX check put_prev_task()'s update_rq_clock() magic..

I made that even more lovable ;-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d7c4322..7c1cfa6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3175,6 +3175,7 @@ need_resched:
 		} else {
 			deactivate_task(rq, prev, DEQUEUE_SLEEP);
 			prev->on_rq = 0;
+			rq->skip_clock_update = 1;
 
 			/*
 			 * If a worker went to sleep, notify and ask workqueue
@@ -3200,9 +3201,9 @@ need_resched:
 	put_prev_task(rq, prev);
 	next = pick_next_task(rq);
 	clear_tsk_need_resched(prev);
-	rq->skip_clock_update = 0;
 
 	if (likely(prev != next)) {
+		rq->skip_clock_update = 0;
 		rq->nr_switches++;
 		rq->curr = next;
 		++*switch_count;
@@ -3220,6 +3221,7 @@ need_resched:
 		raw_spin_unlock_irq(&rq->lock);
 
 	post_schedule(rq);
+	rq->skip_clock_update = 0;
 
 	preempt_enable_no_resched();
 	if (need_resched())

> Compile tested only.. inspired by pjt's fast switch stories.

I was looking forward to seeing those fast switch patches, nice spot to
cut lard.

> Not-quite-signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>

I'll see if I can give it some runtime.

-Mike



* Re: [RFC][PATCH] sched: Optimize cgroup pick_next_task_fair
  2012-02-11  6:56 ` Mike Galbraith
@ 2012-02-11 15:02   ` Peter Zijlstra
  0 siblings, 0 replies; 14+ messages in thread
From: Peter Zijlstra @ 2012-02-11 15:02 UTC (permalink / raw)
  To: Mike Galbraith; +Cc: Paul Turner, Venkatesh Pallipadi, Ingo Molnar, LKML

On Sat, 2012-02-11 at 07:56 +0100, Mike Galbraith wrote:
> On Sat, 2012-02-11 at 06:05 +0100, Peter Zijlstra wrote: 
> > Since commit 2f36825b1 ("sched: Next buddy hint on sleep and preempt
> > path") it is likely we pick a new task from the same cgroup; doing a put
> > and then a set on all intermediate entities is a waste of time, so try to
> > avoid this.
> 
> Good idea, we need to lose some weight. 

I actually set out to do something else, but then ended up here.. weird
how that works ;-)

My initial idea was to try and get rid of the pick_next_entity()
downward walk by keeping that data up-to-date when we dequeue/enqueue.

That all got a tad convoluted, I'll have to try it again. Anyway, that
ended up needing to rework the put/set stuff, and that part wasn't as
bad.
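
The idea, very roughly (purely hypothetical; cached_pick is a made-up
field, and this is not what any of the patches here do):

	/* on enqueue: the new entity may become the best pick */
	if (!cfs_rq->cached_pick ||
	    entity_before(se, cfs_rq->cached_pick))
		cfs_rq->cached_pick = se;

	/* on dequeue: recompute if we just removed the best pick */
	if (cfs_rq->cached_pick == se)
		cfs_rq->cached_pick = __pick_first_entity(cfs_rq);

pick_next_entity() would then mostly reduce to reading cfs_rq->cached_pick,
modulo the buddy and skip logic.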



* Re: [RFC][PATCH] sched: Optimize cgroup pick_next_task_fair
  2012-02-11  5:05 [RFC][PATCH] sched: Optimize cgroup pick_next_task_fair Peter Zijlstra
  2012-02-11  6:56 ` Mike Galbraith
@ 2012-02-16 23:20 ` Peter Zijlstra
  2012-02-16 23:28   ` Peter Zijlstra
  2014-02-11 12:16 ` [tip:sched/core] sched/fair: Track cgroup depth tip-bot for Peter Zijlstra
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 14+ messages in thread
From: Peter Zijlstra @ 2012-02-16 23:20 UTC (permalink / raw)
  To: Paul Turner; +Cc: Venkatesh Pallipadi, Ingo Molnar, Mike Galbraith, LKML

On Sat, 2012-02-11 at 06:05 +0100, Peter Zijlstra wrote:
> Since commit 2f36825b1 ("sched: Next buddy hint on sleep and preempt
> path") it is likely we pick a new task from the same cgroup; doing a put
> and then a set on all intermediate entities is a waste of time, so try to
> avoid this.
> 
> XXX check put_prev_task()'s update_rq_clock() magic..
> 
> Compile tested only.. inspired by pjt's fast switch stories.
> 
> Not-quite-signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> 

Here's one that actually boots..

---
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1117,7 +1117,8 @@ struct sched_class {
 
 	void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
 
-	struct task_struct * (*pick_next_task) (struct rq *rq);
+	struct task_struct * (*pick_next_task) (struct rq *rq,
+						struct task_struct *prev);
 	void (*put_prev_task) (struct rq *rq, struct task_struct *p);
 
 #ifdef CONFIG_SMP
@@ -1210,6 +1211,7 @@ struct sched_entity {
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+	int			depth;
 	struct sched_entity	*parent;
 	/* rq on which this entity is (to be) queued: */
 	struct cfs_rq		*cfs_rq;
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3119,7 +3119,7 @@ static void put_prev_task(struct rq *rq,
  * Pick up the highest-prio task:
  */
 static inline struct task_struct *
-pick_next_task(struct rq *rq)
+pick_next_task(struct rq *rq, struct task_struct *prev)
 {
 	const struct sched_class *class;
 	struct task_struct *p;
@@ -3129,13 +3129,13 @@ pick_next_task(struct rq *rq)
 	 * the fair class we can call that function directly:
 	 */
 	if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
-		p = fair_sched_class.pick_next_task(rq);
+		p = fair_sched_class.pick_next_task(rq, prev);
 		if (likely(p))
 			return p;
 	}
 
 	for_each_class(class) {
-		p = class->pick_next_task(rq);
+		p = class->pick_next_task(rq, prev);
 		if (p)
 			return p;
 	}
@@ -3196,8 +3196,9 @@ static void __sched __schedule(void)
 	if (unlikely(!rq->nr_running))
 		idle_balance(cpu, rq);
 
-	put_prev_task(rq, prev);
-	next = pick_next_task(rq);
+	if (prev->on_rq || rq->skip_clock_update < 0)
+		update_rq_clock(rq);
+	next = pick_next_task(rq, prev);
 	clear_tsk_need_resched(prev);
 	rq->skip_clock_update = 0;
 
@@ -5101,7 +5102,7 @@ static void migrate_tasks(unsigned int d
 		if (rq->nr_running == 1)
 			break;
 
-		next = pick_next_task(rq);
+		next = pick_next_task(rq, NULL);
 		BUG_ON(!next);
 		next->sched_class->put_prev_task(rq, next);
 
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -294,13 +294,13 @@ static inline void list_del_leaf_cfs_rq(
 	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
 
 /* Do the two (enqueued) entities belong to the same group ? */
-static inline int
+static inline struct cfs_rq *
 is_same_group(struct sched_entity *se, struct sched_entity *pse)
 {
 	if (se->cfs_rq == pse->cfs_rq)
-		return 1;
+		return se->cfs_rq;
 
-	return 0;
+	return NULL;
 }
 
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
@@ -308,17 +308,6 @@ static inline struct sched_entity *paren
 	return se->parent;
 }
 
-/* return depth at which a sched entity is present in the hierarchy */
-static inline int depth_se(struct sched_entity *se)
-{
-	int depth = 0;
-
-	for_each_sched_entity(se)
-		depth++;
-
-	return depth;
-}
-
 static void
 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 {
@@ -332,8 +321,8 @@ find_matching_se(struct sched_entity **s
 	 */
 
 	/* First walk up until both entities are at same depth */
-	se_depth = depth_se(*se);
-	pse_depth = depth_se(*pse);
+	se_depth = (*se)->depth;
+	pse_depth = (*pse)->depth;
 
 	while (se_depth > pse_depth) {
 		se_depth--;
@@ -398,10 +387,10 @@ static inline void list_del_leaf_cfs_rq(
 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
 		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
 
-static inline int
+static inline struct cfs_rq *
 is_same_group(struct sched_entity *se, struct sched_entity *pse)
 {
-	return 1;
+	return cfs_rq_of(se); /* always the same rq */
 }
 
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
@@ -1136,10 +1125,10 @@ static void __clear_buddies_last(struct 
 {
 	for_each_sched_entity(se) {
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
-		if (cfs_rq->last == se)
-			cfs_rq->last = NULL;
-		else
+		if (cfs_rq->last != se)
 			break;
+
+		cfs_rq->last = NULL;
 	}
 }
 
@@ -1147,10 +1136,10 @@ static void __clear_buddies_next(struct 
 {
 	for_each_sched_entity(se) {
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
-		if (cfs_rq->next == se)
-			cfs_rq->next = NULL;
-		else
+		if (cfs_rq->next != se)
 			break;
+
+		cfs_rq->next = NULL;
 	}
 }
 
@@ -1158,10 +1147,10 @@ static void __clear_buddies_skip(struct 
 {
 	for_each_sched_entity(se) {
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
-		if (cfs_rq->skip == se)
-			cfs_rq->skip = NULL;
-		else
+		if (cfs_rq->skip != se)
 			break;
+
+		cfs_rq->skip = NULL;
 	}
 }
 
@@ -2993,22 +2982,52 @@ static void check_preempt_wakeup(struct 
 		set_last_buddy(se);
 }
 
-static struct task_struct *pick_next_task_fair(struct rq *rq)
+static struct task_struct *
+pick_next_task_fair(struct rq *rq, struct task_struct *prev)
 {
 	struct task_struct *p;
 	struct cfs_rq *cfs_rq = &rq->cfs;
-	struct sched_entity *se;
+	struct sched_entity *se, *pse;
 
 	if (!cfs_rq->nr_running)
 		return NULL;
 
+	if (prev && (prev->sched_class != &fair_sched_class ||
+				cfs_rq->nr_running == 1)) {
+		prev->sched_class->put_prev_task(rq, prev);
+		prev = NULL;
+	}
+
 	do {
 		se = pick_next_entity(cfs_rq);
-		set_next_entity(cfs_rq, se);
+		if (!prev)
+			set_next_entity(cfs_rq, se);
 		cfs_rq = group_cfs_rq(se);
 	} while (cfs_rq);
 
 	p = task_of(se);
+
+	if (prev) {
+		pse = &prev->se;
+
+		while (!(cfs_rq = is_same_group(se, pse))) {
+			int se_depth = se->depth;
+			int pse_depth = pse->depth;
+
+			if (se_depth <= pse_depth) {
+				put_prev_entity(cfs_rq_of(pse), pse);
+				pse = parent_entity(pse);
+			}
+			if (se_depth >= pse_depth) {
+				set_next_entity(cfs_rq_of(se), se);
+				se = parent_entity(se);
+			}
+		}
+
+		put_prev_entity(cfs_rq, pse);
+		set_next_entity(cfs_rq, se);
+	}
+
 	if (hrtick_enabled(rq))
 		hrtick_start_fair(rq, p);
 
@@ -5360,6 +5379,8 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static void task_move_group_fair(struct task_struct *p, int on_rq)
 {
+	struct sched_entity *se = &p->se;
+
 	/*
 	 * If the task was not on the rq at the time of this cgroup movement
 	 * it must have been asleep, sleeping tasks keep their ->vruntime
@@ -5385,14 +5406,16 @@ static void task_move_group_fair(struct 
 	 * To prevent boost or penalty in the new cfs_rq caused by delta
 	 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
 	 */
-	if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
+	if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING))
 		on_rq = 1;
 
 	if (!on_rq)
-		p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
+		se->vruntime -= cfs_rq_of(se)->min_vruntime;
 	set_task_rq(p, task_cpu(p));
+	if (se->parent)
+		se->depth = se->parent->depth + 1;
 	if (!on_rq)
-		p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
+		se->vruntime += cfs_rq_of(se)->min_vruntime;
 }
 
 void free_fair_sched_group(struct task_group *tg)
@@ -5490,10 +5513,13 @@ void init_tg_cfs_entry(struct task_group
 	if (!se)
 		return;
 
-	if (!parent)
+	if (!parent) {
 		se->cfs_rq = &rq->cfs;
-	else
+		se->depth = 0;
+	} else {
 		se->cfs_rq = parent->my_q;
+		se->depth = parent->depth + 1;
+	}
 
 	se->my_q = cfs_rq;
 	update_load_set(&se->load, 0);
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -22,8 +22,12 @@ static void check_preempt_curr_idle(stru
 	resched_task(rq->idle);
 }
 
-static struct task_struct *pick_next_task_idle(struct rq *rq)
+static struct task_struct *
+pick_next_task_idle(struct rq *rq, struct task_struct *prev)
 {
+	if (prev)
+		prev->sched_class->put_prev_task(rq, prev);
+
 	schedstat_inc(rq, sched_goidle);
 	calc_load_account_idle(rq);
 	return rq->idle;
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1321,15 +1321,7 @@ static struct task_struct *_pick_next_ta
 {
 	struct sched_rt_entity *rt_se;
 	struct task_struct *p;
-	struct rt_rq *rt_rq;
-
-	rt_rq = &rq->rt;
-
-	if (!rt_rq->rt_nr_running)
-		return NULL;
-
-	if (rt_rq_throttled(rt_rq))
-		return NULL;
+	struct rt_rq *rt_rq  = &rq->rt;
 
 	do {
 		rt_se = pick_next_rt_entity(rq, rt_rq);
@@ -1343,9 +1335,22 @@ static struct task_struct *_pick_next_ta
 	return p;
 }
 
-static struct task_struct *pick_next_task_rt(struct rq *rq)
+static struct task_struct *
+pick_next_task_rt(struct rq *rq, struct task_struct *prev)
 {
-	struct task_struct *p = _pick_next_task_rt(rq);
+	struct task_struct *p;
+	struct rt_rq *rt_rq = &rq->rt;
+
+	if (!rt_rq->rt_nr_running)
+		return NULL;
+
+	if (rt_rq_throttled(rt_rq))
+		return NULL;
+
+	if (prev)
+		prev->sched_class->put_prev_task(rq, prev);
+
+	p = _pick_next_task_rt(rq);
 
 	/* The running task is never eligible for pushing */
 	if (p)
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -23,12 +23,17 @@ check_preempt_curr_stop(struct rq *rq, s
 	/* we're never preempted */
 }
 
-static struct task_struct *pick_next_task_stop(struct rq *rq)
+static struct task_struct *
+pick_next_task_stop(struct rq *rq, struct task_struct *prev)
 {
 	struct task_struct *stop = rq->stop;
 
-	if (stop && stop->on_rq)
+	if (stop && stop->on_rq) {
+		if (prev)
+			prev->sched_class->put_prev_task(rq, prev);
+
 		return stop;
+	}
 
 	return NULL;
 }



* Re: [RFC][PATCH] sched: Optimize cgroup pick_next_task_fair
  2012-02-16 23:20 ` Peter Zijlstra
@ 2012-02-16 23:28   ` Peter Zijlstra
  0 siblings, 0 replies; 14+ messages in thread
From: Peter Zijlstra @ 2012-02-16 23:28 UTC (permalink / raw)
  To: Paul Turner; +Cc: Venkatesh Pallipadi, Ingo Molnar, Mike Galbraith, LKML

On Fri, 2012-02-17 at 00:20 +0100, Peter Zijlstra wrote:
> +       if (prev && (prev->sched_class != &fair_sched_class ||
> +                               cfs_rq->nr_running == 1)) { 

this ==1 check is way ugly and a problem that needs addressing. Not
putting cfs_rq->curr back in the tree perturbs pick_next_entity()'s view
of things.

Probably easiest to pick leftmost and ->curr and go from there, but
that's for tomorrow or so.
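
For reference, that is roughly the shape the pick eventually takes (cf.
the pick_next_entity() change in the 2014 tip-bot commit further down the
thread); sketch:

	left = __pick_first_entity(cfs_rq);	/* leftmost queued entity */
	if (!left || (curr && entity_before(curr, left)))
		left = curr;			/* curr is not in the tree */

i.e. treat the not-re-enqueued cfs_rq->curr as one more candidate next to
the leftmost entity still in the tree.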


* [tip:sched/core] sched/fair: Track cgroup depth
  2012-02-11  5:05 [RFC][PATCH] sched: Optimize cgroup pick_next_task_fair Peter Zijlstra
  2012-02-11  6:56 ` Mike Galbraith
  2012-02-16 23:20 ` Peter Zijlstra
@ 2014-02-11 12:16 ` tip-bot for Peter Zijlstra
  2014-02-11 12:16 ` [tip:sched/core] sched: Push put_prev_task() into pick_next_task() tip-bot for Peter Zijlstra
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 14+ messages in thread
From: tip-bot for Peter Zijlstra @ 2014-02-11 12:16 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: linux-kernel, hpa, mingo, peterz, tglx

Commit-ID:  fed14d45f945042a15b09de48d7d3d58d9455fc4
Gitweb:     http://git.kernel.org/tip/fed14d45f945042a15b09de48d7d3d58d9455fc4
Author:     Peter Zijlstra <peterz@infradead.org>
AuthorDate: Sat, 11 Feb 2012 06:05:00 +0100
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Mon, 10 Feb 2014 16:17:10 +0100

sched/fair: Track cgroup depth

Track depth in the cgroup tree; this is useful for things like
find_matching_se(), where you need to get to a common parent of two
sched entities.

Keeping the depth avoids having to calculate it on the spot, which
saves a number of possible cache-misses.
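
To make the depth-based walk concrete, here is a minimal stand-alone
sketch of the same idea (struct node is made up; it assumes both nodes
hang off a single tree, as two sched entities on one rq do):

	struct node {
		struct node	*parent;	/* NULL for the root */
		int		depth;		/* root has depth 0 */
	};

	/* Walk both nodes up until they are children of the same parent;
	 * this mirrors find_matching_se() arriving at a common cfs_rq. */
	static void find_matching(struct node **a, struct node **b)
	{
		while ((*a)->depth > (*b)->depth)
			*a = (*a)->parent;

		while ((*b)->depth > (*a)->depth)
			*b = (*b)->parent;

		while ((*a)->parent != (*b)->parent) {
			*a = (*a)->parent;
			*b = (*b)->parent;
		}
	}

The stored depth replaces the old depth_se() loop, which had to walk all
the way to the root just to count levels before the real walk could start.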

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1328936700.2476.17.camel@laptop
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h |  1 +
 kernel/sched/fair.c   | 47 +++++++++++++++++++++--------------------------
 2 files changed, 22 insertions(+), 26 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index e3d5564..555e27d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1078,6 +1078,7 @@ struct sched_entity {
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+	int			depth;
 	struct sched_entity	*parent;
 	/* rq on which this entity is (to be) queued: */
 	struct cfs_rq		*cfs_rq;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 04fea77..748a7ac 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -322,13 +322,13 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
 
 /* Do the two (enqueued) entities belong to the same group ? */
-static inline int
+static inline struct cfs_rq *
 is_same_group(struct sched_entity *se, struct sched_entity *pse)
 {
 	if (se->cfs_rq == pse->cfs_rq)
-		return 1;
+		return se->cfs_rq;
 
-	return 0;
+	return NULL;
 }
 
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
@@ -336,17 +336,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
 	return se->parent;
 }
 
-/* return depth at which a sched entity is present in the hierarchy */
-static inline int depth_se(struct sched_entity *se)
-{
-	int depth = 0;
-
-	for_each_sched_entity(se)
-		depth++;
-
-	return depth;
-}
-
 static void
 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 {
@@ -360,8 +349,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 	 */
 
 	/* First walk up until both entities are at same depth */
-	se_depth = depth_se(*se);
-	pse_depth = depth_se(*pse);
+	se_depth = (*se)->depth;
+	pse_depth = (*pse)->depth;
 
 	while (se_depth > pse_depth) {
 		se_depth--;
@@ -426,10 +415,10 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
 		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
 
-static inline int
+static inline struct cfs_rq *
 is_same_group(struct sched_entity *se, struct sched_entity *pse)
 {
-	return 1;
+	return cfs_rq_of(se); /* always the same rq */
 }
 
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
@@ -7262,7 +7251,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static void task_move_group_fair(struct task_struct *p, int on_rq)
 {
+	struct sched_entity *se = &p->se;
 	struct cfs_rq *cfs_rq;
+
 	/*
 	 * If the task was not on the rq at the time of this cgroup movement
 	 * it must have been asleep, sleeping tasks keep their ->vruntime
@@ -7288,23 +7279,24 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
 	 * To prevent boost or penalty in the new cfs_rq caused by delta
 	 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
 	 */
-	if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
+	if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING))
 		on_rq = 1;
 
 	if (!on_rq)
-		p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
+		se->vruntime -= cfs_rq_of(se)->min_vruntime;
 	set_task_rq(p, task_cpu(p));
+	se->depth = se->parent ? se->parent->depth + 1 : 0;
 	if (!on_rq) {
-		cfs_rq = cfs_rq_of(&p->se);
-		p->se.vruntime += cfs_rq->min_vruntime;
+		cfs_rq = cfs_rq_of(se);
+		se->vruntime += cfs_rq->min_vruntime;
 #ifdef CONFIG_SMP
 		/*
 		 * migrate_task_rq_fair() will have removed our previous
 		 * contribution, but we must synchronize for ongoing future
 		 * decay.
 		 */
-		p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
-		cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
+		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
+		cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
 #endif
 	}
 }
@@ -7400,10 +7392,13 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 	if (!se)
 		return;
 
-	if (!parent)
+	if (!parent) {
 		se->cfs_rq = &rq->cfs;
-	else
+		se->depth = 0;
+	} else {
 		se->cfs_rq = parent->my_q;
+		se->depth = parent->depth + 1;
+	}
 
 	se->my_q = cfs_rq;
 	/* guarantee group entities always have weight */


* [tip:sched/core] sched: Push put_prev_task() into pick_next_task()
  2012-02-11  5:05 [RFC][PATCH] sched: Optimize cgroup pick_next_task_fair Peter Zijlstra
                   ` (2 preceding siblings ...)
  2014-02-11 12:16 ` [tip:sched/core] sched/fair: Track cgroup depth tip-bot for Peter Zijlstra
@ 2014-02-11 12:16 ` tip-bot for Peter Zijlstra
  2014-02-12  7:00   ` Kirill Tkhai
  2014-02-11 12:16 ` [tip:sched/core] sched/fair: Clean up the __clear_buddies_*() functions tip-bot for Peter Zijlstra
  2014-02-11 12:16 ` [tip:sched/core] sched/fair: Optimize cgroup pick_next_task_fair() tip-bot for Peter Zijlstra
  5 siblings, 1 reply; 14+ messages in thread
From: tip-bot for Peter Zijlstra @ 2014-02-11 12:16 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: linux-kernel, hpa, mingo, peterz, tglx

Commit-ID:  606dba2e289446600a0b68422ed2019af5355c12
Gitweb:     http://git.kernel.org/tip/606dba2e289446600a0b68422ed2019af5355c12
Author:     Peter Zijlstra <peterz@infradead.org>
AuthorDate: Sat, 11 Feb 2012 06:05:00 +0100
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Mon, 10 Feb 2014 16:17:13 +0100

sched: Push put_prev_task() into pick_next_task()

In order to avoid having to do put/set on a whole cgroup hierarchy
when we context switch, push the put into pick_next_task() so that
both operations are in the same function. Further changes then allow
us to possibly optimize away redundant work.
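
Concretely, each class's pick_next_task() now starts with something like
(sketch; see the per-class diffs below):

	if (prev)
		prev->sched_class->put_prev_task(rq, prev);

Note the indirection through prev->sched_class: prev may belong to a
different class than the one doing the picking.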

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1328936700.2476.17.camel@laptop
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/core.c      | 21 ++++++++-------------
 kernel/sched/deadline.c  |  5 ++++-
 kernel/sched/fair.c      |  6 +++++-
 kernel/sched/idle_task.c |  6 +++++-
 kernel/sched/rt.c        | 27 ++++++++++++++++-----------
 kernel/sched/sched.h     |  8 +++++++-
 kernel/sched/stop_task.c | 16 ++++++++++------
 7 files changed, 55 insertions(+), 34 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 417cf65..dedb5f0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2579,18 +2579,11 @@ static inline void schedule_debug(struct task_struct *prev)
 	schedstat_inc(this_rq(), sched_count);
 }
 
-static void put_prev_task(struct rq *rq, struct task_struct *prev)
-{
-	if (prev->on_rq || rq->skip_clock_update < 0)
-		update_rq_clock(rq);
-	prev->sched_class->put_prev_task(rq, prev);
-}
-
 /*
  * Pick up the highest-prio task:
  */
 static inline struct task_struct *
-pick_next_task(struct rq *rq)
+pick_next_task(struct rq *rq, struct task_struct *prev)
 {
 	const struct sched_class *class;
 	struct task_struct *p;
@@ -2600,13 +2593,13 @@ pick_next_task(struct rq *rq)
 	 * the fair class we can call that function directly:
 	 */
 	if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
-		p = fair_sched_class.pick_next_task(rq);
+		p = fair_sched_class.pick_next_task(rq, prev);
 		if (likely(p))
 			return p;
 	}
 
 	for_each_class(class) {
-		p = class->pick_next_task(rq);
+		p = class->pick_next_task(rq, prev);
 		if (p)
 			return p;
 	}
@@ -2714,8 +2707,10 @@ need_resched:
 			rq->idle_stamp = 0;
 	}
 
-	put_prev_task(rq, prev);
-	next = pick_next_task(rq);
+	if (prev->on_rq || rq->skip_clock_update < 0)
+		update_rq_clock(rq);
+
+	next = pick_next_task(rq, prev);
 	clear_tsk_need_resched(prev);
 	clear_preempt_need_resched();
 	rq->skip_clock_update = 0;
@@ -4748,7 +4743,7 @@ static void migrate_tasks(unsigned int dead_cpu)
 		if (rq->nr_running == 1)
 			break;
 
-		next = pick_next_task(rq);
+		next = pick_next_task(rq, NULL);
 		BUG_ON(!next);
 		next->sched_class->put_prev_task(rq, next);
 
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index b5700bc..50797d5 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -990,7 +990,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
 	return rb_entry(left, struct sched_dl_entity, rb_node);
 }
 
-struct task_struct *pick_next_task_dl(struct rq *rq)
+struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
 {
 	struct sched_dl_entity *dl_se;
 	struct task_struct *p;
@@ -1001,6 +1001,9 @@ struct task_struct *pick_next_task_dl(struct rq *rq)
 	if (unlikely(!dl_rq->dl_nr_running))
 		return NULL;
 
+	if (prev)
+		prev->sched_class->put_prev_task(rq, prev);
+
 	dl_se = pick_next_dl_entity(rq, dl_rq);
 	BUG_ON(!dl_se);
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 748a7ac..c4bb0ac 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4655,7 +4655,8 @@ preempt:
 		set_last_buddy(se);
 }
 
-static struct task_struct *pick_next_task_fair(struct rq *rq)
+static struct task_struct *
+pick_next_task_fair(struct rq *rq, struct task_struct *prev)
 {
 	struct task_struct *p;
 	struct cfs_rq *cfs_rq = &rq->cfs;
@@ -4664,6 +4665,9 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
 	if (!cfs_rq->nr_running)
 		return NULL;
 
+	if (prev)
+		prev->sched_class->put_prev_task(rq, prev);
+
 	do {
 		se = pick_next_entity(cfs_rq);
 		set_next_entity(cfs_rq, se);
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 516c3d9..e5c922a 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -33,8 +33,12 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
 	resched_task(rq->idle);
 }
 
-static struct task_struct *pick_next_task_idle(struct rq *rq)
+static struct task_struct *
+pick_next_task_idle(struct rq *rq, struct task_struct *prev)
 {
+	if (prev)
+		prev->sched_class->put_prev_task(rq, prev);
+
 	schedstat_inc(rq, sched_goidle);
 #ifdef CONFIG_SMP
 	/* Trigger the post schedule to do an idle_enter for CFS */
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index a2740b7..a15ca1c 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1310,15 +1310,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
 {
 	struct sched_rt_entity *rt_se;
 	struct task_struct *p;
-	struct rt_rq *rt_rq;
-
-	rt_rq = &rq->rt;
-
-	if (!rt_rq->rt_nr_running)
-		return NULL;
-
-	if (rt_rq_throttled(rt_rq))
-		return NULL;
+	struct rt_rq *rt_rq  = &rq->rt;
 
 	do {
 		rt_se = pick_next_rt_entity(rq, rt_rq);
@@ -1332,9 +1324,22 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
 	return p;
 }
 
-static struct task_struct *pick_next_task_rt(struct rq *rq)
+static struct task_struct *
+pick_next_task_rt(struct rq *rq, struct task_struct *prev)
 {
-	struct task_struct *p = _pick_next_task_rt(rq);
+	struct task_struct *p;
+	struct rt_rq *rt_rq = &rq->rt;
+
+	if (!rt_rq->rt_nr_running)
+		return NULL;
+
+	if (rt_rq_throttled(rt_rq))
+		return NULL;
+
+	if (prev)
+		prev->sched_class->put_prev_task(rq, prev);
+
+	p = _pick_next_task_rt(rq);
 
 	/* The running task is never eligible for pushing */
 	if (p)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index bb89991..c534cf4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1105,7 +1105,13 @@ struct sched_class {
 
 	void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
 
-	struct task_struct * (*pick_next_task) (struct rq *rq);
+	/*
+	 * It is the responsibility of the pick_next_task() method that will
+	 * return the next task to call put_prev_task() on the @prev task or
+	 * something equivalent.
+	 */
+	struct task_struct * (*pick_next_task) (struct rq *rq,
+						struct task_struct *prev);
 	void (*put_prev_task) (struct rq *rq, struct task_struct *p);
 
 #ifdef CONFIG_SMP
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index fdb6bb0..a4147c9 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -23,16 +23,20 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
 	/* we're never preempted */
 }
 
-static struct task_struct *pick_next_task_stop(struct rq *rq)
+static struct task_struct *
+pick_next_task_stop(struct rq *rq, struct task_struct *prev)
 {
 	struct task_struct *stop = rq->stop;
 
-	if (stop && stop->on_rq) {
-		stop->se.exec_start = rq_clock_task(rq);
-		return stop;
-	}
+	if (!stop || !stop->on_rq)
+		return NULL;
 
-	return NULL;
+	if (prev)
+		prev->sched_class->put_prev_task(rq, prev);
+
+	stop->se.exec_start = rq_clock_task(rq);
+
+	return stop;
 }
 
 static void


* [tip:sched/core] sched/fair: Clean up the __clear_buddies_*() functions
  2012-02-11  5:05 [RFC][PATCH] sched: Optimize cgroup pick_next_task_fair Peter Zijlstra
                   ` (3 preceding siblings ...)
  2014-02-11 12:16 ` [tip:sched/core] sched: Push put_prev_task() into pick_next_task() tip-bot for Peter Zijlstra
@ 2014-02-11 12:16 ` tip-bot for Peter Zijlstra
  2014-02-11 12:16 ` [tip:sched/core] sched/fair: Optimize cgroup pick_next_task_fair() tip-bot for Peter Zijlstra
  5 siblings, 0 replies; 14+ messages in thread
From: tip-bot for Peter Zijlstra @ 2014-02-11 12:16 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: linux-kernel, hpa, mingo, peterz, tglx

Commit-ID:  f10447998a59b97747c16258a9c6e6a1512f27f3
Gitweb:     http://git.kernel.org/tip/f10447998a59b97747c16258a9c6e6a1512f27f3
Author:     Peter Zijlstra <peterz@infradead.org>
AuthorDate: Sat, 11 Feb 2012 06:05:00 +0100
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Mon, 10 Feb 2014 16:17:16 +0100

sched/fair: Clean up the __clear_buddies_*() functions

Slightly easier code flow, no functional changes.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1328936700.2476.17.camel@laptop
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c4bb0ac..8461721 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2739,10 +2739,10 @@ static void __clear_buddies_last(struct sched_entity *se)
 {
 	for_each_sched_entity(se) {
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
-		if (cfs_rq->last == se)
-			cfs_rq->last = NULL;
-		else
+		if (cfs_rq->last != se)
 			break;
+
+		cfs_rq->last = NULL;
 	}
 }
 
@@ -2750,10 +2750,10 @@ static void __clear_buddies_next(struct sched_entity *se)
 {
 	for_each_sched_entity(se) {
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
-		if (cfs_rq->next == se)
-			cfs_rq->next = NULL;
-		else
+		if (cfs_rq->next != se)
 			break;
+
+		cfs_rq->next = NULL;
 	}
 }
 
@@ -2761,10 +2761,10 @@ static void __clear_buddies_skip(struct sched_entity *se)
 {
 	for_each_sched_entity(se) {
 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
-		if (cfs_rq->skip == se)
-			cfs_rq->skip = NULL;
-		else
+		if (cfs_rq->skip != se)
 			break;
+
+		cfs_rq->skip = NULL;
 	}
 }
 


* [tip:sched/core] sched/fair: Optimize cgroup pick_next_task_fair()
  2012-02-11  5:05 [RFC][PATCH] sched: Optimize cgroup pick_next_task_fair Peter Zijlstra
                   ` (4 preceding siblings ...)
  2014-02-11 12:16 ` [tip:sched/core] sched/fair: Clean up the __clear_buddies_*() functions tip-bot for Peter Zijlstra
@ 2014-02-11 12:16 ` tip-bot for Peter Zijlstra
  5 siblings, 0 replies; 14+ messages in thread
From: tip-bot for Peter Zijlstra @ 2014-02-11 12:16 UTC (permalink / raw)
  To: linux-tip-commits
  Cc: linux-kernel, hpa, mingo, torvalds, peterz, akpm, tj, tglx

Commit-ID:  678d5718d8d099421b0dd54c01b0528f4aaf5919
Gitweb:     http://git.kernel.org/tip/678d5718d8d099421b0dd54c01b0528f4aaf5919
Author:     Peter Zijlstra <peterz@infradead.org>
AuthorDate: Sat, 11 Feb 2012 06:05:00 +0100
Committer:  Ingo Molnar <mingo@kernel.org>
CommitDate: Mon, 10 Feb 2014 16:17:19 +0100

sched/fair: Optimize cgroup pick_next_task_fair()

Since commit 2f36825b1 ("sched: Next buddy hint on sleep and preempt
path") it is likely we pick a new task from the same cgroup; doing a put
and then a set on all intermediate entities is a waste of time, so try to
avoid this.

Measured using:

  mount nodev /cgroup -t cgroup -o cpu
  cd /cgroup
  mkdir a; cd a
  mkdir b; cd b
  mkdir c; cd c
  echo $$ > tasks
  perf stat --repeat 10 -- taskset 1 perf bench sched pipe

PRE :      4.542422684 seconds time elapsed   ( +-  0.33% )
POST:      4.389409991 seconds time elapsed   ( +-  0.32% )

Which shows a significant improvement of ~3.5%

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tejun Heo <tj@kernel.org>
Link: http://lkml.kernel.org/r/1328936700.2476.17.camel@laptop
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c | 122 ++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 110 insertions(+), 12 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8461721..a81b241 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2907,17 +2907,36 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
  * 3) pick the "last" process, for cache locality
  * 4) do not run the "skip" process, if something else is available
  */
-static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
+static struct sched_entity *
+pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 {
-	struct sched_entity *se = __pick_first_entity(cfs_rq);
-	struct sched_entity *left = se;
+	struct sched_entity *left = __pick_first_entity(cfs_rq);
+	struct sched_entity *se;
+
+	/*
+	 * If curr is set we have to see if it's left of the leftmost entity
+	 * still in the tree, provided there was anything in the tree at all.
+	 */
+	if (!left || (curr && entity_before(curr, left)))
+		left = curr;
+
+	se = left; /* ideally we run the leftmost entity */
 
 	/*
 	 * Avoid running the skip buddy, if running something else can
 	 * be done without getting too unfair.
 	 */
 	if (cfs_rq->skip == se) {
-		struct sched_entity *second = __pick_next_entity(se);
+		struct sched_entity *second;
+
+		if (se == curr) {
+			second = __pick_first_entity(cfs_rq);
+		} else {
+			second = __pick_next_entity(se);
+			if (!second || (curr && entity_before(curr, second)))
+				second = curr;
+		}
+
 		if (second && wakeup_preempt_entity(second, left) < 1)
 			se = second;
 	}
@@ -2939,7 +2958,7 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
 	return se;
 }
 
-static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
 
 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 {
@@ -3594,22 +3613,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
 }
 
 /* conditionally throttle active cfs_rq's from put_prev_entity() */
-static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
 	if (!cfs_bandwidth_used())
-		return;
+		return false;
 
 	if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
-		return;
+		return false;
 
 	/*
 	 * it's possible for a throttled entity to be forced into a running
 	 * state (e.g. set_curr_task), in this case we're finished.
 	 */
 	if (cfs_rq_throttled(cfs_rq))
-		return;
+		return true;
 
 	throttle_cfs_rq(cfs_rq);
+	return true;
 }
 
 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
@@ -3719,7 +3739,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
 }
 
 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
-static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 
@@ -4658,9 +4678,86 @@ preempt:
 static struct task_struct *
 pick_next_task_fair(struct rq *rq, struct task_struct *prev)
 {
-	struct task_struct *p;
 	struct cfs_rq *cfs_rq = &rq->cfs;
 	struct sched_entity *se;
+	struct task_struct *p;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	if (!cfs_rq->nr_running)
+		return NULL;
+
+	if (!prev || prev->sched_class != &fair_sched_class)
+		goto simple;
+
+	/*
+	 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
+	 * likely that a next task is from the same cgroup as the current.
+	 *
+	 * Therefore attempt to avoid putting and setting the entire cgroup
+	 * hierarchy, only change the part that actually changes.
+	 */
+
+	do {
+		struct sched_entity *curr = cfs_rq->curr;
+
+		/*
+		 * Since we got here without doing put_prev_entity() we also
+		 * have to consider cfs_rq->curr. If it is still a runnable
+		 * entity, update_curr() will update its vruntime, otherwise
+		 * forget we've ever seen it.
+		 */
+		if (curr && curr->on_rq)
+			update_curr(cfs_rq);
+		else
+			curr = NULL;
+
+		/*
+		 * This call to check_cfs_rq_runtime() will do the throttle and
+		 * dequeue its entity in the parent(s). Therefore the 'simple'
+		 * nr_running test will indeed be correct.
+		 */
+		if (unlikely(check_cfs_rq_runtime(cfs_rq)))
+			goto simple;
+
+		se = pick_next_entity(cfs_rq, curr);
+		cfs_rq = group_cfs_rq(se);
+	} while (cfs_rq);
+
+	p = task_of(se);
+
+	/*
+	 * Since we haven't yet done put_prev_entity and if the selected task
+	 * is a different task than we started out with, try and touch the
+	 * least amount of cfs_rqs.
+	 */
+	if (prev != p) {
+		struct sched_entity *pse = &prev->se;
+
+		while (!(cfs_rq = is_same_group(se, pse))) {
+			int se_depth = se->depth;
+			int pse_depth = pse->depth;
+
+			if (se_depth <= pse_depth) {
+				put_prev_entity(cfs_rq_of(pse), pse);
+				pse = parent_entity(pse);
+			}
+			if (se_depth >= pse_depth) {
+				set_next_entity(cfs_rq_of(se), se);
+				se = parent_entity(se);
+			}
+		}
+
+		put_prev_entity(cfs_rq, pse);
+		set_next_entity(cfs_rq, se);
+	}
+
+	if (hrtick_enabled(rq))
+		hrtick_start_fair(rq, p);
+
+	return p;
+simple:
+	cfs_rq = &rq->cfs;
+#endif
 
 	if (!cfs_rq->nr_running)
 		return NULL;
@@ -4669,12 +4766,13 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev)
 		prev->sched_class->put_prev_task(rq, prev);
 
 	do {
-		se = pick_next_entity(cfs_rq);
+		se = pick_next_entity(cfs_rq, NULL);
 		set_next_entity(cfs_rq, se);
 		cfs_rq = group_cfs_rq(se);
 	} while (cfs_rq);
 
 	p = task_of(se);
+
 	if (hrtick_enabled(rq))
 		hrtick_start_fair(rq, p);
 


* Re: [tip:sched/core] sched: Push put_prev_task() into pick_next_task()
  2014-02-11 12:16 ` [tip:sched/core] sched: Push put_prev_task() into pick_next_task() tip-bot for Peter Zijlstra
@ 2014-02-12  7:00   ` Kirill Tkhai
  2014-02-12 11:43     ` Peter Zijlstra
  2014-02-12 14:06     ` Peter Zijlstra
  0 siblings, 2 replies; 14+ messages in thread
From: Kirill Tkhai @ 2014-02-12  7:00 UTC (permalink / raw)
  To: mingo, hpa, linux-kernel, peterz, tglx, linux-tip-commits



11.02.2014, 16:17, "tip-bot for Peter Zijlstra" <tipbot@zytor.com>:
> Commit-ID:  606dba2e289446600a0b68422ed2019af5355c12
> Gitweb:     http://git.kernel.org/tip/606dba2e289446600a0b68422ed2019af5355c12
> Author:     Peter Zijlstra <peterz@infradead.org>
> AuthorDate: Sat, 11 Feb 2012 06:05:00 +0100
> Committer:  Ingo Molnar <mingo@kernel.org>
> CommitDate: Mon, 10 Feb 2014 16:17:13 +0100
>
> sched: Push put_prev_task() into pick_next_task()
>
> In order to avoid having to do put/set on a whole cgroup hierarchy
> when we context switch, push the put into pick_next_task() so that
> both operations are in the same function. Further changes then allow
> us to possibly optimize away redundant work.
>
> Signed-off-by: Peter Zijlstra <peterz@infradead.org>
> Link: http://lkml.kernel.org/r/1328936700.2476.17.camel@laptop
> Signed-off-by: Ingo Molnar <mingo@kernel.org>
> ---
>  kernel/sched/core.c      | 21 ++++++++-------------
>  kernel/sched/deadline.c  |  5 ++++-
>  kernel/sched/fair.c      |  6 +++++-
>  kernel/sched/idle_task.c |  6 +++++-
>  kernel/sched/rt.c        | 27 ++++++++++++++++-----------
>  kernel/sched/sched.h     |  8 +++++++-
>  kernel/sched/stop_task.c | 16 ++++++++++------
>  7 files changed, 55 insertions(+), 34 deletions(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 417cf65..dedb5f0 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2579,18 +2579,11 @@ static inline void schedule_debug(struct task_struct *prev)
>          schedstat_inc(this_rq(), sched_count);
>  }
>
> -static void put_prev_task(struct rq *rq, struct task_struct *prev)
> -{
> - if (prev->on_rq || rq->skip_clock_update < 0)
> - update_rq_clock(rq);
> - prev->sched_class->put_prev_task(rq, prev);
> -}
> -
>  /*
>   * Pick up the highest-prio task:
>   */
>  static inline struct task_struct *
> -pick_next_task(struct rq *rq)
> +pick_next_task(struct rq *rq, struct task_struct *prev)
>  {
>          const struct sched_class *class;
>          struct task_struct *p;
> @@ -2600,13 +2593,13 @@ pick_next_task(struct rq *rq)
>           * the fair class we can call that function directly:
>           */
>          if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
> - p = fair_sched_class.pick_next_task(rq);
> + p = fair_sched_class.pick_next_task(rq, prev);
>                  if (likely(p))
>                          return p;
>          }
>
>          for_each_class(class) {
> - p = class->pick_next_task(rq);
> + p = class->pick_next_task(rq, prev);
>                  if (p)
>                          return p;
>          }
> @@ -2714,8 +2707,10 @@ need_resched:
>                          rq->idle_stamp = 0;
>          }
>
> - put_prev_task(rq, prev);
> - next = pick_next_task(rq);
> + if (prev->on_rq || rq->skip_clock_update < 0)
> + update_rq_clock(rq);
> +
> + next = pick_next_task(rq, prev);
>          clear_tsk_need_resched(prev);
>          clear_preempt_need_resched();
>          rq->skip_clock_update = 0;
> @@ -4748,7 +4743,7 @@ static void migrate_tasks(unsigned int dead_cpu)
>                  if (rq->nr_running == 1)
>                          break;
>
> - next = pick_next_task(rq);
> + next = pick_next_task(rq, NULL);

pick_next_task() first checks prev->sched_class, doesn't it?

The same for pick_next_task_rt().

>                  BUG_ON(!next);
>                  next->sched_class->put_prev_task(rq, next);
>
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index b5700bc..50797d5 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -990,7 +990,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
>          return rb_entry(left, struct sched_dl_entity, rb_node);
>  }
>
> -struct task_struct *pick_next_task_dl(struct rq *rq)
> +struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
>  {
>          struct sched_dl_entity *dl_se;
>          struct task_struct *p;
> @@ -1001,6 +1001,9 @@ struct task_struct *pick_next_task_dl(struct rq *rq)
>          if (unlikely(!dl_rq->dl_nr_running))
>                  return NULL;
>
> + if (prev)
> + prev->sched_class->put_prev_task(rq, prev);
> +
>          dl_se = pick_next_dl_entity(rq, dl_rq);
>          BUG_ON(!dl_se);
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 748a7ac..c4bb0ac 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4655,7 +4655,8 @@ preempt:
>                  set_last_buddy(se);
>  }
>
> -static struct task_struct *pick_next_task_fair(struct rq *rq)
> +static struct task_struct *
> +pick_next_task_fair(struct rq *rq, struct task_struct *prev)
>  {
>          struct task_struct *p;
>          struct cfs_rq *cfs_rq = &rq->cfs;
> @@ -4664,6 +4665,9 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
>          if (!cfs_rq->nr_running)
>                  return NULL;
>
> + if (prev)
> + prev->sched_class->put_prev_task(rq, prev);
> +
>          do {
>                  se = pick_next_entity(cfs_rq);
>                  set_next_entity(cfs_rq, se);
> diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
> index 516c3d9..e5c922a 100644
> --- a/kernel/sched/idle_task.c
> +++ b/kernel/sched/idle_task.c
> @@ -33,8 +33,12 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
>          resched_task(rq->idle);
>  }
>
> -static struct task_struct *pick_next_task_idle(struct rq *rq)
> +static struct task_struct *
> +pick_next_task_idle(struct rq *rq, struct task_struct *prev)
>  {
> + if (prev)
> + prev->sched_class->put_prev_task(rq, prev);
> +
>          schedstat_inc(rq, sched_goidle);
>  #ifdef CONFIG_SMP
>          /* Trigger the post schedule to do an idle_enter for CFS */
> diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
> index a2740b7..a15ca1c 100644
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -1310,15 +1310,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
>  {
>          struct sched_rt_entity *rt_se;
>          struct task_struct *p;
> - struct rt_rq *rt_rq;
> -
> - rt_rq = &rq->rt;
> -
> - if (!rt_rq->rt_nr_running)
> - return NULL;
> -
> - if (rt_rq_throttled(rt_rq))
> - return NULL;
> + struct rt_rq *rt_rq  = &rq->rt;
>
>          do {
>                  rt_se = pick_next_rt_entity(rq, rt_rq);
> @@ -1332,9 +1324,22 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
>          return p;
>  }
>
> -static struct task_struct *pick_next_task_rt(struct rq *rq)
> +static struct task_struct *
> +pick_next_task_rt(struct rq *rq, struct task_struct *prev)
>  {
> - struct task_struct *p = _pick_next_task_rt(rq);
> + struct task_struct *p;
> + struct rt_rq *rt_rq = &rq->rt;
> +
> + if (!rt_rq->rt_nr_running)
> + return NULL;
> +
> + if (rt_rq_throttled(rt_rq))
> + return NULL;
> +
> + if (prev)
> + prev->sched_class->put_prev_task(rq, prev);
> +
> + p = _pick_next_task_rt(rq);
>
>          /* The running task is never eligible for pushing */
>          if (p)
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index bb89991..c534cf4 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1105,7 +1105,13 @@ struct sched_class {
>
>          void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
>
> -        struct task_struct * (*pick_next_task) (struct rq *rq);
> +        /*
> +         * It is the responsibility of the pick_next_task() method that
> +         * returns the next task to call put_prev_task() on the @prev
> +         * task, or something equivalent.
> +         */
> +        struct task_struct * (*pick_next_task) (struct rq *rq,
> +                                                struct task_struct *prev);
>          void (*put_prev_task) (struct rq *rq, struct task_struct *p);
>
>  #ifdef CONFIG_SMP
> diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
> index fdb6bb0..a4147c9 100644
> --- a/kernel/sched/stop_task.c
> +++ b/kernel/sched/stop_task.c
> @@ -23,16 +23,20 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
>          /* we're never preempted */
>  }
>
> -static struct task_struct *pick_next_task_stop(struct rq *rq)
> +static struct task_struct *
> +pick_next_task_stop(struct rq *rq, struct task_struct *prev)
>  {
>          struct task_struct *stop = rq->stop;
>
> -        if (stop && stop->on_rq) {
> -                stop->se.exec_start = rq_clock_task(rq);
> -                return stop;
> -        }
> +        if (!stop || !stop->on_rq)
> +                return NULL;
>
> -        return NULL;
> +        if (prev)
> +                prev->sched_class->put_prev_task(rq, prev);
> +
> +        stop->se.exec_start = rq_clock_task(rq);
> +
> +        return stop;
>  }
>
>  static void
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/


* Re: [tip:sched/core] sched: Push put_prev_task() into pick_next_task()
  2014-02-12  7:00   ` Kirill Tkhai
@ 2014-02-12 11:43     ` Peter Zijlstra
  2014-02-12 14:06     ` Peter Zijlstra
  1 sibling, 0 replies; 14+ messages in thread
From: Peter Zijlstra @ 2014-02-12 11:43 UTC (permalink / raw)
  To: Kirill Tkhai; +Cc: mingo, hpa, linux-kernel, tglx, linux-tip-commits

On Wed, Feb 12, 2014 at 11:00:53AM +0400, Kirill Tkhai wrote:
> > @@ -4748,7 +4743,7 @@ static void migrate_tasks(unsigned int dead_cpu)
> >                  if (rq->nr_running == 1)
> >                          break;
> >
> > -                next = pick_next_task(rq);
> > +                next = pick_next_task(rq, NULL);
> 
> pick_next_task() dereferences prev->sched_class first, doesn't it?
> 
> The same goes for pick_next_task_rt().

Right you are... and pick_next_task_dl(). Smatch caught the _{rt,dl}
ones; it failed to spot this one, though.
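
The class of warning in question, as a minimal illustration (hypothetical
code, not the actual kernel lines Smatch pointed at):

	int prio_of(struct task_struct *prev)
	{
		int prio = prev->prio;	/* dereference first ... */

		if (!prev)		/* ... NULL check afterwards: too late */
			return -1;

		return prio;
	}

If prev can really be NULL, the dereference has already crashed before
the check runs; if it cannot, the check is dead code. Either way one of
the two has to go.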




* Re: [tip:sched/core] sched: Push put_prev_task() into pick_next_task()
  2014-02-12  7:00   ` Kirill Tkhai
  2014-02-12 11:43     ` Peter Zijlstra
@ 2014-02-12 14:06     ` Peter Zijlstra
  2014-02-12 14:24       ` Kirill Tkhai
  1 sibling, 1 reply; 14+ messages in thread
From: Peter Zijlstra @ 2014-02-12 14:06 UTC (permalink / raw)
  To: Kirill Tkhai
  Cc: mingo, hpa, linux-kernel, tglx, linux-tip-commits,
	Steven Rostedt, Juri Lelli, Dan Carpenter

On Wed, Feb 12, 2014 at 11:00:53AM +0400, Kirill Tkhai wrote:
> > @@ -4748,7 +4743,7 @@ static void migrate_tasks(unsigned int dead_cpu)
> >                  if (rq->nr_running == 1)
> >                          break;
> >
> > -                next = pick_next_task(rq);
> > +                next = pick_next_task(rq, NULL);
> 
> pick_next_task() dereferences prev->sched_class first, doesn't it?
> 
> The same goes for pick_next_task_rt().


OK, how about something like this?

---
Subject: sched: Fix hotplug task migration
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 12 Feb 2014 10:49:30 +0100

Dan Carpenter reported:

> kernel/sched/rt.c:1347 pick_next_task_rt() warn: variable dereferenced before check 'prev' (see line 1338)
> kernel/sched/deadline.c:1011 pick_next_task_dl() warn: variable dereferenced before check 'prev' (see line 1005)

Kirill also spotted that migrate_tasks() will have an instant NULL
deref because pick_next_task() will immediately deref prev.

Instead of fixing all the corner cases that arise because
migrate_tasks() can pass in a NULL prev task in the unlikely case of
hot-unplug, provide a fake task such that we can remove all the NULL
checks from the far more common paths.

A further problem, not previously spotted, is that because we pushed
pre_schedule() and idle_balance() into pick_next_task() we now need to
avoid those getting called and pulling more tasks onto our dying CPU.

We avoid pull_{dl,rt}_task() by setting fake_task.prio to MAX_PRIO+1.
We also note that, since we call pick_next_task() exactly as many times
as there are runnable tasks present, we should never land in
idle_balance().

Fixes: 38033c37faab ("sched: Push down pre_schedule() and idle_balance()")
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Juri Lelli <juri.lelli@gmail.com>
Cc: Ingo Molnar <mingo@kernel.org>
Reported-by: Kirill Tkhai <tkhai@yandex.ru>
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20140212094930.GB3545@laptop.programming.kicks-ass.net
---
 kernel/sched/core.c      |   18 +++++++++++++++++-
 kernel/sched/deadline.c  |    3 +--
 kernel/sched/fair.c      |    5 ++---
 kernel/sched/idle_task.c |    3 +--
 kernel/sched/rt.c        |    3 +--
 kernel/sched/stop_task.c |    3 +--
 6 files changed, 23 insertions(+), 12 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4681,6 +4681,22 @@ static void calc_load_migrate(struct rq
 		atomic_long_add(delta, &calc_load_tasks);
 }
 
+static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
+{
+}
+
+static const struct sched_class fake_sched_class = {
+	.put_prev_task = put_prev_task_fake,
+};
+
+static struct task_struct fake_task = {
+	/*
+	 * Avoid pull_{rt,dl}_task()
+	 */
+	.prio = MAX_PRIO + 1,
+	.sched_class = &fake_sched_class,
+};
+
 /*
  * Migrate all tasks from the rq, sleeping tasks will be migrated by
  * try_to_wake_up()->select_task_rq().
@@ -4721,7 +4737,7 @@ static void migrate_tasks(unsigned int d
 		if (rq->nr_running == 1)
 			break;
 
-		next = pick_next_task(rq, NULL);
+		next = pick_next_task(rq, &fake_task);
 		BUG_ON(!next);
 		next->sched_class->put_prev_task(rq, next);
 
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1008,8 +1008,7 @@ struct task_struct *pick_next_task_dl(st
 	if (unlikely(!dl_rq->dl_nr_running))
 		return NULL;
 
-	if (prev)
-		prev->sched_class->put_prev_task(rq, prev);
+	prev->sched_class->put_prev_task(rq, prev);
 
 	dl_se = pick_next_dl_entity(rq, dl_rq);
 	BUG_ON(!dl_se);
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4690,7 +4690,7 @@ pick_next_task_fair(struct rq *rq, struc
 	if (!cfs_rq->nr_running)
 		goto idle;
 
-	if (!prev || prev->sched_class != &fair_sched_class)
+	if (prev->sched_class != &fair_sched_class)
 		goto simple;
 
 	/*
@@ -4766,8 +4766,7 @@ pick_next_task_fair(struct rq *rq, struc
 	if (!cfs_rq->nr_running)
 		goto idle;
 
-	if (prev)
-		prev->sched_class->put_prev_task(rq, prev);
+	prev->sched_class->put_prev_task(rq, prev);
 
 	do {
 		se = pick_next_entity(cfs_rq, NULL);
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -26,8 +26,7 @@ static void check_preempt_curr_idle(stru
 static struct task_struct *
 pick_next_task_idle(struct rq *rq, struct task_struct *prev)
 {
-	if (prev)
-		prev->sched_class->put_prev_task(rq, prev);
+	prev->sched_class->put_prev_task(rq, prev);
 
 	schedstat_inc(rq, sched_goidle);
 #ifdef CONFIG_SMP
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1344,8 +1344,7 @@ pick_next_task_rt(struct rq *rq, struct
 	if (rt_rq_throttled(rt_rq))
 		return NULL;
 
-	if (prev)
-		prev->sched_class->put_prev_task(rq, prev);
+	prev->sched_class->put_prev_task(rq, prev);
 
 	p = _pick_next_task_rt(rq);
 
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -31,8 +31,7 @@ pick_next_task_stop(struct rq *rq, struc
 	if (!stop || !stop->on_rq)
 		return NULL;
 
-	if (prev)
-		prev->sched_class->put_prev_task(rq, prev);
+	prev->sched_class->put_prev_task(rq, prev);
 
 	stop->se.exec_start = rq_clock_task(rq);
 


* Re: [tip:sched/core] sched: Push put_prev_task() into pick_next_task()
  2014-02-12 14:06     ` Peter Zijlstra
@ 2014-02-12 14:24       ` Kirill Tkhai
  2014-02-12 14:54         ` Peter Zijlstra
  0 siblings, 1 reply; 14+ messages in thread
From: Kirill Tkhai @ 2014-02-12 14:24 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, hpa, linux-kernel, tglx, linux-tip-commits,
	Steven Rostedt, Juri Lelli, Dan Carpenter



12.02.2014, 18:06, "Peter Zijlstra" <peterz@infradead.org>:
> On Wed, Feb 12, 2014 at 11:00:53AM +0400, Kirill Tkhai wrote:
>
>>>  @@ -4748,7 +4743,7 @@ static void migrate_tasks(unsigned int dead_cpu)
>>>                   if (rq->nr_running == 1)
>>>                           break;
>>>
>>>  -                next = pick_next_task(rq);
>>>  +                next = pick_next_task(rq, NULL);
>>  pick_next_task() dereferences prev->sched_class first, doesn't it?
>>
>>  The same goes for pick_next_task_rt().
>
> OK, how about something like this?

Looks good to me.

Maybe pack prev->sched_class->put_prev_task() into an inline function?

Kirill

> [... full patch quoted above, snipped ...]

* Re: [tip:sched/core] sched: Push put_prev_task() into pick_next_task()
  2014-02-12 14:24       ` Kirill Tkhai
@ 2014-02-12 14:54         ` Peter Zijlstra
  0 siblings, 0 replies; 14+ messages in thread
From: Peter Zijlstra @ 2014-02-12 14:54 UTC (permalink / raw)
  To: Kirill Tkhai
  Cc: mingo, hpa, linux-kernel, tglx, linux-tip-commits,
	Steven Rostedt, Juri Lelli, Dan Carpenter

On Wed, Feb 12, 2014 at 06:24:18PM +0400, Kirill Tkhai wrote:
> Maybe pack prev->sched_class->put_prev_task() into an inline function?

Good suggestion, done.
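
Something like the below, untested sketch; the helper name and its
placement in kernel/sched/sched.h are my assumptions:

	/*
	 * Wrap the indirect call so the pick_next_task() implementations
	 * don't have to spell out the full chain everywhere.
	 */
	static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
	{
		prev->sched_class->put_prev_task(rq, prev);
	}

The various pick_next_task_*() implementations can then call
put_prev_task(rq, prev) instead of open-coding
prev->sched_class->put_prev_task(rq, prev).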


end of thread

Thread overview: 14+ messages
2012-02-11  5:05 [RFC][PATCH] sched: Optimize cgroup pick_next_task_fair Peter Zijlstra
2012-02-11  6:56 ` Mike Galbraith
2012-02-11 15:02   ` Peter Zijlstra
2012-02-16 23:20 ` Peter Zijlstra
2012-02-16 23:28   ` Peter Zijlstra
2014-02-11 12:16 ` [tip:sched/core] sched/fair: Track cgroup depth tip-bot for Peter Zijlstra
2014-02-11 12:16 ` [tip:sched/core] sched: Push put_prev_task() into pick_next_task() tip-bot for Peter Zijlstra
2014-02-12  7:00   ` Kirill Tkhai
2014-02-12 11:43     ` Peter Zijlstra
2014-02-12 14:06     ` Peter Zijlstra
2014-02-12 14:24       ` Kirill Tkhai
2014-02-12 14:54         ` Peter Zijlstra
2014-02-11 12:16 ` [tip:sched/core] sched/fair: Clean up the __clear_buddies_*() functions tip-bot for Peter Zijlstra
2014-02-11 12:16 ` [tip:sched/core] sched/fair: Optimize cgroup pick_next_task_fair() tip-bot for Peter Zijlstra
