* [RFC][PATCH 0/7] sched: balance callbacks
@ 2015-06-01 13:58 Peter Zijlstra
  2015-06-01 13:58 ` [RFC][PATCH 1/7] sched: Replace post_schedule with a balance callback list Peter Zijlstra
                   ` (8 more replies)
  0 siblings, 9 replies; 26+ messages in thread
From: Peter Zijlstra @ 2015-06-01 13:58 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, juri.lelli, pang.xunlei, oleg, linux-kernel,
	Peter Zijlstra

Hi,

Mike stumbled over a cute bug where the RT/DL balancing ops caused a crash.

The exact scenario is __sched_setscheduler() changing a (runnable) task from
FIFO to OTHER. In switched_from_rt(), where we do pull_rt_task(), we temporarily
drop rq->lock. This gap allows regular cfs load-balancing to step in and
migrate our task.

However, check_class_changed() will happily continue with switched_to_fair()
which assumes our task is still on the old rq and makes the kernel go boom.

Instead of trying to patch this up and make things complicated; simply disallow
these methods to drop rq->lock and extend the current post_schedule stuff into
a balancing callback list, and use that.

This survives Mike's testcase for well over an hour on my ivb-ep. I've not yet
tested it on anything bigger.
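
For readers not familiar with the mechanism: below is a stand-alone,
userspace-only sketch of the pattern the series introduces (the names mirror
queue_balance_callback()/balance_callback() from patch 1, but this is
illustrative code, not the kernel implementation). Balance work is queued, at
most once per callback head, while the lock is held, and only run once the
critical section is done with the task.

#include <stdio.h>
#include <stddef.h>

struct callback_head {
	struct callback_head *next;
	void (*func)(struct callback_head *head);
};

struct rq {
	struct callback_head *balance_callback;	/* singly linked, LIFO */
};

/* Queue @head; mirror the patch's 'already queued' check on head->next. */
static void queue_balance_callback(struct rq *rq, struct callback_head *head,
				   void (*func)(struct callback_head *))
{
	if (head->next)
		return;

	head->func = func;
	head->next = rq->balance_callback;
	rq->balance_callback = head;
}

/* Detach the list and run the queued work; in the kernel this runs after
 * schedule() / sched_setscheduler() have finished their critical work. */
static void balance_callback(struct rq *rq)
{
	struct callback_head *head = rq->balance_callback;

	rq->balance_callback = NULL;
	while (head) {
		struct callback_head *next = head->next;

		head->next = NULL;
		head->func(head);
		head = next;
	}
}

static void push_tasks(struct callback_head *h) { (void)h; printf("push tasks\n"); }
static void pull_tasks(struct callback_head *h) { (void)h; printf("pull tasks\n"); }

int main(void)
{
	struct rq rq = { .balance_callback = NULL };
	struct callback_head push_head = { NULL, NULL };
	struct callback_head pull_head = { NULL, NULL };

	/* "rq->lock held": decide what balancing is needed and queue it */
	queue_balance_callback(&rq, &push_head, push_tasks);
	queue_balance_callback(&rq, &pull_head, pull_tasks);

	/* critical section done: now it is safe to do the actual balancing */
	balance_callback(&rq);	/* runs pull, then push (LIFO order) */

	return 0;
}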



* [RFC][PATCH 1/7] sched: Replace post_schedule with a balance callback list
  2015-06-01 13:58 [RFC][PATCH 0/7] sched: balance callbacks Peter Zijlstra
@ 2015-06-01 13:58 ` Peter Zijlstra
  2015-06-03  4:41   ` Kamalesh Babulal
  2015-06-03  8:24   ` pang.xunlei
  2015-06-01 13:58 ` [RFC][PATCH 2/7] sched: Replace normalize_task() with __sched_setscheduler() Peter Zijlstra
                   ` (7 subsequent siblings)
  8 siblings, 2 replies; 26+ messages in thread
From: Peter Zijlstra @ 2015-06-01 13:58 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, juri.lelli, pang.xunlei, oleg, linux-kernel,
	Peter Zijlstra

[-- Attachment #1: peterz-sched-post_schedule-1.patch --]
[-- Type: text/plain, Size: 6815 bytes --]

Generalize the post_schedule() stuff into a balance callback list.
This allows us to more easily use it outside of schedule() and cross
sched_class.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c     |   36 ++++++++++++++++++++++++------------
 kernel/sched/deadline.c |   19 ++++++++++---------
 kernel/sched/rt.c       |   25 +++++++++++--------------
 kernel/sched/sched.h    |   19 +++++++++++++++++--
 4 files changed, 62 insertions(+), 37 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2277,23 +2277,35 @@ static struct rq *finish_task_switch(str
 #ifdef CONFIG_SMP
 
 /* rq->lock is NOT held, but preemption is disabled */
-static inline void post_schedule(struct rq *rq)
+static void __balance_callback(struct rq *rq)
 {
-	if (rq->post_schedule) {
-		unsigned long flags;
+	struct callback_head *head, *next;
+	void (*func)(struct rq *rq);
+	unsigned long flags;
 
-		raw_spin_lock_irqsave(&rq->lock, flags);
-		if (rq->curr->sched_class->post_schedule)
-			rq->curr->sched_class->post_schedule(rq);
-		raw_spin_unlock_irqrestore(&rq->lock, flags);
+	raw_spin_lock_irqsave(&rq->lock, flags);
+	head = rq->balance_callback;
+	rq->balance_callback = NULL;
+	while (head) {
+		func = (void (*)(struct rq *))head->func;
+		next = head->next;
+		head->next = NULL;
+		head = next;
 
-		rq->post_schedule = 0;
+		func(rq);
 	}
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+static inline void balance_callback(struct rq *rq)
+{
+	if (unlikely(rq->balance_callback))
+		__balance_callback(rq);
 }
 
 #else
 
-static inline void post_schedule(struct rq *rq)
+static inline void balance_callback(struct rq *rq)
 {
 }
 
@@ -2311,7 +2323,7 @@ asmlinkage __visible void schedule_tail(
 	/* finish_task_switch() drops rq->lock and enables preemtion */
 	preempt_disable();
 	rq = finish_task_switch(prev);
-	post_schedule(rq);
+	balance_callback(rq);
 	preempt_enable();
 
 	if (current->set_child_tid)
@@ -2822,7 +2834,7 @@ static void __sched __schedule(void)
 	} else
 		raw_spin_unlock_irq(&rq->lock);
 
-	post_schedule(rq);
+	balance_callback(rq);
 }
 
 static inline void sched_submit_work(struct task_struct *tsk)
@@ -7216,7 +7228,7 @@ void __init sched_init(void)
 		rq->sd = NULL;
 		rq->rd = NULL;
 		rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
-		rq->post_schedule = 0;
+		rq->balance_callback = NULL;
 		rq->active_balance = 0;
 		rq->next_balance = jiffies;
 		rq->push_cpu = 0;
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -213,9 +213,16 @@ static inline bool need_pull_dl_task(str
 	return dl_task(prev);
 }
 
-static inline void set_post_schedule(struct rq *rq)
+static DEFINE_PER_CPU(struct callback_head, dl_balance_head);
+
+static void push_dl_tasks(struct rq *);
+
+static inline void queue_push_tasks(struct rq *rq)
 {
-	rq->post_schedule = has_pushable_dl_tasks(rq);
+	if (!has_pushable_dl_tasks(rq))
+		return;
+
+	queue_balance_callback(rq, &per_cpu(dl_balance_head, rq->cpu), push_dl_tasks);
 }
 
 static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
@@ -1126,7 +1133,7 @@ struct task_struct *pick_next_task_dl(st
 	if (hrtick_enabled(rq))
 		start_hrtick_dl(rq, p);
 
-	set_post_schedule(rq);
+	queue_push_tasks(rq);
 
 	return p;
 }
@@ -1544,11 +1551,6 @@ static int pull_dl_task(struct rq *this_
 	return ret;
 }
 
-static void post_schedule_dl(struct rq *rq)
-{
-	push_dl_tasks(rq);
-}
-
 /*
  * Since the task is not running and a reschedule is not going to happen
  * anytime soon on its runqueue, we try pushing it away now.
@@ -1784,7 +1786,6 @@ const struct sched_class dl_sched_class
 	.set_cpus_allowed       = set_cpus_allowed_dl,
 	.rq_online              = rq_online_dl,
 	.rq_offline             = rq_offline_dl,
-	.post_schedule		= post_schedule_dl,
 	.task_woken		= task_woken_dl,
 #endif
 
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -354,13 +354,16 @@ static inline int has_pushable_tasks(str
 	return !plist_head_empty(&rq->rt.pushable_tasks);
 }
 
-static inline void set_post_schedule(struct rq *rq)
+static DEFINE_PER_CPU(struct callback_head, rt_balance_head);
+
+static void push_rt_tasks(struct rq *);
+
+static inline void queue_push_tasks(struct rq *rq)
 {
-	/*
-	 * We detect this state here so that we can avoid taking the RQ
-	 * lock again later if there is no need to push
-	 */
-	rq->post_schedule = has_pushable_tasks(rq);
+	if (!has_pushable_tasks(rq))
+		return;
+
+	queue_balance_callback(rq, &per_cpu(rt_balance_head, rq->cpu), push_rt_tasks);
 }
 
 static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
@@ -417,7 +420,7 @@ static inline int pull_rt_task(struct rq
 	return 0;
 }
 
-static inline void set_post_schedule(struct rq *rq)
+static inline void queue_push_tasks(struct rq *rq)
 {
 }
 #endif /* CONFIG_SMP */
@@ -1497,7 +1500,7 @@ pick_next_task_rt(struct rq *rq, struct
 	/* The running task is never eligible for pushing */
 	dequeue_pushable_task(rq, p);
 
-	set_post_schedule(rq);
+	queue_push_tasks(rq);
 
 	return p;
 }
@@ -2042,11 +2045,6 @@ static int pull_rt_task(struct rq *this_
 	return ret;
 }
 
-static void post_schedule_rt(struct rq *rq)
-{
-	push_rt_tasks(rq);
-}
-
 /*
  * If we are not running and we are not going to reschedule soon, we should
  * try to push tasks away now
@@ -2318,7 +2316,6 @@ const struct sched_class rt_sched_class
 	.set_cpus_allowed       = set_cpus_allowed_rt,
 	.rq_online              = rq_online_rt,
 	.rq_offline             = rq_offline_rt,
-	.post_schedule		= post_schedule_rt,
 	.task_woken		= task_woken_rt,
 	.switched_from		= switched_from_rt,
 #endif
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -624,9 +624,10 @@ struct rq {
 	unsigned long cpu_capacity;
 	unsigned long cpu_capacity_orig;
 
+	struct callback_head *balance_callback;
+
 	unsigned char idle_balance;
 	/* For active balancing */
-	int post_schedule;
 	int active_balance;
 	int push_cpu;
 	struct cpu_stop_work active_balance_work;
@@ -695,6 +696,21 @@ struct rq {
 #endif
 };
 
+static inline void
+queue_balance_callback(struct rq *rq,
+		       struct callback_head *head,
+		       void (*func)(struct rq *rq))
+{
+	lockdep_assert_held(&rq->lock);
+
+	if (unlikely(head->next))
+		return;
+
+	head->func = (void (*)(struct callback_head *))func;
+	head->next = rq->balance_callback;
+	rq->balance_callback = head;
+}
+
 static inline int cpu_of(struct rq *rq)
 {
 #ifdef CONFIG_SMP
@@ -1192,7 +1208,6 @@ struct sched_class {
 	int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
 	void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
 
-	void (*post_schedule) (struct rq *this_rq);
 	void (*task_waking) (struct task_struct *task);
 	void (*task_woken) (struct rq *this_rq, struct task_struct *task);
 




* [RFC][PATCH 2/7] sched: Replace normalize_task() with __sched_setscheduler()
  2015-06-01 13:58 [RFC][PATCH 0/7] sched: balance callbacks Peter Zijlstra
  2015-06-01 13:58 ` [RFC][PATCH 1/7] sched: Replace post_schedule with a balance callback list Peter Zijlstra
@ 2015-06-01 13:58 ` Peter Zijlstra
  2015-06-01 13:58 ` [RFC][PATCH 3/7] sched: Allow balance callbacks for check_class_changed() Peter Zijlstra
                   ` (6 subsequent siblings)
  8 siblings, 0 replies; 26+ messages in thread
From: Peter Zijlstra @ 2015-06-01 13:58 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, juri.lelli, pang.xunlei, oleg, linux-kernel,
	Peter Zijlstra

[-- Attachment #1: peterz-sched-post_schedule-6.patch --]
[-- Type: text/plain, Size: 3953 bytes --]

Reduce duplicate logic; normalize_task() is a simplified version of
__sched_setscheduler(). Parametrize the difference and collapse.

This reduces the number of check_class_changed() call sites.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c |   65 ++++++++++++++++++----------------------------------
 1 file changed, 23 insertions(+), 42 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3428,7 +3428,7 @@ static bool dl_param_changed(struct task
 
 static int __sched_setscheduler(struct task_struct *p,
 				const struct sched_attr *attr,
-				bool user)
+				bool user, bool pi)
 {
 	int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
 		      MAX_RT_PRIO - 1 - attr->sched_priority;
@@ -3614,18 +3614,20 @@ static int __sched_setscheduler(struct t
 	p->sched_reset_on_fork = reset_on_fork;
 	oldprio = p->prio;
 
-	/*
-	 * Take priority boosted tasks into account. If the new
-	 * effective priority is unchanged, we just store the new
-	 * normal parameters and do not touch the scheduler class and
-	 * the runqueue. This will be done when the task deboost
-	 * itself.
-	 */
-	new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
-	if (new_effective_prio == oldprio) {
-		__setscheduler_params(p, attr);
-		task_rq_unlock(rq, p, &flags);
-		return 0;
+	if (pi) {
+		/*
+		 * Take priority boosted tasks into account. If the new
+		 * effective priority is unchanged, we just store the new
+		 * normal parameters and do not touch the scheduler class and
+		 * the runqueue. This will be done when the task deboost
+		 * itself.
+		 */
+		new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
+		if (new_effective_prio == oldprio) {
+			__setscheduler_params(p, attr);
+			task_rq_unlock(rq, p, &flags);
+			return 0;
+		}
 	}
 
 	queued = task_on_rq_queued(p);
@@ -3636,7 +3638,7 @@ static int __sched_setscheduler(struct t
 		put_prev_task(rq, p);
 
 	prev_class = p->sched_class;
-	__setscheduler(rq, p, attr, true);
+	__setscheduler(rq, p, attr, pi);
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
@@ -3651,7 +3653,8 @@ static int __sched_setscheduler(struct t
 	check_class_changed(rq, p, prev_class, oldprio);
 	task_rq_unlock(rq, p, &flags);
 
-	rt_mutex_adjust_pi(p);
+	if (pi)
+		rt_mutex_adjust_pi(p);
 
 	return 0;
 }
@@ -3672,7 +3675,7 @@ static int _sched_setscheduler(struct ta
 		attr.sched_policy = policy;
 	}
 
-	return __sched_setscheduler(p, &attr, check);
+	return __sched_setscheduler(p, &attr, check, true);
 }
 /**
  * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
@@ -3693,7 +3696,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
 
 int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
 {
-	return __sched_setscheduler(p, attr, true);
+	return __sched_setscheduler(p, attr, true, true);
 }
 EXPORT_SYMBOL_GPL(sched_setattr);
 
@@ -7354,32 +7357,12 @@ EXPORT_SYMBOL(___might_sleep);
 #endif
 
 #ifdef CONFIG_MAGIC_SYSRQ
-static void normalize_task(struct rq *rq, struct task_struct *p)
+void normalize_rt_tasks(void)
 {
-	const struct sched_class *prev_class = p->sched_class;
+	struct task_struct *g, *p;
 	struct sched_attr attr = {
 		.sched_policy = SCHED_NORMAL,
 	};
-	int old_prio = p->prio;
-	int queued;
-
-	queued = task_on_rq_queued(p);
-	if (queued)
-		dequeue_task(rq, p, 0);
-	__setscheduler(rq, p, &attr, false);
-	if (queued) {
-		enqueue_task(rq, p, 0);
-		resched_curr(rq);
-	}
-
-	check_class_changed(rq, p, prev_class, old_prio);
-}
-
-void normalize_rt_tasks(void)
-{
-	struct task_struct *g, *p;
-	unsigned long flags;
-	struct rq *rq;
 
 	read_lock(&tasklist_lock);
 	for_each_process_thread(g, p) {
@@ -7406,9 +7389,7 @@ void normalize_rt_tasks(void)
 			continue;
 		}
 
-		rq = task_rq_lock(p, &flags);
-		normalize_task(rq, p);
-		task_rq_unlock(rq, p, &flags);
+		__sched_setscheduler(p, &attr, false, false);
 	}
 	read_unlock(&tasklist_lock);
 }




* [RFC][PATCH 3/7] sched: Allow balance callbacks for check_class_changed()
  2015-06-01 13:58 [RFC][PATCH 0/7] sched: balance callbacks Peter Zijlstra
  2015-06-01 13:58 ` [RFC][PATCH 1/7] sched: Replace post_schedule with a balance callback list Peter Zijlstra
  2015-06-01 13:58 ` [RFC][PATCH 2/7] sched: Replace normalize_task() with __sched_setscheduler() Peter Zijlstra
@ 2015-06-01 13:58 ` Peter Zijlstra
  2015-06-02 13:58   ` Kirill Tkhai
  2015-06-01 13:58 ` [RFC][PATCH 4/7] sched,rt: Remove return value from pull_rt_task() Peter Zijlstra
                   ` (5 subsequent siblings)
  8 siblings, 1 reply; 26+ messages in thread
From: Peter Zijlstra @ 2015-06-01 13:58 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, juri.lelli, pang.xunlei, oleg, linux-kernel,
	Peter Zijlstra

[-- Attachment #1: peterz-sched-post_schedule-7.patch --]
[-- Type: text/plain, Size: 3024 bytes --]

In order to remove dropping rq->lock from the
switched_{to,from}()/prio_changed() sched_class methods, run the
balance callbacks after them.

We need to remove dropping rq->lock because it's buggy; suppose we use
sched_setattr()/sched_setscheduler() to change a running task from
FIFO to OTHER.

By the time we get to switched_from_rt() the task is already enqueued
on the cfs runqueues. If switched_from_rt() does pull_rt_task() and
drops rq->lock, load-balancing can come in and move our task @p to
another rq.

The subsequent switched_to_fair() still assumes @p is on @rq and bad
things will happen.

By using balance callbacks we delay the load-balancing operations
{rt,dl}x{push,pull} until we've done all the important work and the
task is fully set up.

Furthermore, the balance callbacks do not know about @p, therefore
they cannot get confused like this.
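
In caller terms the resulting pattern looks roughly like the sketch below
(an illustration only; the actual call sites, rt_mutex_setprio() and
__sched_setscheduler(), are in the diff that follows):

	check_class_changed(rq, p, prev_class, oldprio); /* may only queue callbacks */

	preempt_disable();		/* avoid rq from going away on us */
	task_rq_unlock(rq, p, &flags);	/* drop rq->lock */

	balance_callback(rq);		/* now run the queued push/pull work */
	preempt_enable();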

Reported-by: Mike Galbraith <umgwanakikbuti@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/core.c |   25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1001,7 +1001,11 @@ inline int task_curr(const struct task_s
 }
 
 /*
- * Can drop rq->lock because from sched_class::switched_from() methods drop it.
+ * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
+ * use the balance_callback list if you want balancing.
+ *
+ * this means any call to check_class_changed() must be followed by a call to
+ * balance_callback().
  */
 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 				       const struct sched_class *prev_class,
@@ -1010,7 +1014,7 @@ static inline void check_class_changed(s
 	if (prev_class != p->sched_class) {
 		if (prev_class->switched_from)
 			prev_class->switched_from(rq, p);
-		/* Possble rq->lock 'hole'.  */
+
 		p->sched_class->switched_to(rq, p);
 	} else if (oldprio != p->prio || dl_task(p))
 		p->sched_class->prio_changed(rq, p, oldprio);
@@ -1491,8 +1495,12 @@ ttwu_do_wakeup(struct rq *rq, struct tas
 
 	p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
-	if (p->sched_class->task_woken)
+	if (p->sched_class->task_woken) {
+		/*
+		 * XXX can drop rq->lock; most likely ok.
+		 */
 		p->sched_class->task_woken(rq, p);
+	}
 
 	if (rq->idle_stamp) {
 		u64 delta = rq_clock(rq) - rq->idle_stamp;
@@ -3094,7 +3102,11 @@ void rt_mutex_setprio(struct task_struct
 
 	check_class_changed(rq, p, prev_class, oldprio);
 out_unlock:
+	preempt_disable(); /* avoid rq from going away on us */
 	__task_rq_unlock(rq);
+
+	balance_callback(rq);
+	preempt_enable();
 }
 #endif
 
@@ -3655,11 +3667,18 @@ static int __sched_setscheduler(struct t
 	}
 
 	check_class_changed(rq, p, prev_class, oldprio);
+	preempt_disable(); /* avoid rq from going away on us */
 	task_rq_unlock(rq, p, &flags);
 
 	if (pi)
 		rt_mutex_adjust_pi(p);
 
+	/*
+	 * Run balance callbacks after we've adjusted the PI chain.
+	 */
+	balance_callback(rq);
+	preempt_enable();
+
 	return 0;
 }
 




* [RFC][PATCH 4/7] sched,rt: Remove return value from pull_rt_task()
  2015-06-01 13:58 [RFC][PATCH 0/7] sched: balance callbacks Peter Zijlstra
                   ` (2 preceding siblings ...)
  2015-06-01 13:58 ` [RFC][PATCH 3/7] sched: Allow balance callbacks for check_class_changed() Peter Zijlstra
@ 2015-06-01 13:58 ` Peter Zijlstra
  2015-06-01 13:58 ` [RFC][PATCH 5/7] sched,rt: Convert switched_{from,to}_rt() / prio_changed_rt() to balance callbacks Peter Zijlstra
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 26+ messages in thread
From: Peter Zijlstra @ 2015-06-01 13:58 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, juri.lelli, pang.xunlei, oleg, linux-kernel,
	Peter Zijlstra

[-- Attachment #1: peterz-sched-post_schedule-2.patch --]
[-- Type: text/plain, Size: 2409 bytes --]

In order to be able to use pull_rt_task() from a callback, we need to
do away with the return value.

Since the return value indicates whether we should reschedule, do this
inside the function. Because not all callers currently do this, this can
increase the number of reschedules due to rt balancing.

Too many reschedules is not a correctness issue, too few are.


Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/rt.c |   22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -260,7 +260,7 @@ int alloc_rt_sched_group(struct task_gro
 
 #ifdef CONFIG_SMP
 
-static int pull_rt_task(struct rq *this_rq);
+static void pull_rt_task(struct rq *this_rq);
 
 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
 {
@@ -415,9 +415,8 @@ static inline bool need_pull_rt_task(str
 	return false;
 }
 
-static inline int pull_rt_task(struct rq *this_rq)
+static inline void pull_rt_task(struct rq *this_rq)
 {
-	return 0;
 }
 
 static inline void queue_push_tasks(struct rq *rq)
@@ -1955,14 +1954,15 @@ static void push_irq_work_func(struct ir
 }
 #endif /* HAVE_RT_PUSH_IPI */
 
-static int pull_rt_task(struct rq *this_rq)
+static void pull_rt_task(struct rq *this_rq)
 {
-	int this_cpu = this_rq->cpu, ret = 0, cpu;
+	int this_cpu = this_rq->cpu, cpu;
+	bool resched = false;
 	struct task_struct *p;
 	struct rq *src_rq;
 
 	if (likely(!rt_overloaded(this_rq)))
-		return 0;
+		return;
 
 	/*
 	 * Match the barrier from rt_set_overloaded; this guarantees that if we
@@ -1973,7 +1973,7 @@ static int pull_rt_task(struct rq *this_
 #ifdef HAVE_RT_PUSH_IPI
 	if (sched_feat(RT_PUSH_IPI)) {
 		tell_cpu_to_push(this_rq);
-		return 0;
+		return;
 	}
 #endif
 
@@ -2026,7 +2026,7 @@ static int pull_rt_task(struct rq *this_
 			if (p->prio < src_rq->curr->prio)
 				goto skip;
 
-			ret = 1;
+			resched = true;
 
 			deactivate_task(src_rq, p, 0);
 			set_task_cpu(p, this_cpu);
@@ -2042,7 +2042,8 @@ static int pull_rt_task(struct rq *this_
 		double_unlock_balance(this_rq, src_rq);
 	}
 
-	return ret;
+	if (resched)
+		resched_curr(this_rq);
 }
 
 /*
@@ -2138,8 +2139,7 @@ static void switched_from_rt(struct rq *
 	if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
 		return;
 
-	if (pull_rt_task(rq))
-		resched_curr(rq);
+	pull_rt_task(rq);
 }
 
 void __init init_sched_rt_class(void)




* [RFC][PATCH 5/7] sched,rt: Convert switched_{from,to}_rt() / prio_changed_rt() to balance callbacks
  2015-06-01 13:58 [RFC][PATCH 0/7] sched: balance callbacks Peter Zijlstra
                   ` (3 preceding siblings ...)
  2015-06-01 13:58 ` [RFC][PATCH 4/7] sched,rt: Remove return value from pull_rt_task() Peter Zijlstra
@ 2015-06-01 13:58 ` Peter Zijlstra
  2015-06-01 13:58 ` [RFC][PATCH 6/7] sched,dl: Remove return value from pull_dl_task() Peter Zijlstra
                   ` (3 subsequent siblings)
  8 siblings, 0 replies; 26+ messages in thread
From: Peter Zijlstra @ 2015-06-01 13:58 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, juri.lelli, pang.xunlei, oleg, linux-kernel,
	Peter Zijlstra

[-- Attachment #1: peterz-sched-post_schedule-3.patch --]
[-- Type: text/plain, Size: 3009 bytes --]

Remove the direct {push,pull} balancing operations from
switched_{from,to}_rt() / prio_changed_rt() and use the balance
callback queue.

Again, err on the side of too many reschedules, since too few is a
hard bug while too many is just annoying.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/rt.c |   35 +++++++++++++++++++----------------
 1 file changed, 19 insertions(+), 16 deletions(-)

--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -354,16 +354,23 @@ static inline int has_pushable_tasks(str
 	return !plist_head_empty(&rq->rt.pushable_tasks);
 }
 
-static DEFINE_PER_CPU(struct callback_head, rt_balance_head);
+static DEFINE_PER_CPU(struct callback_head, rt_push_head);
+static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
 
 static void push_rt_tasks(struct rq *);
+static void pull_rt_task(struct rq *);
 
 static inline void queue_push_tasks(struct rq *rq)
 {
 	if (!has_pushable_tasks(rq))
 		return;
 
-	queue_balance_callback(rq, &per_cpu(rt_balance_head, rq->cpu), push_rt_tasks);
+	queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
+}
+
+static inline void queue_pull_task(struct rq *rq)
+{
+	queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
 }
 
 static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
@@ -2139,7 +2146,7 @@ static void switched_from_rt(struct rq *
 	if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
 		return;
 
-	pull_rt_task(rq);
+	queue_pull_task(rq);
 }
 
 void __init init_sched_rt_class(void)
@@ -2160,8 +2167,6 @@ void __init init_sched_rt_class(void)
  */
 static void switched_to_rt(struct rq *rq, struct task_struct *p)
 {
-	int check_resched = 1;
-
 	/*
 	 * If we are already running, then there's nothing
 	 * that needs to be done. But if we are not running
@@ -2171,13 +2176,12 @@ static void switched_to_rt(struct rq *rq
 	 */
 	if (task_on_rq_queued(p) && rq->curr != p) {
 #ifdef CONFIG_SMP
-		if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
-		    /* Don't resched if we changed runqueues */
-		    push_rt_task(rq) && rq != task_rq(p))
-			check_resched = 0;
-#endif /* CONFIG_SMP */
-		if (check_resched && p->prio < rq->curr->prio)
+		if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
+			queue_push_tasks(rq);
+#else
+		if (p->prio < rq->curr->prio)
 			resched_curr(rq);
+#endif /* CONFIG_SMP */
 	}
 }
 
@@ -2198,14 +2202,13 @@ prio_changed_rt(struct rq *rq, struct ta
 		 * may need to pull tasks to this runqueue.
 		 */
 		if (oldprio < p->prio)
-			pull_rt_task(rq);
+			queue_pull_task(rq);
+
 		/*
 		 * If there's a higher priority task waiting to run
-		 * then reschedule. Note, the above pull_rt_task
-		 * can release the rq lock and p could migrate.
-		 * Only reschedule if p is still on the same runqueue.
+		 * then reschedule.
 		 */
-		if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
+		if (p->prio > rq->rt.highest_prio.curr)
 			resched_curr(rq);
 #else
 		/* For UP simply resched on drop of prio */




* [RFC][PATCH 6/7] sched,dl: Remove return value from pull_dl_task()
  2015-06-01 13:58 [RFC][PATCH 0/7] sched: balance callbacks Peter Zijlstra
                   ` (4 preceding siblings ...)
  2015-06-01 13:58 ` [RFC][PATCH 5/7] sched,rt: Convert switched_{from,to}_rt() / prio_changed_rt() to balance callbacks Peter Zijlstra
@ 2015-06-01 13:58 ` Peter Zijlstra
  2015-06-01 13:58 ` [RFC][PATCH 7/7] sched,dl: Convert switched_{from,to}_dl() / prio_changed_dl() to balance callbacks Peter Zijlstra
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 26+ messages in thread
From: Peter Zijlstra @ 2015-06-01 13:58 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, juri.lelli, pang.xunlei, oleg, linux-kernel,
	Peter Zijlstra

[-- Attachment #1: peterz-sched-post_schedule-4.patch --]
[-- Type: text/plain, Size: 2136 bytes --]

In order to be able to use pull_dl_task() from a callback, we need to
do away with the return value.

Since the return value indicates whether we should reschedule, do this
inside the function. Because not all callers currently do this, this can
increase the number of reschedules due to dl balancing.

Too many reschedules is not a correctness issue, too few are.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/deadline.c |   20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -298,9 +298,8 @@ static inline bool need_pull_dl_task(str
 	return false;
 }
 
-static inline int pull_dl_task(struct rq *rq)
+static inline void pull_dl_task(struct rq *rq)
 {
-	return 0;
 }
 
 static inline void set_post_schedule(struct rq *rq)
@@ -1041,7 +1040,7 @@ static void check_preempt_equal_dl(struc
 	resched_curr(rq);
 }
 
-static int pull_dl_task(struct rq *this_rq);
+static void pull_dl_task(struct rq *this_rq);
 
 #endif /* CONFIG_SMP */
 
@@ -1472,15 +1471,16 @@ static void push_dl_tasks(struct rq *rq)
 		;
 }
 
-static int pull_dl_task(struct rq *this_rq)
+static void pull_dl_task(struct rq *this_rq)
 {
-	int this_cpu = this_rq->cpu, ret = 0, cpu;
+	int this_cpu = this_rq->cpu, cpu;
 	struct task_struct *p;
+	bool resched = false;
 	struct rq *src_rq;
 	u64 dmin = LONG_MAX;
 
 	if (likely(!dl_overloaded(this_rq)))
-		return 0;
+		return;
 
 	/*
 	 * Match the barrier from dl_set_overloaded; this guarantees that if we
@@ -1535,7 +1535,7 @@ static int pull_dl_task(struct rq *this_
 					   src_rq->curr->dl.deadline))
 				goto skip;
 
-			ret = 1;
+			resched = true;
 
 			deactivate_task(src_rq, p, 0);
 			set_task_cpu(p, this_cpu);
@@ -1548,7 +1548,8 @@ static int pull_dl_task(struct rq *this_
 		double_unlock_balance(this_rq, src_rq);
 	}
 
-	return ret;
+	if (resched)
+		resched_curr(this_rq);
 }
 
 /*
@@ -1704,8 +1705,7 @@ static void switched_from_dl(struct rq *
 	if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
 		return;
 
-	if (pull_dl_task(rq))
-		resched_curr(rq);
+	pull_dl_task(rq);
 }
 
 /*




* [RFC][PATCH 7/7] sched,dl: Convert switched_{from,to}_dl() / prio_changed_dl() to balance callbacks
  2015-06-01 13:58 [RFC][PATCH 0/7] sched: balance callbacks Peter Zijlstra
                   ` (5 preceding siblings ...)
  2015-06-01 13:58 ` [RFC][PATCH 6/7] sched,dl: Remove return value from pull_dl_task() Peter Zijlstra
@ 2015-06-01 13:58 ` Peter Zijlstra
  2015-06-03  4:52   ` Kamalesh Babulal
  2015-06-01 14:16 ` [RFC][PATCH 0/7] sched: " Peter Zijlstra
  2015-06-01 14:42 ` Mike Galbraith
  8 siblings, 1 reply; 26+ messages in thread
From: Peter Zijlstra @ 2015-06-01 13:58 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, juri.lelli, pang.xunlei, oleg, linux-kernel,
	Peter Zijlstra

[-- Attachment #1: peterz-sched-post_schedule-5.patch --]
[-- Type: text/plain, Size: 2934 bytes --]

Remove the direct {push,pull} balancing operations from
switched_{from,to}_dl() / prio_changed_dl() and use the balance
callback queue.

Again, err on the side of too many reschedules, since too few is a
hard bug while too many is just annoying.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/deadline.c |   41 ++++++++++++++++++++---------------------
 1 file changed, 20 insertions(+), 21 deletions(-)

--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -213,16 +213,23 @@ static inline bool need_pull_dl_task(str
 	return dl_task(prev);
 }
 
-static DEFINE_PER_CPU(struct callback_head, dl_balance_head);
+static DEFINE_PER_CPU(struct callback_head, dl_push_head);
+static DEFINE_PER_CPU(struct callback_head, dl_pull_head);
 
 static void push_dl_tasks(struct rq *);
+static void pull_dl_task(struct rq *);
 
 static inline void queue_push_tasks(struct rq *rq)
 {
 	if (!has_pushable_dl_tasks(rq))
 		return;
 
-	queue_balance_callback(rq, &per_cpu(dl_balance_head, rq->cpu), push_dl_tasks);
+	queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks);
+}
+
+static inline void queue_pull_task(struct rq *rq)
+{
+	queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task);
 }
 
 static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
@@ -1040,8 +1047,6 @@ static void check_preempt_equal_dl(struc
 	resched_curr(rq);
 }
 
-static void pull_dl_task(struct rq *this_rq);
-
 #endif /* CONFIG_SMP */
 
 /*
@@ -1705,7 +1710,7 @@ static void switched_from_dl(struct rq *
 	if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
 		return;
 
-	pull_dl_task(rq);
+	queue_pull_task(rq);
 }
 
 /*
@@ -1714,21 +1719,16 @@ static void switched_from_dl(struct rq *
  */
 static void switched_to_dl(struct rq *rq, struct task_struct *p)
 {
-	int check_resched = 1;
-
 	if (task_on_rq_queued(p) && rq->curr != p) {
 #ifdef CONFIG_SMP
-		if (p->nr_cpus_allowed > 1 && rq->dl.overloaded &&
-			push_dl_task(rq) && rq != task_rq(p))
-			/* Only reschedule if pushing failed */
-			check_resched = 0;
-#endif /* CONFIG_SMP */
-		if (check_resched) {
-			if (dl_task(rq->curr))
-				check_preempt_curr_dl(rq, p, 0);
-			else
-				resched_curr(rq);
-		}
+		if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
+			queue_push_tasks(rq);
+#else
+		if (dl_task(rq->curr))
+			check_preempt_curr_dl(rq, p, 0);
+		else
+			resched_curr(rq);
+#endif
 	}
 }
 
@@ -1748,15 +1748,14 @@ static void prio_changed_dl(struct rq *r
 		 * or lowering its prio, so...
 		 */
 		if (!rq->dl.overloaded)
-			pull_dl_task(rq);
+			queue_pull_task(rq);
 
 		/*
 		 * If we now have a earlier deadline task than p,
 		 * then reschedule, provided p is still on this
 		 * runqueue.
 		 */
-		if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) &&
-		    rq->curr == p)
+		if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline))
 			resched_curr(rq);
 #else
 		/*




* Re: [RFC][PATCH 0/7] sched: balance callbacks
  2015-06-01 13:58 [RFC][PATCH 0/7] sched: balance callbacks Peter Zijlstra
                   ` (6 preceding siblings ...)
  2015-06-01 13:58 ` [RFC][PATCH 7/7] sched,dl: Convert switched_{from,to}_dl() / prio_changed_dl() to balance callbacks Peter Zijlstra
@ 2015-06-01 14:16 ` Peter Zijlstra
  2015-06-01 14:42 ` Mike Galbraith
  8 siblings, 0 replies; 26+ messages in thread
From: Peter Zijlstra @ 2015-06-01 14:16 UTC (permalink / raw)
  To: umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, juri.lelli, pang.xunlei, oleg, linux-kernel

On Mon, Jun 01, 2015 at 03:58:18PM +0200, Peter Zijlstra wrote:
> Hi,
> 
> Mike stumbled over a cute bug where the RT/DL balancing ops caused a bug.
> 
> The exact scenario is __sched_setscheduler() changing a (runnable) task from
> FIFO to OTHER. In swiched_from_rt(), where we do pull_rt_task() we temporarity
> drop rq->lock. This gap allows regular cfs load-balancing to step in and
> migrate our.

s/\./ task&/

> However, check_class_changed() will happily continue with switched_to_fair()
> which assumes our task is still on the old rq and makes the kernel go boom.
> 
> Instead of trying to patch this up and make things complicated; simply disallow
> these methods to drop rq->lock and extend the current post_schedule stuff into
> a balancing callback list, and use that.
> 
> This survives Mike's testcase for well over an hour on my ivb-ep. I've not yet
> tested it on anything bigger.
> 


* Re: [RFC][PATCH 0/7] sched: balance callbacks
  2015-06-01 13:58 [RFC][PATCH 0/7] sched: balance callbacks Peter Zijlstra
                   ` (7 preceding siblings ...)
  2015-06-01 14:16 ` [RFC][PATCH 0/7] sched: " Peter Zijlstra
@ 2015-06-01 14:42 ` Mike Galbraith
  2015-06-02  6:48   ` Mike Galbraith
  8 siblings, 1 reply; 26+ messages in thread
From: Mike Galbraith @ 2015-06-01 14:42 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, ktkhai, rostedt, juri.lelli, pang.xunlei, oleg, linux-kernel

On Mon, 2015-06-01 at 15:58 +0200, Peter Zijlstra wrote:

> This survives Mike's testcase for well over an hour on my ivb-ep. I've not yet
> tested it on anything bigger.

I'll plug it into -rt, and beat it up.  I don't have any dl toys though.

	-Mike




* Re: [RFC][PATCH 0/7] sched: balance callbacks
  2015-06-01 14:42 ` Mike Galbraith
@ 2015-06-02  6:48   ` Mike Galbraith
  0 siblings, 0 replies; 26+ messages in thread
From: Mike Galbraith @ 2015-06-02  6:48 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: mingo, ktkhai, rostedt, juri.lelli, pang.xunlei, oleg, linux-kernel

On Mon, 2015-06-01 at 16:42 +0200, Mike Galbraith wrote:
> On Mon, 2015-06-01 at 15:58 +0200, Peter Zijlstra wrote:
> 
> > This survives Mike's testcase for well over an hour on my ivb-ep. I've not yet
> > tested it on anything bigger.
> 
> I'll plug it into -rt, and beat it up.  I don't have any dl toys though.

Did that, 64 core box is happy camper.

	-Mike



* Re: [RFC][PATCH 3/7] sched: Allow balance callbacks for check_class_changed()
  2015-06-01 13:58 ` [RFC][PATCH 3/7] sched: Allow balance callbacks for check_class_changed() Peter Zijlstra
@ 2015-06-02 13:58   ` Kirill Tkhai
  2015-06-02 14:27     ` Peter Zijlstra
  0 siblings, 1 reply; 26+ messages in thread
From: Kirill Tkhai @ 2015-06-02 13:58 UTC (permalink / raw)
  To: Peter Zijlstra, umgwanakikbuti, mingo
  Cc: ktkhai, rostedt, juri.lelli, pang.xunlei, oleg, linux-kernel

01.06.2015, 17:13, "Peter Zijlstra" <peterz@infradead.org>:
> In order to remove dropping rq->lock from the
> switched_{to,from}()/prio_changed() sched_class methods, run the
> balance callbacks after it.
>
> We need to remove dropping rq->lock because its buggy,
> suppose using sched_setattr()/sched_setscheduler() to change a running
> task from FIFO to OTHER.
>
> By the time we get to switched_from_rt() the task is already enqueued
> on the cfs runqueues. If switched_from_rt() does pull_rt_task() and
> drops rq->lock, load-balancing can come in and move our task @p to
> another rq.
>
> The subsequent switched_to_fair() still assumes @p is on @rq and bad
> things will happen.
>
> By using balance callbacks we delay the load-balancing operations
> {rt,dl}x{push,pull} until we've done all the important work and the
> task is fully set up.
>
> Furthermore, the balance callbacks do not know about @p, therefore
> they cannot get confused like this.
>
> Reported-by: Mike Galbraith <umgwanakikbuti@gmail.com>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  kernel/sched/core.c |   25 ++++++++++++++++++++++---
>  1 file changed, 22 insertions(+), 3 deletions(-)
>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1001,7 +1001,11 @@ inline int task_curr(const struct task_s
>  }
>
>  /*
> - * Can drop rq->lock because from sched_class::switched_from() methods drop it.
> + * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
> + * use the balance_callback list if you want balancing.
> + *
> + * this means any call to check_class_changed() must be followed by a call to
> + * balance_callback().
>   */
>  static inline void check_class_changed(struct rq *rq, struct task_struct *p,
>                                         const struct sched_class *prev_class,
> @@ -1010,7 +1014,7 @@ static inline void check_class_changed(s
>          if (prev_class != p->sched_class) {
>                  if (prev_class->switched_from)
>                          prev_class->switched_from(rq, p);
> - /* Possble rq->lock 'hole'.  */
> +

But switched_from_dl()->cancel_dl_timer() still unlocks rq->lock.

It seems we should drop it (cancel_dl_timer) and move hrtimer_cancel()
from switched_from_dl() to finish_task_switch(). It would then be executed
for all classes and completely take over the functionality we implement
cancel_dl_timer() for.

>                  p->sched_class->switched_to(rq, p);
>          } else if (oldprio != p->prio || dl_task(p))
>                  p->sched_class->prio_changed(rq, p, oldprio);


* Re: [RFC][PATCH 3/7] sched: Allow balance callbacks for check_class_changed()
  2015-06-02 13:58   ` Kirill Tkhai
@ 2015-06-02 14:27     ` Peter Zijlstra
  2015-06-02 16:07       ` Peter Zijlstra
  0 siblings, 1 reply; 26+ messages in thread
From: Peter Zijlstra @ 2015-06-02 14:27 UTC (permalink / raw)
  To: Kirill Tkhai
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, juri.lelli, pang.xunlei,
	oleg, linux-kernel

On Tue, 2015-06-02 at 16:58 +0300, Kirill Tkhai wrote:
> 01.06.2015, 17:13, "Peter Zijlstra" <peterz@infradead.org>:

> > @@ -1010,7 +1014,7 @@ static inline void check_class_changed(s
> >          if (prev_class != p->sched_class) {
> >                  if (prev_class->switched_from)
> >                          prev_class->switched_from(rq, p);
> > - /* Possble rq->lock 'hole'.  */
> > +
> 
> But switched_from_dl()->cancel_dl_timer() still unlocks rq->lock.
> 
> It seems we should drop it (cancel_dl_timer) and move hrtimer_cancel()
> from switched_from_dl() to finish_task_switch(). It will be executed
> for all classes and completely take the functionality we implement
> cancel_dl_timer() for.

*groan* yes.. I don't like moving it into generic code though.

more thinking required.

> >                  p->sched_class->switched_to(rq, p);
> >          } else if (oldprio != p->prio || dl_task(p))
> >                  p->sched_class->prio_changed(rq, p, oldprio);



* Re: [RFC][PATCH 3/7] sched: Allow balance callbacks for check_class_changed()
  2015-06-02 14:27     ` Peter Zijlstra
@ 2015-06-02 16:07       ` Peter Zijlstra
  2015-06-02 16:27         ` Kirill Tkhai
  0 siblings, 1 reply; 26+ messages in thread
From: Peter Zijlstra @ 2015-06-02 16:07 UTC (permalink / raw)
  To: Kirill Tkhai
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, juri.lelli, pang.xunlei,
	oleg, linux-kernel, luca.abeni

On Tue, Jun 02, 2015 at 04:27:10PM +0200, Peter Zijlstra wrote:
> On Tue, 2015-06-02 at 16:58 +0300, Kirill Tkhai wrote:
> > 01.06.2015, 17:13, "Peter Zijlstra" <peterz@infradead.org>:
> 
> > > @@ -1010,7 +1014,7 @@ static inline void check_class_changed(s
> > >          if (prev_class != p->sched_class) {
> > >                  if (prev_class->switched_from)
> > >                          prev_class->switched_from(rq, p);
> > > - /* Possble rq->lock 'hole'.  */
> > > +
> > 
> > But switched_from_dl()->cancel_dl_timer() still unlocks rq->lock.
> > 
> > It seems we should drop it (cancel_dl_timer) and move hrtimer_cancel()
> > from switched_from_dl() to finish_task_switch(). It will be executed
> > for all classes and completely take the functionality we implement
> > cancel_dl_timer() for.
> 
> *groan* yes.. I don't like moving it into generic code though.
> 
> more thinking required.

How about something like so then?

---
Subject: sched,deadline: Fix sched class hopping CBS hole

We still have a few pending issues with the deadline code, one of which
is that switching between scheduling classes can 'leak' CBS state.

Close the hole by retaining the current CBS state when leaving
SCHED_DEADLINE and unconditionally programming the deadline timer.

The timer will then reset the CBS state if the task is still
!SCHED_DEADLINE by the time it hits.

If the task were to die, task_dead_dl() will cancel the timer, so no
lingering state.

Cc: Luca Abeni <luca.abeni@unitn.it>
Cc: Juri Lelli <juri.lelli@arm.com>
Fixes: 40767b0dc768 ("sched/deadline: Fix deadline parameter modification handling")
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 kernel/sched/deadline.c | 71 ++++++++++++++++++++-----------------------------
 1 file changed, 29 insertions(+), 42 deletions(-)

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 7336fe5fea30..625ed74f1467 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -572,20 +572,26 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 	rq = task_rq_lock(p, &flags);
 
 	/*
-	 * We need to take care of several possible races here:
-	 *
-	 *   - the task might have changed its scheduling policy
-	 *     to something different than SCHED_DEADLINE
-	 *   - the task might have changed its reservation parameters
-	 *     (through sched_setattr())
-	 *   - the task might have been boosted by someone else and
-	 *     might be in the boosting/deboosting path
-	 *
-	 * In all this cases we bail out, as the task is already
-	 * in the runqueue or is going to be enqueued back anyway.
+	 * The task might have changed its scheduling policy to something
+	 * different than SCHED_DEADLINE (through switched_fromd_dl()).
 	 */
-	if (!dl_task(p) || dl_se->dl_new ||
-	    dl_se->dl_boosted || !dl_se->dl_throttled)
+	if (!dl_task(p)) {
+		__dl_clear_params(p);
+		goto unlock;
+	}
+
+	/*
+	 * There's only two ways to have dl_new set; 1) __sched_fork(), and a
+	 * fresh task should not have a pending timer, and 2) the above, which
+	 * does not get here.
+	 */
+	WARN_ON_ONCE(dl_se->dl_new);
+
+	/*
+	 * The task might have been boosted by someone else and might be in the
+	 * boosting/deboosting path, its not throttled.
+	 */
+	if (dl_se->dl_boosted || !dl_se->dl_throttled)
 		goto unlock;
 
 	sched_clock_tick();
@@ -1683,37 +1689,18 @@ void __init init_sched_dl_class(void)
 
 #endif /* CONFIG_SMP */
 
-/*
- *  Ensure p's dl_timer is cancelled. May drop rq->lock for a while.
- */
-static void cancel_dl_timer(struct rq *rq, struct task_struct *p)
-{
-	struct hrtimer *dl_timer = &p->dl.dl_timer;
-
-	/* Nobody will change task's class if pi_lock is held */
-	lockdep_assert_held(&p->pi_lock);
-
-	if (hrtimer_active(dl_timer)) {
-		int ret = hrtimer_try_to_cancel(dl_timer);
-
-		if (unlikely(ret == -1)) {
-			/*
-			 * Note, p may migrate OR new deadline tasks
-			 * may appear in rq when we are unlocking it.
-			 * A caller of us must be fine with that.
-			 */
-			raw_spin_unlock(&rq->lock);
-			hrtimer_cancel(dl_timer);
-			raw_spin_lock(&rq->lock);
-		}
-	}
-}
-
 static void switched_from_dl(struct rq *rq, struct task_struct *p)
 {
-	/* XXX we should retain the bw until 0-lag */
-	cancel_dl_timer(rq, p);
-	__dl_clear_params(p);
+	/*
+	 * Start the deadline timer; if we switch back to dl before this we'll
+	 * continue consuming our current CBS slice. If we stay outside of
+	 * SCHED_DEADLINE until the deadline passes, the timer will reset the
+	 * task.
+	 *
+	 * task_dead_dl() will cancel our timer if we happen to die while
+	 * its still pending.
+	 */
+	start_dl_timer(&p->dl, false);
 
 	/*
 	 * Since this might be the only -deadline task on the rq,


* Re: [RFC][PATCH 3/7] sched: Allow balance callbacks for check_class_changed()
  2015-06-02 16:07       ` Peter Zijlstra
@ 2015-06-02 16:27         ` Kirill Tkhai
  2015-06-03  7:32           ` Peter Zijlstra
  0 siblings, 1 reply; 26+ messages in thread
From: Kirill Tkhai @ 2015-06-02 16:27 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, juri.lelli, pang.xunlei,
	oleg, linux-kernel, luca.abeni

On Tue, 02/06/2015 at 18:07 +0200, Peter Zijlstra wrote:
> On Tue, Jun 02, 2015 at 04:27:10PM +0200, Peter Zijlstra wrote:
> > On Tue, 2015-06-02 at 16:58 +0300, Kirill Tkhai wrote:
> > > 01.06.2015, 17:13, "Peter Zijlstra" <peterz@infradead.org>:
> > 
> > > > @@ -1010,7 +1014,7 @@ static inline void check_class_changed(s
> > > >          if (prev_class != p->sched_class) {
> > > >                  if (prev_class->switched_from)
> > > >                          prev_class->switched_from(rq, p);
> > > > - /* Possble rq->lock 'hole'.  */
> > > > +
> > > 
> > > But switched_from_dl()->cancel_dl_timer() still unlocks rq->lock.
> > > 
> > > It seems we should drop it (cancel_dl_timer) and move hrtimer_cancel()
> > > from switched_from_dl() to finish_task_switch(). It will be executed
> > > for all classes and completely take the functionality we implement
> > > cancel_dl_timer() for.
> > 
> > *groan* yes.. I don't like moving it into generic code though.
> > 
> > more thinking required.
> 
> How about something like so then?
> 
> ---
> Subject: sched,deadline: Fix sched class hopping CBS hole
> 
> We still have a few pending issues with the deadline code, one of which
> is that switching between scheduling classes can 'leak' CBS state.
> 
> Close the hole by retaining the current CBS state when leaving
> SCHED_DEADLINE and unconditionally programming the deadline timer.
> 
> The timer will then reset the CBS state if the task is still
> !SCHED_DEADLINE by the time it hits.
> 
> If the task were to die, task_dead_dl() will cancel the timer, so no
> lingering state.
> 
> Cc: Luca Abeni <luca.abeni@unitn.it>
> Cc: Juri Lelli <juri.lelli@arm.com>
> Fixes: 40767b0dc768 ("sched/deadline: Fix deadline parameter modification handling")
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  kernel/sched/deadline.c | 71 ++++++++++++++++++++-----------------------------
>  1 file changed, 29 insertions(+), 42 deletions(-)
> 
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index 7336fe5fea30..625ed74f1467 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -572,20 +572,26 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
>  	rq = task_rq_lock(p, &flags);
>  
>  	/*
> -	 * We need to take care of several possible races here:
> -	 *
> -	 *   - the task might have changed its scheduling policy
> -	 *     to something different than SCHED_DEADLINE
> -	 *   - the task might have changed its reservation parameters
> -	 *     (through sched_setattr())
> -	 *   - the task might have been boosted by someone else and
> -	 *     might be in the boosting/deboosting path
> -	 *
> -	 * In all this cases we bail out, as the task is already
> -	 * in the runqueue or is going to be enqueued back anyway.
> +	 * The task might have changed its scheduling policy to something
> +	 * different than SCHED_DEADLINE (through switched_fromd_dl()).
>  	 */
> -	if (!dl_task(p) || dl_se->dl_new ||
> -	    dl_se->dl_boosted || !dl_se->dl_throttled)
> +	if (!dl_task(p)) {
> +		__dl_clear_params(p);
> +		goto unlock;
> +	}
> +
> +	/*
> +	 * There's only two ways to have dl_new set; 1) __sched_fork(), and a
> +	 * fresh task should not have a pending timer, and 2) the above, which
> +	 * does not get here.
> +	 */
> +	WARN_ON_ONCE(dl_se->dl_new);
> +
> +	/*
> +	 * The task might have been boosted by someone else and might be in the
> +	 * boosting/deboosting path, its not throttled.
> +	 */
> +	if (dl_se->dl_boosted || !dl_se->dl_throttled)
>  		goto unlock;
>  
>  	sched_clock_tick();
> @@ -1683,37 +1689,18 @@ void __init init_sched_dl_class(void)
>  
>  #endif /* CONFIG_SMP */
>  
> -/*
> - *  Ensure p's dl_timer is cancelled. May drop rq->lock for a while.
> - */
> -static void cancel_dl_timer(struct rq *rq, struct task_struct *p)
> -{
> -	struct hrtimer *dl_timer = &p->dl.dl_timer;
> -
> -	/* Nobody will change task's class if pi_lock is held */
> -	lockdep_assert_held(&p->pi_lock);
> -
> -	if (hrtimer_active(dl_timer)) {
> -		int ret = hrtimer_try_to_cancel(dl_timer);
> -
> -		if (unlikely(ret == -1)) {
> -			/*
> -			 * Note, p may migrate OR new deadline tasks
> -			 * may appear in rq when we are unlocking it.
> -			 * A caller of us must be fine with that.
> -			 */
> -			raw_spin_unlock(&rq->lock);
> -			hrtimer_cancel(dl_timer);
> -			raw_spin_lock(&rq->lock);
> -		}
> -	}
> -}
> -
>  static void switched_from_dl(struct rq *rq, struct task_struct *p)
>  {
> -	/* XXX we should retain the bw until 0-lag */
> -	cancel_dl_timer(rq, p);
> -	__dl_clear_params(p);
> +	/*
> +	 * Start the deadline timer; if we switch back to dl before this we'll
> +	 * continue consuming our current CBS slice. If we stay outside of
> +	 * SCHED_DEADLINE until the deadline passes, the timer will reset the
> +	 * task.
> +	 *
> +	 * task_dead_dl() will cancel our timer if we happen to die while
> +	 * its still pending.

task_dead_dl() is called for tasks of the deadline class only. So if we do that,
the timer may fire after the task has already died.

> +	 */
> +	start_dl_timer(&p->dl, false);
>  
>  	/*
>  	 * Since this might be the only -deadline task on the rq,
> 


* Re: [RFC][PATCH 1/7] sched: Replace post_schedule with a balance callback list
  2015-06-01 13:58 ` [RFC][PATCH 1/7] sched: Replace post_schedule with a balance callback list Peter Zijlstra
@ 2015-06-03  4:41   ` Kamalesh Babulal
  2015-06-03  7:21     ` Peter Zijlstra
  2015-06-03  8:24   ` pang.xunlei
  1 sibling, 1 reply; 26+ messages in thread
From: Kamalesh Babulal @ 2015-06-03  4:41 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, juri.lelli, pang.xunlei,
	oleg, linux-kernel

* Peter Zijlstra <peterz@infradead.org> [2015-06-01 15:58:19]:

[...]

> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -213,9 +213,16 @@ static inline bool need_pull_dl_task(str
>  	return dl_task(prev);
>  }
>  
> -static inline void set_post_schedule(struct rq *rq)
> +static DEFINE_PER_CPU(struct callback_head, dl_balance_head);
> +
> +static void push_dl_tasks(struct rq *);
> +
> +static inline void queue_push_tasks(struct rq *rq)
>  {
> -	rq->post_schedule = has_pushable_dl_tasks(rq);
> +	if (!has_pushable_dl_tasks(rq))
> +		return;
> +
> +	queue_balance_callback(rq, &per_cpu(dl_balance_head, rq->cpu), push_dl_tasks);
>  }

When compiling with CONFIG_SMP=n, the following build error is triggered:
  CC      kernel/sched/deadline.o
kernel/sched/deadline.c: In function ‘pick_next_task_dl’:
kernel/sched/deadline.c:1136:2: error: implicit declaration of function ‘queue_push_tasks’ [-Werror=implicit-function-declaration]
  queue_push_tasks(rq);
  ^
cc1: some warnings being treated as errors

set_post_schedule() exists for the CONFIG_SMP=n case, but it was not converted to
queue_push_tasks().

[...]
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -624,9 +624,10 @@ struct rq {
>  	unsigned long cpu_capacity;
>  	unsigned long cpu_capacity_orig;
>  
> +	struct callback_head *balance_callback;
> +
>  	unsigned char idle_balance;
>  	/* For active balancing */
> -	int post_schedule;
>  	int active_balance;
>  	int push_cpu;
>  	struct cpu_stop_work active_balance_work;
> @@ -695,6 +696,21 @@ struct rq {
>  #endif
>  };
>  
> +static inline void
> +queue_balance_callback(struct rq *rq,
> +		       struct callback_head *head,
> +		       void (*func)(struct rq *rq))
> +{
> +	lockdep_assert_held(&rq->lock);
> +
> +	if (unlikely(head->next))
> +		return;
> +
> +	head->func = (void (*)(struct callback_head *))func;
> +	head->next = rq->balance_callback;
> +	rq->balance_callback = head;
> +}
> +

While compiling with CONFIG_SMP=n, another build error is seen:

In file included from kernel/sched/core.c:86:0:
kernel/sched/sched.h: In function ‘queue_balance_callback’:
kernel/sched/sched.h:710:17: error: ‘struct rq’ has no member named ‘balance_callback’
  head->next = rq->balance_callback;
                 ^
kernel/sched/sched.h:711:4: error: ‘struct rq’ has no member named ‘balance_callback’
  rq->balance_callback = head;
    ^

Guarding queue_balance_callback() with #ifdef CONFIG_SMP fixes
the issue, as all of its call sites are also under #ifdef CONFIG_SMP.
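
Something along these lines (a sketch only; the actual fix may of course
look different):

#ifdef CONFIG_SMP
static inline void
queue_balance_callback(struct rq *rq,
		       struct callback_head *head,
		       void (*func)(struct rq *rq))
{
	lockdep_assert_held(&rq->lock);

	if (unlikely(head->next))
		return;

	head->func = (void (*)(struct callback_head *))func;
	head->next = rq->balance_callback;
	rq->balance_callback = head;
}
#endif /* CONFIG_SMP */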

Thanks,
Kamalesh



* Re: [RFC][PATCH 7/7] sched,dl: Convert switched_{from,to}_dl() / prio_changed_dl() to balance callbacks
  2015-06-01 13:58 ` [RFC][PATCH 7/7] sched,dl: Convert switched_{from,to}_dl() / prio_changed_dl() to balance callbacks Peter Zijlstra
@ 2015-06-03  4:52   ` Kamalesh Babulal
  0 siblings, 0 replies; 26+ messages in thread
From: Kamalesh Babulal @ 2015-06-03  4:52 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, juri.lelli, pang.xunlei,
	oleg, linux-kernel

* Peter Zijlstra <peterz@infradead.org> [2015-06-01 15:58:25]:

[...]

>  
> -static DEFINE_PER_CPU(struct callback_head, dl_balance_head);
> +static DEFINE_PER_CPU(struct callback_head, dl_push_head);
> +static DEFINE_PER_CPU(struct callback_head, dl_pull_head);
>  
>  static void push_dl_tasks(struct rq *);
> +static void pull_dl_task(struct rq *);
>  
>  static inline void queue_push_tasks(struct rq *rq)
>  {
>  	if (!has_pushable_dl_tasks(rq))
>  		return;
>  
> -	queue_balance_callback(rq, &per_cpu(dl_balance_head, rq->cpu), push_dl_tasks);
> +	queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks);
> +}
> +
> +static inline void queue_pull_task(struct rq *rq)
> +{
> +	queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task);
>  }

queue_pull_task() is not defined for the CONFIG_SMP=n case; the following build error
is seen while building the kernel with SMP=n:

  CC      kernel/sched/deadline.o
kernel/sched/deadline.c: In function ‘switched_from_dl’:
kernel/sched/deadline.c:1713:2: error: implicit declaration of function ‘queue_pull_task’ [-Werror=implicit-function-declaration]
  queue_pull_task(rq);
  ^
cc1: some warnings being treated as errors

Thanks,
Kamalesh.



* Re: [RFC][PATCH 1/7] sched: Replace post_schedule with a balance callback list
  2015-06-03  4:41   ` Kamalesh Babulal
@ 2015-06-03  7:21     ` Peter Zijlstra
  0 siblings, 0 replies; 26+ messages in thread
From: Peter Zijlstra @ 2015-06-03  7:21 UTC (permalink / raw)
  To: Kamalesh Babulal
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, juri.lelli, pang.xunlei,
	oleg, linux-kernel

On Wed, Jun 03, 2015 at 10:11:59AM +0530, Kamalesh Babulal wrote:
> set_post_schedule() exist for CONFIG_SMP=n case, which was not modified to
> queue_push_tasks().

Yeah, already noticed the SMP=n borkage, fixed it after posting the rfc.


* Re: [RFC][PATCH 3/7] sched: Allow balance callbacks for check_class_changed()
  2015-06-02 16:27         ` Kirill Tkhai
@ 2015-06-03  7:32           ` Peter Zijlstra
  2015-06-03 10:40             ` Kirill Tkhai
  2015-06-03 10:42             ` Kirill Tkhai
  0 siblings, 2 replies; 26+ messages in thread
From: Peter Zijlstra @ 2015-06-03  7:32 UTC (permalink / raw)
  To: Kirill Tkhai
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, juri.lelli, pang.xunlei,
	oleg, linux-kernel, luca.abeni

On Tue, Jun 02, 2015 at 07:27:19PM +0300, Kirill Tkhai wrote:
> > +	 * task_dead_dl() will cancel our timer if we happen to die while
> > +	 * its still pending.
> 
> task_dead_dl() is called for tasks of deadline class only. So if we do that,
> the timer may be executed after final task's dead.

Indeed; sleep deprived brain misses the obvious :/

I can't seem to come up with anything much better than pulling that
hrtimer_cancel() into finish_task_switch(), however sad that is.

Something like:

  for_each_class(class) {
	if (class->task_dead)
		class->task_dead(prev);
  }

Would be nicest, but will slow down the common case. And a callback list
has the downside of having to preallocate entries for every task,
causing mostly pointless memory overhead.
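
For reference, the finish_task_switch() variant mentioned above would be
something along these lines (sketch only, not a tested patch):

	if (unlikely(prev_state == TASK_DEAD)) {
		if (prev->sched_class->task_dead)
			prev->sched_class->task_dead(prev);

		/*
		 * Cancel the dl timer for every class, so a task that
		 * hopped away from SCHED_DEADLINE cannot die with it
		 * still pending.
		 */
		hrtimer_cancel(&prev->dl.dl_timer);

		/* ... */
	}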



^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC][PATCH 1/7] sched: Replace post_schedule with a balance callback list
  2015-06-01 13:58 ` [RFC][PATCH 1/7] sched: Replace post_schedule with a balance callback list Peter Zijlstra
  2015-06-03  4:41   ` Kamalesh Babulal
@ 2015-06-03  8:24   ` pang.xunlei
  2015-06-03  8:55     ` Peter Zijlstra
  1 sibling, 1 reply; 26+ messages in thread
From: pang.xunlei @ 2015-06-03  8:24 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: juri.lelli, ktkhai, linux-kernel, mingo, oleg, pang.xunlei,
	Peter Zijlstra, rostedt, umgwanakikbuti

Hi Peter,

This may increase the overhead of schedule() a bit, as it will have 
more work to do.

check_class_changed():
        if (prev_class->switched_from)
                prev_class->switched_from(rq, p);
        /* Possible rq->lock 'hole'. */
        p->sched_class->switched_to(rq, p);

For the above case, why can't we just add a check in switched_to_fair()
as follows:
if (rq != task_rq(p))
        return;

switched_to_rt() and switched_to_dl() already do a similar check.

The operation that follows is usually task_rq_unlock() or the like, so
there seems to be no other harm in keeping the old rq of p.
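
To be concrete, the check I have in mind would sit at the top of
switched_to_fair(), something like this (sketch only, not a tested patch):

static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
	/*
	 * p may have been migrated to another rq while rq->lock was
	 * temporarily dropped in switched_from_rt()/_dl(); bail out
	 * instead of touching the stale rq.
	 */
	if (rq != task_rq(p))
		return;

	/* ... existing switched_to_fair() body ... */
}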

-Xunlei

Peter Zijlstra <peterz@infradead.org> wrote 2015-06-01 PM 09:58:19:
> [RFC][PATCH 1/7] sched: Replace post_schedule with a balance callback list
> 
> Generalize the post_schedule() stuff into a balance callback list.
> This allows us to more easily use it outside of schedule() and cross
> sched_class.
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC][PATCH 1/7] sched: Replace post_schedule with a balance callback list
  2015-06-03  8:24   ` pang.xunlei
@ 2015-06-03  8:55     ` Peter Zijlstra
  2015-06-05  7:13       ` pang.xunlei
  0 siblings, 1 reply; 26+ messages in thread
From: Peter Zijlstra @ 2015-06-03  8:55 UTC (permalink / raw)
  To: pang.xunlei
  Cc: juri.lelli, ktkhai, linux-kernel, mingo, oleg, pang.xunlei,
	rostedt, umgwanakikbuti

A: Because it messes up the order in which people normally read text.
Q: Why is top-posting such a bad thing?
A: Top-posting.
Q: What is the most annoying thing in e-mail?

On Wed, Jun 03, 2015 at 04:24:05PM +0800, pang.xunlei@zte.com.cn wrote:
> Hi Peter,
> 
> This may increase the overhead of schedule() a bit, as it will have 
> more work to do.

How so? It replaces the post_schedule() muck and should not be more
expensive than that.

It will make sched_setscheduler() etc.. a little more expensive, but
that doesn't matter, those are not critical things at all.

> check_class_changed():
>         if (prev_class->switched_from)
>                         prev_class->switched_from(rq, p);
>                 /* Possble rq->lock 'hole'.  */
>                 p->sched_class->switched_to(rq, p);
> 
> For above cases, why can't we just add a judgement in switched_to_fair() 
> as follows:
> if (rq != task_rq(p))
>         return;

Because it's too easy to get wrong. There have been many instances of
bugs caused by this dropping of rq->lock.

And sure you can patch it up, once you find it, but I would really
rather prevent these things if at all possible.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC][PATCH 3/7] sched: Allow balance callbacks for check_class_changed()
  2015-06-03  7:32           ` Peter Zijlstra
@ 2015-06-03 10:40             ` Kirill Tkhai
  2015-06-03 10:42             ` Kirill Tkhai
  1 sibling, 0 replies; 26+ messages in thread
From: Kirill Tkhai @ 2015-06-03 10:40 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Kirill Tkhai, umgwanakikbuti, mingo, ktkhai, rostedt, juri.lelli,
	pang.xunlei, oleg, linux-kernel, luca.abeni

On Wed, 03/06/2015 at 09:32 +0200, Peter Zijlstra wrote:
> On Tue, Jun 02, 2015 at 07:27:19PM +0300, Kirill Tkhai wrote:
> > > +	 * task_dead_dl() will cancel our timer if we happen to die while
> > > +	 * its still pending.
> > 
> > task_dead_dl() is called for tasks of deadline class only. So if we do that,
> > the timer may be executed after final task's dead.
> 
> Indeed; sleep deprived brain misses the obvious :/
> 
> I can't seem to come up with anything much better than pulling that
> hrtimer_cancel() into finish_task_switch(), however sad that is.

Yeah, class-specific manipulation in the generic finish_task_switch()
worsens modularity.

I have an idea, but it requires a small hrtimer modification. We may use the
task_struct reference counting, {get,put}_task_struct(), when we're starting
or cancelling the timer, and in the timer handler. But it requires reverting
the commit:

commit 61699e13072a89880aa584dcc64c6da465fb2ccc
Author: Thomas Gleixner <tglx@linutronix.de>
Date:   Tue Apr 14 21:09:23 2015 +0000

    hrtimer: Remove hrtimer_start() return value

because restarting the timer races with the timer handler.

Also, it's not clear to me whether it's safe to do the final put_task_struct()
from irq context, and how much it worsens life for the RT people.
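
A sketch of the idea (not a real patch; function names as in
kernel/sched/deadline.c):

	/* when arming the timer, e.g. in start_dl_timer(): */
	get_task_struct(p);
	hrtimer_start(&dl_se->dl_timer, act, HRTIMER_MODE_ABS);

	/* at the end of the handler dl_task_timer(), and on cancel: */
	put_task_struct(p);

so the task_struct cannot be freed while the timer is still in flight.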

> Something like:
> 
>   for_each_class(class) {
> 	if (class->task_dead)
> 		class->task_dead(prev);
>   }
> 
> Would be nicest, but will slow down the common case. And a callback list
> has the downside of having to preallocate entries for every task,
> causing mostly pointless memory overhead.
> 
> 



^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC][PATCH 3/7] sched: Allow balance callbacks for check_class_changed()
  2015-06-03  7:32           ` Peter Zijlstra
  2015-06-03 10:40             ` Kirill Tkhai
@ 2015-06-03 10:42             ` Kirill Tkhai
  2015-06-03 10:45               ` Peter Zijlstra
  1 sibling, 1 reply; 26+ messages in thread
From: Kirill Tkhai @ 2015-06-03 10:42 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, juri.lelli, pang.xunlei,
	oleg, linux-kernel, luca.abeni

[resend]

03.06.2015, 10:32, "Peter Zijlstra" <peterz@infradead.org>:
> On Tue, Jun 02, 2015 at 07:27:19PM +0300, Kirill Tkhai wrote:
>>>  + * task_dead_dl() will cancel our timer if we happen to die while
>>>  + * its still pending.
>>  task_dead_dl() is called for tasks of deadline class only. So if we do that,
>>  the timer may be executed after final task's dead.
>
> Indeed; sleep deprived brain misses the obvious :/

I have an idea, but it requires a small hrtimer modification. We may use the
task_struct reference counting, {get,put}_task_struct(), when we're starting
or cancelling the timer, and in the timer handler. But it requires reverting
the commit:

commit 61699e13072a89880aa584dcc64c6da465fb2ccc
Author: Thomas Gleixner <tglx@linutronix.de>
Date:   Tue Apr 14 21:09:23 2015 +0000

    hrtimer: Remove hrtimer_start() return value

because restarting the timer races with the timer handler.

Also, it's not clear to me whether it's safe to do the final put_task_struct()
from irq context, and how much it worsens life for the RT people.

> I can't seem to come up with anything much better than pulling that
> hrtimer_cancel() into finish_task_switch(), however sad that is.
>
> Something like:
>
>   for_each_class(class) {
>         if (class->task_dead)
>                 class->task_dead(prev);
>   }
>
> Would be nicest, but will slow down the common case. And a callback list
> has the downside of having to preallocate entries for every task,
> causing mostly pointless memory overhead.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC][PATCH 3/7] sched: Allow balance callbacks for check_class_changed()
  2015-06-03 10:42             ` Kirill Tkhai
@ 2015-06-03 10:45               ` Peter Zijlstra
  0 siblings, 0 replies; 26+ messages in thread
From: Peter Zijlstra @ 2015-06-03 10:45 UTC (permalink / raw)
  To: Kirill Tkhai
  Cc: umgwanakikbuti, mingo, ktkhai, rostedt, juri.lelli, pang.xunlei,
	oleg, linux-kernel, luca.abeni

On Wed, Jun 03, 2015 at 01:42:41PM +0300, Kirill Tkhai wrote:
> [resend]
> 
> 03.06.2015, 10:32, "Peter Zijlstra" <peterz@infradead.org>:
> > On Tue, Jun 02, 2015 at 07:27:19PM +0300, Kirill Tkhai wrote:
> >>>  + * task_dead_dl() will cancel our timer if we happen to die while
> >>>  + * its still pending.
> >>  task_dead_dl() is called for tasks of deadline class only. So if we do that,
> >>  the timer may be executed after final task's dead.
> >
> > Indeed; sleep deprived brain misses the obvious :/
> 
> I have an idea, but it require small hrtimer modification. We may use
> task_struct counters {get,put}_task_struct(), when we're starting or cancelling
> the timer, and in timer handler. But it require to return back the commit:

Yeah, already working on that. :-)

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC][PATCH 1/7] sched: Replace post_schedule with a balance callback list
  2015-06-03  8:55     ` Peter Zijlstra
@ 2015-06-05  7:13       ` pang.xunlei
  2015-06-05  7:19         ` Peter Zijlstra
  0 siblings, 1 reply; 26+ messages in thread
From: pang.xunlei @ 2015-06-05  7:13 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: juri.lelli, ktkhai, linux-kernel, mingo, oleg, pang.xunlei,
	rostedt, umgwanakikbuti

Hi Peter,

Peter Zijlstra <peterz@infradead.org> wrote 2015-06-03 PM 04:55:27:
> 
> Re: [RFC][PATCH 1/7] sched: Replace post_schedule with a balance callback list
> 
> A: Because it messes up the order in which people normally read text.
> Q: Why is top-posting such a bad thing?
> A: Top-posting.
> Q: What is the most annoying thing in e-mail?

Sure, will follow. Thanks for the reminder.

> 
> On Wed, Jun 03, 2015 at 04:24:05PM +0800, pang.xunlei@zte.com.cn wrote:
> > Hi Peter,
> > 
> > This may increase the overhead of schedule() a bit, as it will have 
> > more work to do.
> 
> How so? It replaces the post_schedule() muck and should not be more
> expensive than that.
> 
> It will make sched_setscheduler() etc.. a little more expensive, but
> that doesn't matter, those are not critical things at all.

Another side effect it may have is that it will introduce some latency,
because we have to wait for the next schedule() point to do the balancing.
prio_changed_rt()->pull_rt_task() is not a rare case when using PI futexes.

-Xunlei

> 
> > check_class_changed():
> >         if (prev_class->switched_from)
> >                         prev_class->switched_from(rq, p);
> >                 /* Possble rq->lock 'hole'.  */
> >                 p->sched_class->switched_to(rq, p);
> > 
> > For above cases, why can't we just add a judgement in switched_to_fair()
> > as follows:
> > if (rq != task_rq(p))
> >         return;
> 
> Because its too easy to get wrong. There have been many instances of
> bugs caused by this dropping of rq->lock.
> 
> And sure you can patch it up, once you find it, but I would really
> rather prevent these things if at all possible.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC][PATCH 1/7] sched: Replace post_schedule with a balance callback list
  2015-06-05  7:13       ` pang.xunlei
@ 2015-06-05  7:19         ` Peter Zijlstra
  0 siblings, 0 replies; 26+ messages in thread
From: Peter Zijlstra @ 2015-06-05  7:19 UTC (permalink / raw)
  To: pang.xunlei
  Cc: juri.lelli, ktkhai, linux-kernel, mingo, oleg, pang.xunlei,
	rostedt, umgwanakikbuti

On Fri, Jun 05, 2015 at 03:13:38PM +0800, pang.xunlei@zte.com.cn wrote:
> > It will make sched_setscheduler() etc.. a little more expensive, but
> > that doesn't matter, those are not critical things at all.
> 
> Another side effect it may have is that it will introduce some latency, 
> because we have to wait for next schedule() point to do the balancing. 
> prio_changed_rt()->pull_rt_task() is not rare cases when using PI futex.

See patch 3, which adds a balance_callback() call to sched_setscheduler()
and rt_mutex_setprio() to avoid exactly that.
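
Roughly, the tail of __sched_setscheduler() then ends up like this
(sketch only, see the patch itself for the exact hunk):

	check_class_changed(rq, p, prev_class, oldprio);
	preempt_disable();		/* keep rq from going away on us */
	task_rq_unlock(rq, p, &flags);

	/* ... PI chain adjustment ... */

	balance_callback(rq);		/* push/pull runs right here */
	preempt_enable();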


^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2015-06-05  7:19 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-06-01 13:58 [RFC][PATCH 0/7] sched: balance callbacks Peter Zijlstra
2015-06-01 13:58 ` [RFC][PATCH 1/7] sched: Replace post_schedule with a balance callback list Peter Zijlstra
2015-06-03  4:41   ` Kamalesh Babulal
2015-06-03  7:21     ` Peter Zijlstra
2015-06-03  8:24   ` pang.xunlei
2015-06-03  8:55     ` Peter Zijlstra
2015-06-05  7:13       ` pang.xunlei
2015-06-05  7:19         ` Peter Zijlstra
2015-06-01 13:58 ` [RFC][PATCH 2/7] sched: Use replace normalize_task() with __sched_setscheduler() Peter Zijlstra
2015-06-01 13:58 ` [RFC][PATCH 3/7] sched: Allow balance callbacks for check_class_changed() Peter Zijlstra
2015-06-02 13:58   ` Kirill Tkhai
2015-06-02 14:27     ` Peter Zijlstra
2015-06-02 16:07       ` Peter Zijlstra
2015-06-02 16:27         ` Kirill Tkhai
2015-06-03  7:32           ` Peter Zijlstra
2015-06-03 10:40             ` Kirill Tkhai
2015-06-03 10:42             ` Kirill Tkhai
2015-06-03 10:45               ` Peter Zijlstra
2015-06-01 13:58 ` [RFC][PATCH 4/7] sched,rt: Remove return value from pull_rt_task() Peter Zijlstra
2015-06-01 13:58 ` [RFC][PATCH 5/7] sched,rt: Convert switched_{from,to}_rt() / prio_changed_rt() to balance callbacks Peter Zijlstra
2015-06-01 13:58 ` [RFC][PATCH 6/7] sched,dl: Remove return value from pull_dl_task() Peter Zijlstra
2015-06-01 13:58 ` [RFC][PATCH 7/7] sched,dl: Convert switched_{from,to}_dl() / prio_changed_dl() to balance callbacks Peter Zijlstra
2015-06-03  4:52   ` Kamalesh Babulal
2015-06-01 14:16 ` [RFC][PATCH 0/7] sched: " Peter Zijlstra
2015-06-01 14:42 ` Mike Galbraith
2015-06-02  6:48   ` Mike Galbraith
